fft2d_lib77.f Source File


Contents

Source Code


Source Code

c 2d parallel PIC library for fast fourier transforms
c-----------------------------------------------------------------------
      subroutine WPFST2RINIT(mixup,sctd,indx,indy,nxhyd,nxyd)
c this subroutine calculates tables needed by a two dimensional
c fast real sine and cosine transforms and their inverses.
c input: indx, indy, nxhyd, nxyd
c output: mixup, sctd
c mixup = array of bit reversed addresses
c sctd = sine/cosine table
c indx/indy = exponent which determines length in x/y direction,
c where nx=2**indx, ny=2**indy
c nxhyd = maximum of (nx/2,ny)
c nxyd = maximum of (nx,ny)
c written by viktor k. decyk, ucla
      implicit none
      integer indx, indy, nxhyd, nxyd
      integer mixup
      complex sctd
      dimension mixup(nxhyd), sctd(nxyd)
c local data
      integer indx1, indx1y, nx, ny, nxy, nxhy
      integer j, k, lb, ll, jb, it
      real dnxy, arg
      indx1 = indx - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      ny = 2**indy
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
c bit-reverse index table: mixup(j) = 1 + reversed bits of (j - 1)
      do 20 j = 1, nxhy
      lb = j - 1
      ll = 0
      do 10 k = 1, indx1y
      jb = lb/2
      it = lb - 2*jb
      lb = jb
      ll = 2*ll + it
   10 continue
      mixup(j) = ll + 1
   20 continue
c sine/cosine table for the angles n*pi/nxy
      dnxy = 0.5*6.28318530717959/float(nxy)
      do 30 j = 1, nxy
      arg = dnxy*float(j - 1)
      sctd(j) = cmplx(cos(arg),-sin(arg))
   30 continue
      return
      end
c-----------------------------------------------------------------------
      subroutine WPFSST2R(f,g,bs,br,isign,ntpose,mixup,sctd,ttp,indx,ind
     1y,kstrt,nxvh,nyv,kxp2,kyp,kypd,kxp2d,jblok,kblok,nxhyd,nxyd)
c wrapper function for parallel real sine/sine transform
      implicit none
      integer isign, ntpose, mixup, indx, indy, kstrt, nxvh, nyv
      integer kxp2, kyp, kypd, kxp2d, jblok, kblok, nxhyd, nxyd
      real f, g, bs, br, ttp
      complex sctd
      dimension f(2*nxvh,kypd,kblok), g(nyv,kxp2d,jblok)
      dimension bs(kxp2+1,kyp+1,kblok), br(kxp2+1,kyp+1,jblok)
      dimension mixup(nxhyd), sctd(nxyd)
c local data
      integer nx, ny, kxpi, kypi
      real tf
      double precision dtime
      data kxpi, kypi /1,1/
c calculate range of indices
      nx = 2**indx
      ny = 2**indy
c inverse fourier transform
      if (isign.lt.0) then
c perform x sine transform
         call PFST2RXX(f,isign,mixup,sctd,indx,indy,kstrt,kyp,kypi,kyp,n
     1xvh,kypd,kblok,nxhyd,nxyd)
c transpose f array to g
         call PWTIMERA(-1,ttp,dtime)
         call PRTPOSE(f,g,bs,br,nx,ny,kstrt,2*nxvh,nyv,kxp2,kyp,kxp2d,ky
     1pd,jblok,kblok)
         call PWTIMERA(1,ttp,dtime)
c perform y sine transform
         call PFST2RXY(g,isign,mixup,sctd,indx,indy,kstrt,kxp2,kxpi,kxp2
     1,nyv,kxp2d,jblok,nxhyd,nxyd)
c transpose g array to f
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PRTPOSE(g,f,br,bs,ny,nx,kstrt,nyv,2*nxvh,kyp,kxp2,kypd,
     1kxp2d,kblok,jblok)
            call PWTIMERA(1,tf,dtime)
         endif
c forward fourier transform
      else if (isign.gt.0) then
c transpose f array to g
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PRTPOSE(f,g,bs,br,nx,ny,kstrt,2*nxvh,nyv,kxp2,kyp,kxp2d
     1,kypd,jblok,kblok)
            call PWTIMERA(1,tf,dtime)
         endif
c perform y sine transform
         call PFST2RXY(g,isign,mixup,sctd,indx,indy,kstrt,kxp2,kxpi,kxp2
     1,nyv,kxp2d,jblok,nxhyd,nxyd)
c transpose g array to f
         call PWTIMERA(-1,ttp,dtime)
         call PRTPOSE(g,f,br,bs,ny,nx,kstrt,nyv,2*nxvh,kyp,kxp2,kypd,kxp
     12d,kblok,jblok)
         call PWTIMERA(1,ttp,dtime)
c perform x sine transform
         call PFST2RXX(f,isign,mixup,sctd,indx,indy,kstrt,kyp,kypi,kyp,n
     1xvh,kypd,kblok,nxhyd,nxyd)
      endif
      if (ntpose.eq.0) ttp = ttp + tf
      return
      end
c-----------------------------------------------------------------------
      subroutine WPFCCT2R(f,g,bs,br,isign,ntpose,mixup,sctd,ttp,indx,ind
     1y,kstrt,nxvh,nyv,kxp2,kyp,kypd,kxp2d,jblok,kblok,nxhyd,nxyd)
c wrapper function for parallel real cosine/cosine transform
      implicit none
      integer isign, ntpose, mixup, indx, indy, kstrt, nxvh, nyv
      integer kxp2, kyp, kypd, kxp2d, jblok, kblok, nxhyd, nxyd
      real f, g, bs, br, ttp
      complex sctd
      dimension f(2*nxvh,kypd,kblok), g(nyv,kxp2d,jblok)
      dimension bs(kxp2+1,kyp+1,kblok), br(kxp2+1,kyp+1,jblok)
      dimension mixup(nxhyd), sctd(nxyd)
c local data
      integer nx, ny, kxpi, kypi
      real tf
      double precision dtime
      data kxpi, kypi /1,1/
c calculate range of indices
      nx = 2**indx
      ny = 2**indy
c inverse fourier transform
      if (isign.lt.0) then
c perform x cosine transform
         call PFCT2RXX(f,isign,mixup,sctd,indx,indy,kstrt,kyp,kypi,kyp,n
     1xvh,kypd,kblok,nxhyd,nxyd)
c transpose f array to g
         call PWTIMERA(-1,ttp,dtime)
         call PRTPOSE(f,g,bs,br,nx,ny,kstrt,2*nxvh,nyv,kxp2,kyp,kxp2d,ky
     1pd,jblok,kblok)
         call PWTIMERA(1,ttp,dtime)
c perform y cosine transform
         call PFCT2RXY(g,isign,mixup,sctd,indx,indy,kstrt,kxp2,kxpi,kxp2
     1,nyv,kxp2d,jblok,nxhyd,nxyd)
c transpose g array to f
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PRTPOSE(g,f,br,bs,ny,nx,kstrt,nyv,2*nxvh,kyp,kxp2,kypd,
     1kxp2d,kblok,jblok)
            call PWTIMERA(1,tf,dtime)
         endif
c forward fourier transform
      else if (isign.gt.0) then
c transpose f array to g
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PRTPOSE(f,g,bs,br,nx,ny,kstrt,2*nxvh,nyv,kxp2,kyp,kxp2d
     1,kypd,jblok,kblok)
            call PWTIMERA(1,tf,dtime)
         endif
c perform y cosine transform
         call PFCT2RXY(g,isign,mixup,sctd,indx,indy,kstrt,kxp2,kxpi,kxp2
     1,nyv,kxp2d,jblok,nxhyd,nxyd)
c transpose g array to f
         call PWTIMERA(-1,ttp,dtime)
         call PRTPOSE(g,f,br,bs,ny,nx,kstrt,nyv,2*nxvh,kyp,kxp2,kypd,kxp
     12d,kblok,jblok)
         call PWTIMERA(1,ttp,dtime)
c perform x cosine transform
         call PFCT2RXX(f,isign,mixup,sctd,indx,indy,kstrt,kyp,kypi,kyp,n
     1xvh,kypd,kblok,nxhyd,nxyd)
      endif
      if (ntpose.eq.0) ttp = ttp + tf
      return
      end
c-----------------------------------------------------------------------
      subroutine PFST2RXX(f,isign,mixup,sctd,indx,indy,kstrt,kyp,kypi,ky
     1pp,nxvh,kypd,kblok,nxhyd,nxyd)
c this subroutine performs the x part of a two dimensional fast real
c sine transform and its inverse, for a subset of y,
c using real arithmetic, for data which is distributed in blocks
c algorithm is described in Numerical Recipies in Fortran, Second Ed.,
c by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
c [Cambridge Univ. Press, 1992], p. 508.
c for isign = (-1,1), input: all, output: f
c approximate flop count: N*(5*log2(N) + 18)/nvp
c where N = (nx/2)*ny
c indx/indy = exponent which determines length in x/y direction,
c where nx=2**indx, ny=2**indy
c if isign = -1, an inverse sine transform is performed
c f(n,k,i) = (1/nx*ny)*sum(f(j,k,i)*sin(pi*n*j/nx))
c if isign = 1, a forward sine transform is performed
c f(j,k,i) = sum(f(n,k,i)*sin(pi*n*j/nx))
c mixup = array of bit reversed addresses
c sctd = sine/cosine table
c kstrt = starting data block number
c kyp = number of data values per block in y
c kypi = initial y index used
c kypp = number of y indices used
c nxvh = first dimension of f >= nx/2 + 1
c kypd = second dimension of f >= kyp + 1
c kblok = number of data blocks in y
c nxhyd = maximum of (nx/2,ny)
c nxyd = maximum of (nx,ny)
c written by viktor k. decyk, ucla
      implicit none
      integer isign, mixup, indx, indy, kstrt, kyp, kypi, kypp
      integer nxvh, kypd, kblok, nxhyd, nxyd
      real f
      complex sctd
      dimension f(2*nxvh,kypd,kblok)
      dimension mixup(nxhyd), sctd(nxyd)
c local data
      integer indx1, indx1y, nx, nxh, nxhh, nx3, ny, nxy, nxhy, ks, kypt
      integer i, j, k, l, m, km, kmr, nrx, j1, j2, ns, ns2, k1, k2, kyps
      real at1, at2, t2, t3, t4, t5, t6, ani, sum1
      complex t1
      indx1 = indx - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      nxh = nx/2
      nxhh = nx/4
      nx3 = nx + 3
      ny = 2**indy
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 2
      kyps = kypi + kypp - 1
      if (kstrt.gt.ny) return
      if (isign.eq.0) return
c create auxiliary array in x
      kmr = nxy/nx
      kypt = kyps
      do 30 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 20 k = kypi, kypt
      do 10 j = 2, nxh
      j1 = 1 + kmr*(j - 1)
      at2 = f(nx+2-j,k,l)
      at1 = f(j,k,l) + at2
      at2 = f(j,k,l) - at2
      at1 = -aimag(sctd(j1))*at1
      at2 = .5*at2
      f(j,k,l) = at1 + at2
      f(nx+2-j,k,l) = at1 - at2
   10 continue
      f(1,k,l) = 0.0
      f(nxh+1,k,l) = 2.0*f(nxh+1,k,l)
   20 continue
   30 continue
c bit-reverse array elements in x
      nrx = nxhy/nxh
      kypt = kyps
      do 60 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 50 j = 1, nxh
      j1 = (mixup(j) - 1)/nrx + 1
      if (j.ge.j1) go to 50
      do 40 k = kypi, kypt
      t2 = f(2*j1-1,k,l)
      t3 = f(2*j1,k,l)
      f(2*j1-1,k,l) = f(2*j-1,k,l)
      f(2*j1,k,l) = f(2*j,k,l)
      f(2*j-1,k,l) = t2
      f(2*j,k,l) = t3
   40 continue
   50 continue
   60 continue
c first transform in x
      nrx = nxy/nxh
      do 110 m = 1, indx1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nxhh/ns
      kmr = 2*km*nrx
      kypt = kyps
      do 100 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 90 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 80 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 70 i = kypi, kypt
      t2 = real(t1)*f(2*j2-1,i,l) - aimag(t1)*f(2*j2,i,l)
      t3 = aimag(t1)*f(2*j2-1,i,l) + real(t1)*f(2*j2,i,l)
      f(2*j2-1,i,l) = f(2*j1-1,i,l) - t2
      f(2*j2,i,l) = f(2*j1,i,l) - t3
      f(2*j1-1,i,l) = f(2*j1-1,i,l) + t2
      f(2*j1,i,l) = f(2*j1,i,l) + t3
   70 continue
   80 continue
   90 continue
  100 continue
  110 continue
c unscramble coefficients and normalize
c inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nxh
         ani = 1./float(2*nx*ny)
         kypt = kyps
         do 150 l = 1, kblok
         if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
         do 130 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 120 k = kypi, kypt
         t4 = f(nx3-2*j,k,l)
         t5 = -f(nx3-2*j+1,k,l)
         t2 = f(2*j-1,k,l) + t4
         t3 = f(2*j,k,l) + t5
         t6 = f(2*j-1,k,l) - t4
         t5 = f(2*j,k,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(2*j-1,k,l) = ani*(t2 + t4)
         f(2*j,k,l) = ani*(t3 + t5)
         f(nx3-2*j,k,l) = ani*(t2 - t4)
         f(nx3-2*j+1,k,l) = ani*(t5 - t3)
  120    continue
  130    continue
         ani = 2.*ani
         do 140 k = kypi, kypt
         f(nxh+1,k,l) = ani*f(nxh+1,k,l)
         f(nxh+2,k,l) = -ani*f(nxh+2,k,l)
         t2 = ani*(f(1,k,l) + f(2,k,l))
         f(2,k,l) = ani*(f(1,k,l) - f(2,k,l))
         f(1,k,l) = t2
         f(nx+1,k,l) = ani*f(nx+1,k,l)
  140    continue
  150    continue
c forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nxh
         kypt = kyps
         do 190 l = 1, kblok
         if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
         do 170 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 160 k = kypi, kypt
         t4 = f(nx3-2*j,k,l)
         t5 = -f(nx3-2*j+1,k,l)
         t2 = f(2*j-1,k,l) + t4
         t3 = f(2*j,k,l) + t5
         t6 = f(2*j-1,k,l) - t4
         t5 = f(2*j,k,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(2*j-1,k,l) = t2 + t4
         f(2*j,k,l) = t3 + t5
         f(nx3-2*j,k,l) = t2 - t4
         f(nx3-2*j+1,k,l) = t5 - t3
  160    continue
  170    continue
         do 180 k = kypi, kypt
         f(nxh+1,k,l) = 2.0*f(nxh+1,k,l)
         f(nxh+2,k,l) = -2.0*f(nxh+2,k,l)
         t2 = 2.0*(f(1,k,l) + f(2,k,l))
         f(2,k,l) = 2.0*(f(1,k,l) - f(2,k,l))
         f(1,k,l) = t2
         f(nx+1,k,l) = 2.0*f(nx+1,k,l)
  180    continue
  190    continue
      endif
c perform recursion for sine transform
      kypt = kyps
      do 220 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 210 k = kypi, kypt
      sum1 = .5*f(1,k,l)
      f(1,k,l) = 0.0
      f(2,k,l) = sum1
      do 200 j = 2, nxh
      sum1 = sum1 + f(2*j-1,k,l)
      f(2*j-1,k,l) = -f(2*j,k,l)
      f(2*j,k,l) = sum1
  200 continue
      f(nx+1,k,l) = 0.0
  210 continue
  220 continue
      return
      end
c-----------------------------------------------------------------------
      subroutine PFCT2RXX(f,isign,mixup,sctd,indx,indy,kstrt,kyp,kypi,ky
     1pp,nxvh,kypd,kblok,nxhyd,nxyd)
c this subroutine performs the x part of a two dimensional fast real
c cosine transform and its inverse, for a subset of y,
c using real arithmetic, for data which is distributed in blocks
c algorithm is described in Numerical Recipies in Fortran, Second Ed.,
c by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
c [Cambridge Univ. Press, 1992], p. 508.
c for isign = (-1,1), input: all, output: f
c approximate flop count: N*(5*log2(N) + 18)/nvp
c where N = (nx/2)*ny
c indx/indy = exponent which determines length in x/y direction,
c where nx=2**indx, ny=2**indy
c if isign = -1, an inverse cosine transform is performed
c f(n,k,i) = (1/nx*ny)*(.5*f(1,k,i) + ((-1)**n)*f(nx+1,k,i)
c            + sum(f(j,k,i)*cos(pi*n*j/nx)))
c if isign = 1, a forward cosine transform is performed
c f(j,k,i) = 2*(.5*f(1,k,i) + ((-1)**j)*f(n+1,k,i) + sum(f(n,k,i)*
c            cos(pi*n*j/nx))
c mixup = array of bit reversed addresses
c sctd = sine/cosine table
c kstrt = starting data block number
c kyp = number of data values per block in y
c kypi = initial y index used
c kypp = number of y indices used
c nxvh = first dimension of f >= nx/2 + 1
c kypd = second dimension of f >= kyp+1
c kblok = number of data blocks in y
c nxhyd = maximum of (nx/2,ny)
c nxyd = maximum of (nx,ny)
c written by viktor k. decyk, ucla
      implicit none
      integer isign, mixup, indx, indy, kstrt, kyp, kypi, kypp
      integer nxvh, kypd, kblok, nxhyd, nxyd
      real f
      complex sctd
      dimension f(2*nxvh,kypd,kblok)
      dimension mixup(nxhyd), sctd(nxyd)
c local data
      integer indx1, indx1y, nx, nxh, nxhh, nx3, ny, nxy, nxhy, ks, kypt
      integer i, j, k, l, m, km, kmr, nrx, j1, j2, ns, ns2, k1, k2, kyps
      real at1, at2, t2, t3, t4, t5, t6, ani, sum1
      complex t1
      indx1 = indx - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      nxh = nx/2
      nxhh = nx/4
      nx3 = nx + 3
      ny = 2**indy
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 2
      kyps = kypi + kypp - 1
      if (kstrt.gt.ny) return
      if (isign.eq.0) return
c create auxiliary array in x
      kmr = nxy/nx
      kypt = kyps
      do 30 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 20 k = kypi, kypt
      sum1 = .5*(f(1,k,l) - f(nx+1,k,l))
      do 10 j = 2, nxh
      j1 = 1 + kmr*(j - 1)
      at2 = f(nx+2-j,k,l)
      at1 = f(j,k,l) + at2
      at2 = f(j,k,l) - at2
      sum1 = sum1 + real(sctd(j1))*at2
      at2 = -aimag(sctd(j1))*at2
      at1 = .5*at1
      f(j,k,l) = at1 - at2
      f(nx+2-j,k,l) = at1 + at2
   10 continue
      f(1,k,l) = .5*(f(1,k,l) + f(nx+1,k,l))
      f(nx+1,k,l) = sum1
   20 continue
   30 continue
c bit-reverse array elements in x
      nrx = nxhy/nxh
      kypt = kyps
      do 60 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 50 j = 1, nxh
      j1 = (mixup(j) - 1)/nrx + 1
      if (j.ge.j1) go to 50
      do 40 k = kypi, kypt
      t2 = f(2*j1-1,k,l)
      t3 = f(2*j1,k,l)
      f(2*j1-1,k,l) = f(2*j-1,k,l)
      f(2*j1,k,l) = f(2*j,k,l)
      f(2*j-1,k,l) = t2
      f(2*j,k,l) = t3
   40 continue
   50 continue
   60 continue
c first transform in x
      nrx = nxy/nxh
      do 110 m = 1, indx1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nxhh/ns
      kmr = 2*km*nrx
      kypt = kyps
      do 100 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 90 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 80 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 70 i = kypi, kypt
      t2 = real(t1)*f(2*j2-1,i,l) - aimag(t1)*f(2*j2,i,l)
      t3 = aimag(t1)*f(2*j2-1,i,l) + real(t1)*f(2*j2,i,l)
      f(2*j2-1,i,l) = f(2*j1-1,i,l) - t2
      f(2*j2,i,l) = f(2*j1,i,l) - t3
      f(2*j1-1,i,l) = f(2*j1-1,i,l) + t2
      f(2*j1,i,l) = f(2*j1,i,l) + t3
   70 continue
   80 continue
   90 continue
  100 continue
  110 continue
c unscramble coefficients and normalize
c inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nxh
         ani = 1./float(2*nx*ny)
         kypt = kyps
         do 150 l = 1, kblok
         if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
         do 130 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 120 k = kypi, kypt
         t4 = f(nx3-2*j,k,l)
         t5 = -f(nx3-2*j+1,k,l)
         t2 = f(2*j-1,k,l) + t4
         t3 = f(2*j,k,l) + t5
         t6 = f(2*j-1,k,l) - t4
         t5 = f(2*j,k,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(2*j-1,k,l) = ani*(t2 + t4)
         f(2*j,k,l) = ani*(t3 + t5)
         f(nx3-2*j,k,l) = ani*(t2 - t4)
         f(nx3-2*j+1,k,l) = ani*(t5 - t3)
  120    continue
  130    continue
         ani = 2.*ani
         do 140 k = kypi, kypt
         f(nxh+1,k,l) = ani*f(nxh+1,k,l)
         f(nxh+2,k,l) = -ani*f(nxh+2,k,l)
         t2 = ani*(f(1,k,l) + f(2,k,l))
         f(2,k,l) = ani*(f(1,k,l) - f(2,k,l))
         f(1,k,l) = t2
         f(nx+1,k,l) = ani*f(nx+1,k,l)
  140    continue
  150    continue
c forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nxh
         kypt = kyps
         do 190 l = 1, kblok
         if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
         do 170 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 160 k = kypi, kypt
         t4 = f(nx3-2*j,k,l)
         t5 = -f(nx3-2*j+1,k,l)
         t2 = f(2*j-1,k,l) + t4
         t3 = f(2*j,k,l) + t5
         t6 = f(2*j-1,k,l) - t4
         t5 = f(2*j,k,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(2*j-1,k,l) = t2 + t4
         f(2*j,k,l) = t3 + t5
         f(nx3-2*j,k,l) = t2 - t4
         f(nx3-2*j+1,k,l) = t5 - t3
  160    continue
  170    continue
         do 180 k = kypi, kypt
         f(nxh+1,k,l) = 2.0*f(nxh+1,k,l)
         f(nxh+2,k,l) = -2.0*f(nxh+2,k,l)
         t2 = 2.0*(f(1,k,l) + f(2,k,l))
         f(2,k,l) = 2.0*(f(1,k,l) - f(2,k,l))
         f(1,k,l) = t2
         f(nx+1,k,l) = 2.0*f(nx+1,k,l)
  180    continue
  190    continue
      endif
c perform recursion for cosine transform
      kypt = kyps
      do 220 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 210 k = kypi, kypt
      sum1 = f(nx+1,k,l)
      f(nx+1,k,l) = f(2,k,l)
      f(2,k,l) = sum1
      do 200 j = 2, nxh
      sum1 = sum1 - f(2*j,k,l)
      f(2*j,k,l) = sum1
  200 continue
  210 continue
  220 continue
      return
      end
c-----------------------------------------------------------------------
      subroutine PFST2RXY(g,isign,mixup,sctd,indx,indy,kstrt,kxp,kxpi,kx
     1pp,nyv,kxpd,jblok,nxhyd,nxyd)
c this subroutine performs the y part of a two dimensional fast real
c sine transform and its inverse, for a subset of x,
c using real arithmetic, for data which is distributed in blocks
c algorithm is described in Numerical Recipies in Fortran, Second Ed.,
c by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
c [Cambridge Univ. Press, 1992], p. 508.
c for isign = (-1,1), input: all, output: f
c approximate flop count: N*(5*log2(N) + 18)/nvp
c where N = (nx/2)*ny
c indx/indy = exponent which determines length in x/y direction,
c where nx=2**indx, ny=2**indy
c if isign = -1, an inverse sine transform is performed
c g(m,n,i) = sum(g(k,n,i)*sin(pi*m*k/ny))
c if isign = 1, a forward sine transform is performed
c g(k,n,i) = sum(g(m,n,i)*sin(pi*m*k/ny))
c mixup = array of bit reversed addresses
c sctd = sine/cosine table
c kstrt = starting data block number
c kxp = number of data values per block in x
c kxpi = initial x index used
c kxpp = number of x indices used
c nyv = first dimension of g >= ny + 1
c kxpd = second dimension of g >= kxp + 1
c jblok = number of data blocks in x
c nxhyd = maximum of (nx/2,ny)
c nxyd = maximum of (nx,ny)
c written by viktor k. decyk, ucla
      implicit none
      integer isign, mixup, indx, indy, kstrt, kxp, kxpi, kxpp
      integer nyv, kxpd, jblok, nxhyd, nxyd
      real g
      complex sctd
      dimension g(nyv,kxpd,jblok)
      dimension mixup(nxhyd), sctd(nxyd)
c local data
      integer indx1, indy1, indx1y, nx, ny, nyh, nyhh, ny3, nxy, nxhy
      integer i, j, k, l, m, ks, km, kmr, nry, j1, j2, ns, ns2, k1, k2
      integer kxps, kxpt
      real at1, at2, t2, t3, t4, t5, t6, ani, sum1
      complex t1
      indx1 = indx - 1
      indy1 = indy - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      ny = 2**indy
      nyh = ny/2
      nyhh = ny/4
      ny3 = ny + 3
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 2
      kxps = kxpi + kxpp - 1
      if (kstrt.gt.nx) return
      if (isign.eq.0) return
c create auxiliary array in y
      kmr = nxy/ny
      kxpt = kxps
      do 30 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 20 j = kxpi, kxpt
      do 10 k = 2, nyh
      k1 = 1 + kmr*(k - 1)
      at2 = g(ny+2-k,j,l)
      at1 = g(k,j,l) + at2
      at2 = g(k,j,l) - at2
      at1 = -aimag(sctd(k1))*at1
      at2 = .5*at2
      g(k,j,l) = at1 + at2
      g(ny+2-k,j,l) = at1 - at2
   10 continue
      g(1,j,l) = 0.0
      g(nyh+1,j,l) = 2.0*g(nyh+1,j,l)
   20 continue
   30 continue
c bit-reverse array elements in y
      nry = nxhy/nyh
      kxpt = kxps
      do 60 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 50 k = 1, nyh
      k1 = (mixup(k) - 1)/nry + 1
      if (k.ge.k1) go to 50
      do 40 j = kxpi, kxpt
      t2 = g(2*k1-1,j,l)
      t3 = g(2*k1,j,l)
      g(2*k1-1,j,l) = g(2*k-1,j,l)
      g(2*k1,j,l) = g(2*k,j,l)
      g(2*k-1,j,l) = t2
      g(2*k,j,l) = t3
   40 continue
   50 continue
   60 continue
c first transform in y
      nry = nxy/nyh
      do 110 m = 1, indy1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nyhh/ns
      kmr = 2*km*nry
      kxpt = kxps
      do 100 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 90 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 80 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 70 i = kxpi, kxpt
      t2 = real(t1)*g(2*j2-1,i,l) - aimag(t1)*g(2*j2,i,l)
      t3 = aimag(t1)*g(2*j2-1,i,l) + real(t1)*g(2*j2,i,l)
      g(2*j2-1,i,l) = g(2*j1-1,i,l) - t2
      g(2*j2,i,l) = g(2*j1,i,l) - t3
      g(2*j1-1,i,l) = g(2*j1-1,i,l) + t2
      g(2*j1,i,l) = g(2*j1,i,l) + t3
   70 continue
   80 continue
   90 continue
  100 continue
  110 continue
c unscramble coefficients and normalize
c inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nyh
         ani = 0.5
         kxpt = kxps
         do 150 l = 1, jblok
         if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
         do 130 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 120 j = kxpi, kxpt
         t4 = g(ny3-2*k,j,l)
         t5 = -g(ny3-2*k+1,j,l)
         t2 = g(2*k-1,j,l) + t4
         t3 = g(2*k,j,l) + t5
         t6 = g(2*k-1,j,l) - t4
         t5 = g(2*k,j,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(2*k-1,j,l) = ani*(t2 + t4)
         g(2*k,j,l) = ani*(t3 + t5)
         g(ny3-2*k,j,l) = ani*(t2 - t4)
         g(ny3-2*k+1,j,l) = ani*(t5 - t3)
  120    continue
  130    continue
         do 140 j = kxpi, kxpt
         g(nyh+1,j,l) = g(nyh+1,j,l)
         g(nyh+2,j,l) = -g(nyh+2,j,l)
         t2 = g(1,j,l) + g(2,j,l)
         g(2,j,l) = g(1,j,l) - g(2,j,l)
         g(1,j,l) = t2
         g(ny+1,j,l) = g(ny+1,j,l)
  140    continue
  150    continue
c forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nyh
         kxpt = kxps
         do 190 l = 1, jblok
         if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
         do 170 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 160 j = kxpi, kxpt
         t4 = g(ny3-2*k,j,l)
         t5 = -g(ny3-2*k+1,j,l)
         t2 = g(2*k-1,j,l) + t4
         t3 = g(2*k,j,l) + t5
         t6 = g(2*k-1,j,l) - t4
         t5 = g(2*k,j,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(2*k-1,j,l) = t2 + t4
         g(2*k,j,l) = t3 + t5
         g(ny3-2*k,j,l) = t2 - t4
         g(ny3-2*k+1,j,l) = t5 - t3
  160    continue
  170    continue
         do 180 j = kxpi, kxpt
         g(nyh+1,j,l) = 2.0*g(nyh+1,j,l)
         g(nyh+2,j,l) = -2.0*g(nyh+2,j,l)
         t2 = 2.0*(g(1,j,l) + g(2,j,l))
         g(2,j,l) = 2.0*(g(1,j,l) - g(2,j,l))
         g(1,j,l) = t2
         g(ny+1,j,l) = 2.0*g(ny+1,j,l)
  180    continue
  190    continue
      endif
c perform recursion for sine transform
      kxpt = kxps
      do 220 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 210 j = kxpi, kxpt
      sum1 = .5*g(1,j,l)
      g(1,j,l) = 0.0
      g(2,j,l) = sum1
      do 200 k = 2, nyh
      sum1 = sum1 + g(2*k-1,j,l)
      g(2*k-1,j,l) = -g(2*k,j,l)
      g(2*k,j,l) = sum1
  200 continue
      g(ny+1,j,l) = 0.0
  210 continue
  220 continue
      return
      end
c-----------------------------------------------------------------------
      subroutine PFCT2RXY(g,isign,mixup,sctd,indx,indy,kstrt,kxp,kxpi,kx
     1pp,nyv,kxpd,jblok,nxhyd,nxyd)
c this subroutine performs the y part of a two dimensional fast real
c cosine transform and its inverse, for a subset of x,
c using real arithmetic, for data which is distributed in blocks
c algorithm is described in Numerical Recipies in Fortran, Second Ed.,
c by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
c [Cambridge Univ. Press, 1992], p. 508.
c for isign = (-1,1), input: all, output: f
c approximate flop count: N*(5*log2(N) + 18)/nvp
c where N = (nx/2)*ny
c indx/indy = exponent which determines length in x/y direction,
c where nx=2**indx, ny=2**indy
c if isign = -1, an inverse cosine transform is performed
c g(m,n,i) = (.5*g(1,n,i) + ((-1)**m)*g(ny+1,n,i)
c            + sum(g(k,n,i)*cos(pi*m*k/ny))
c if isign = 1, a forward cosine transform is performed
c g(k,n,i) = 2*(.5*g(1,n,i) + ((-1)**m)*g(ny+1,n,i) + sum(g(m,n,i)*
c            cos(pi*m*k/ny))
c mixup = array of bit reversed addresses
c sctd = sine/cosine table
c kstrt = starting data block number
c kxp = number of data values per block in x
c kxpi = initial x index used
c kxpp = number of x indices used
c nyv = first dimension of g >= ny + 1
c kxpd = second dimension of g >= kxp + 1
c jblok = number of data blocks in x
c nxhyd = maximum of (nx/2,ny)
c nxyd = maximum of (nx,ny)
c written by viktor k. decyk, ucla
      implicit none
      integer isign, mixup, indx, indy, kstrt, kxp, kxpi, kxpp
      integer nyv, kxpd, jblok, nxhyd, nxyd
      real g
      complex sctd
      dimension g(nyv,kxpd,jblok)
      dimension mixup(nxhyd), sctd(nxyd)
c local data
      integer indx1, indy1, indx1y, nx, ny, nyh, nyhh, ny3, nxy, nxhy
      integer i, j, k, l, m, ks, km, kmr, nry, j1, j2, ns, ns2, k1, k2
      integer kxps, kxpt
      real at1, at2, t2, t3, t4, t5, t6, ani, sum1
      complex t1
      indx1 = indx - 1
      indy1 = indy - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      ny = 2**indy
      nyh = ny/2
      nyhh = ny/4
      ny3 = ny + 3
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 2
      kxps = kxpi + kxpp - 1
      if (kstrt.gt.nx) return
      if (isign.eq.0) return
c create auxiliary array in y
      kmr = nxy/ny
      kxpt = kxps
      do 30 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 20 j = kxpi, kxpt
      sum1 = .5*(g(1,j,l) - g(ny+1,j,l))
      do 10 k = 2, nyh
      k1 = 1 + kmr*(k - 1)
      at2 = g(ny+2-k,j,l)
      at1 = g(k,j,l) + at2
      at2 = g(k,j,l) - at2
      sum1 = sum1 + real(sctd(k1))*at2
      at2 = -aimag(sctd(k1))*at2
      at1 = .5*at1
      g(k,j,l) = at1 - at2
      g(ny+2-k,j,l) = at1 + at2
   10 continue
      g(1,j,l) = .5*(g(1,j,l) + g(ny+1,j,l))
      g(ny+1,j,l) = sum1
   20 continue
   30 continue
c bit-reverse array elements in y
      nry = nxhy/nyh
      kxpt = kxps
      do 60 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 50 k = 1, nyh
      k1 = (mixup(k) - 1)/nry + 1
      if (k.ge.k1) go to 50
      do 40 j = kxpi, kxpt
      t2 = g(2*k1-1,j,l)
      t3 = g(2*k1,j,l)
      g(2*k1-1,j,l) = g(2*k-1,j,l)
      g(2*k1,j,l) = g(2*k,j,l)
      g(2*k-1,j,l) = t2
      g(2*k,j,l) = t3
   40 continue
   50 continue
   60 continue
c first transform in y
      nry = nxy/nyh
      do 110 m = 1, indy1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nyhh/ns
      kmr = 2*km*nry
      kxpt = kxps
      do 100 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 90 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 80 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 70 i = kxpi, kxpt
      t2 = real(t1)*g(2*j2-1,i,l) - aimag(t1)*g(2*j2,i,l)
      t3 = aimag(t1)*g(2*j2-1,i,l) + real(t1)*g(2*j2,i,l)
      g(2*j2-1,i,l) = g(2*j1-1,i,l) - t2
      g(2*j2,i,l) = g(2*j1,i,l) - t3
      g(2*j1-1,i,l) = g(2*j1-1,i,l) + t2
      g(2*j1,i,l) = g(2*j1,i,l) + t3
   70 continue
   80 continue
   90 continue
  100 continue
  110 continue
c unscramble coefficients and normalize
c inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nyh
         ani = 0.5
         kxpt = kxps
         do 150 l = 1, jblok
         if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
         do 130 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 120 j = kxpi, kxpt
         t4 = g(ny3-2*k,j,l)
         t5 = -g(ny3-2*k+1,j,l)
         t2 = g(2*k-1,j,l) + t4
         t3 = g(2*k,j,l) + t5
         t6 = g(2*k-1,j,l) - t4
         t5 = g(2*k,j,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(2*k-1,j,l) = ani*(t2 + t4)
         g(2*k,j,l) = ani*(t3 + t5)
         g(ny3-2*k,j,l) = ani*(t2 - t4)
         g(ny3-2*k+1,j,l) = ani*(t5 - t3)
  120    continue
  130    continue
         do 140 j = kxpi, kxpt
         g(nyh+1,j,l) = g(nyh+1,j,l)
         g(nyh+2,j,l) = -g(nyh+2,j,l)
         t2 = g(1,j,l) + g(2,j,l)
         g(2,j,l) = g(1,j,l) - g(2,j,l)
         g(1,j,l) = t2
         g(ny+1,j,l) = g(ny+1,j,l)
  140    continue
  150    continue
c forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nyh
         kxpt = kxps
         do 190 l = 1, jblok
         if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
         do 170 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 160 j = kxpi, kxpt
         t4 = g(ny3-2*k,j,l)
         t5 = -g(ny3-2*k+1,j,l)
         t2 = g(2*k-1,j,l) + t4
         t3 = g(2*k,j,l) + t5
         t6 = g(2*k-1,j,l) - t4
         t5 = g(2*k,j,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(2*k-1,j,l) = t2 + t4
         g(2*k,j,l) = t3 + t5
         g(ny3-2*k,j,l) = t2 - t4
         g(ny3-2*k+1,j,l) = t5 - t3
  160    continue
  170    continue
         do 180 j = kxpi, kxpt
         g(nyh+1,j,l) = 2.0*g(nyh+1,j,l)
         g(nyh+2,j,l) = -2.0*g(nyh+2,j,l)
         t2 = 2.0*(g(1,j,l) + g(2,j,l))
         g(2,j,l) = 2.0*(g(1,j,l) - g(2,j,l))
         g(1,j,l) = t2
         g(ny+1,j,l) = 2.0*g(ny+1,j,l)
  180    continue
  190    continue
      endif
c perform recursion for cosine transform
      kxpt = kxps
      do 220 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 210 j = kxpi, kxpt
      sum1 = g(ny+1,j,l)
      g(ny+1,j,l) = g(2,j,l)
      g(2,j,l) = sum1
      do 200 k = 2, nyh
      sum1 = sum1 - g(2*k,j,l)
      g(2*k,j,l) = sum1
  200 continue
  210 continue
  220 continue
      return
      end
c-----------------------------------------------------------------------
      subroutine WPFCST2R2(f,g,bs,br,isign,ntpose,mixup,sctd,ttp,indx,in
     1dy,kstrt,nxvh,nyv,kxp2,kyp,kypd,kxp2d,jblok,kblok,nxhyd,nxyd)
c wrapper function for 2 parallel real sine/sine transforms
c for the electric field with dirichlet or magnetic field with neumann
c boundary conditions
      implicit none
      integer isign, ntpose, mixup, indx, indy, kstrt, nxvh, nyv
      integer kxp2, kyp, kypd, kxp2d, jblok, kblok, nxhyd, nxyd
      real f, g, bs, br, ttp
      complex sctd
      dimension f(2,2*nxvh,kypd,kblok), g(2,nyv,kxp2d,jblok)
      dimension bs(2,kxp2+1,kyp+1,kblok), br(2,kxp2+1,kyp+1,jblok)
      dimension mixup(nxhyd), sctd(nxyd)
c local data
      integer nx, ny, kxpi, kypi
      real tf
      double precision dtime
      data kxpi, kypi /1,1/
c calculate range of indices
      nx = 2**indx
      ny = 2**indy
c inverse fourier transform
      if (isign.lt.0) then
c perform x cosine-sine transform
         call PFCST2R2X(f,isign,mixup,sctd,indx,indy,kstrt,kyp,kypi,kyp,
     1nxvh,kypd,kblok,nxhyd,nxyd)
c transpose f array to g
         call PWTIMERA(-1,ttp,dtime)
         call PR2TPOSE(f,g,bs,br,nx,ny,kstrt,2*nxvh,nyv,kxp2,kyp,kxp2d,k
     1ypd,jblok,kblok)
         call PWTIMERA(1,ttp,dtime)
c perform y sine-cosine transform
         call PFSCT2R2Y(g,isign,mixup,sctd,indx,indy,kstrt,kxp2,kxpi,kxp
     12,nyv,kxp2d,jblok,nxhyd,nxyd)
c transpose g array to f
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PR2TPOSE(g,f,br,bs,ny,nx,kstrt,nyv,2*nxvh,kyp,kxp2,kypd
     1,kxp2d,kblok,jblok)
            call PWTIMERA(1,tf,dtime)
         endif
c forward fourier transform
      else if (isign.gt.0) then
c transpose f array to g
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PR2TPOSE(f,g,bs,br,nx,ny,kstrt,2*nxvh,nyv,kxp2,kyp,kxp2
     1d,kypd,jblok,kblok)
            call PWTIMERA(1,tf,dtime)
         endif
c perform y sine-cosine transform
         call PFSCT2R2Y(g,isign,mixup,sctd,indx,indy,kstrt,kxp2,kxpi,kxp
     12,nyv,kxp2d,jblok,nxhyd,nxyd)
c transpose g array to f
         call PWTIMERA(-1,ttp,dtime)
         call PR2TPOSE(g,f,br,bs,ny,nx,kstrt,nyv,2*nxvh,kyp,kxp2,kypd,kx
     1p2d,kblok,jblok)
         call PWTIMERA(1,ttp,dtime)
c perform x cosine-sine transform
         call PFCST2R2X(f,isign,mixup,sctd,indx,indy,kstrt,kyp,kypi,kyp,
     1nxvh,kypd,kblok,nxhyd,nxyd)
      endif
      if (ntpose.eq.0) ttp = ttp + tf
      return
      end
c-----------------------------------------------------------------------
      subroutine WPFSCT2R2(f,g,bs,br,isign,ntpose,mixup,sctd,ttp,indx,in
     1dy,kstrt,nxvh,nyv,kxp2,kyp,kypd,kxp2d,jblok,kblok,nxhyd,nxyd)
c wrapper function for 2 parallel real sine/sine transforms
c for the magnetic field with dirichlet or electric field with neumann
c boundary conditions
      implicit none
      integer isign, ntpose, mixup, indx, indy, kstrt, nxvh, nyv
      integer kxp2, kyp, kypd, kxp2d, jblok, kblok, nxhyd, nxyd
      real f, g, bs, br, ttp
      complex sctd
      dimension f(2,2*nxvh,kypd,kblok), g(2,nyv,kxp2d,jblok)
      dimension bs(2,kxp2+1,kyp+1,kblok), br(2,kxp2+1,kyp+1,jblok)
      dimension mixup(nxhyd), sctd(nxyd)
c local data
      integer nx, ny, kxpi, kypi
      real tf
      double precision dtime
      data kxpi, kypi /1,1/
c calculate range of indices
      nx = 2**indx
      ny = 2**indy
c inverse fourier transform
      if (isign.lt.0) then
c perform x sine-cosine transform
         call PFSCT2R2X(f,isign,mixup,sctd,indx,indy,kstrt,kyp,kypi,kyp,
     1nxvh,kypd,kblok,nxhyd,nxyd)
c transpose f array to g
         call PWTIMERA(-1,ttp,dtime)
         call PR2TPOSE(f,g,bs,br,nx,ny,kstrt,2*nxvh,nyv,kxp2,kyp,kxp2d,k
     1ypd,jblok,kblok)
         call PWTIMERA(1,ttp,dtime)
c perform y cosine-sine transform
         call PFCST2R2Y(g,isign,mixup,sctd,indx,indy,kstrt,kxp2,kxpi,kxp
     12,nyv,kxp2d,jblok,nxhyd,nxyd)
c transpose g array to f
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PR2TPOSE(g,f,br,bs,ny,nx,kstrt,nyv,2*nxvh,kyp,kxp2,kypd
     1,kxp2d,kblok,jblok)
            call PWTIMERA(1,tf,dtime)
         endif
c forward fourier transform
      else if (isign.gt.0) then
c transpose f array to g
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PR2TPOSE(f,g,bs,br,nx,ny,kstrt,2*nxvh,nyv,kxp2,kyp,kxp2
     1d,kypd,jblok,kblok)
            call PWTIMERA(1,tf,dtime)
         endif
c perform y cosine-sine transform
         call PFCST2R2Y(g,isign,mixup,sctd,indx,indy,kstrt,kxp2,kxpi,kxp
     12,nyv,kxp2d,jblok,nxhyd,nxyd)
c transpose g array to f
         call PWTIMERA(-1,ttp,dtime)
         call PR2TPOSE(g,f,br,bs,ny,nx,kstrt,nyv,2*nxvh,kyp,kxp2,kypd,kx
     1p2d,kblok,jblok)
         call PWTIMERA(1,ttp,dtime)
c perform x sine-cosine transform
         call PFSCT2R2X(f,isign,mixup,sctd,indx,indy,kstrt,kyp,kypi,kyp,
     1nxvh,kypd,kblok,nxhyd,nxyd)
      endif
      if (ntpose.eq.0) ttp = ttp + tf
      return
      end
c-----------------------------------------------------------------------
      subroutine PFCST2R2X(f,isign,mixup,sctd,indx,indy,kstrt,kyp,kypi,k
     1ypp,nxvh,kypd,kblok,nxhyd,nxyd)
c this subroutine performs the x part of 2 two dimensional fast real
c sine and cosine transforms and their inverses, for a subset of y,
c using real arithmetic, for data which is distributed in blocks
c algorithm is described in Numerical Recipies in Fortran, Second Ed.,
c by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
c [Cambridge Univ. Press, 1992], p. 508.
c for isign = (-1,1), input: all, output: f
c approximate flop count: N*(5*log2(N) + 18)/nvp
c where N = (nx/2)*ny
c indx/indy = exponent which determines length in x/y direction,
c where nx=2**indx, ny=2**indy
c if isign = -1, inverse sine-cosine transforms are performed
c f(1,n,k,i) = (1/nx*ny)*(.5*f(1,1,k,i) + ((-1)**n)*f(1,nx+1,k,i)
c              + sum(f(1,j,k,i)*cos(pi*n*j/nx)))
c f(2,n,k,i) = (1/nx*ny)*sum(f(2,j,k,i)*sin(pi*n*j/nx))
c if isign = 1, forward sine transforms are performed
c f(1,j,k,i) = 2*(.5*f(1,1,k,i) + ((-1)**j)*f(1,n+1,k,i)
c              + sum(f(1,n,k,i)*cos(pi*n*j/nx))
c f(2,j,k,i) = sum(f(2,n,k,i)*sin(pi*n*j/nx))
c mixup = array of bit reversed addresses
c sctd = sine/cosine table
c kstrt = starting data block number
c kyp = number of data values per block in y
c kypi = initial y index used
c kypp = number of y indices used
c nxvh = first dimension of f >= nx/2 + 1
c kypd = second dimension of f >= kyp + 1
c kblok = number of data blocks in y
c nxhyd = maximum of (nx/2,ny)
c nxyd = maximum of (nx,ny)
c written by viktor k. decyk, ucla
      implicit none
      integer isign, mixup, indx, indy, kstrt, kyp, kypi, kypp
      integer nxvh, kypd, kblok, nxhyd, nxyd
      real f
      complex sctd
      dimension f(2,2*nxvh,kypd,kblok)
      dimension mixup(nxhyd), sctd(nxyd)
c local data
      integer indx1, indx1y, nx, nxh, nxhh, nx3, ny, nxy, nxhy, ks, kypt
      integer i, j, k, l, m, km, kmr, nrx, j1, j2, ns, ns2, k1, k2, kyps
      integer jj
      real at1, at2, at3, t2, t3, t4, t5, t6, ani, sum1, sum2
      complex t1
      indx1 = indx - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      nxh = nx/2
      nxhh = nx/4
      nx3 = nx + 3
      ny = 2**indy
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 2
      kyps = kypi + kypp - 1
      if (kstrt.gt.ny) return
      if (isign.eq.0) return
c create auxiliary array in x
      kmr = nxy/nx
      kypt = kyps
      do 30 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 20 k = kypi, kypt
      sum1 = .5*(f(1,1,k,l) - f(1,nx+1,k,l))
      do 10 j = 2, nxh
      j1 = 1 + kmr*(j - 1)
      at3 = -aimag(sctd(j1))
      at2 = f(1,nx+2-j,k,l)
      at1 = f(1,j,k,l) + at2
      at2 = f(1,j,k,l) - at2
      sum1 = sum1 + real(sctd(j1))*at2
      at2 = at3*at2
      at1 = .5*at1
      f(1,j,k,l) = at1 - at2
      f(1,nx+2-j,k,l) = at1 + at2
      at2 = f(2,nx+2-j,k,l)
      at1 = f(2,j,k,l) + at2
      at2 = f(2,j,k,l) - at2
      at1 = at3*at1
      at2 = .5*at2
      f(2,j,k,l) = at1 + at2
      f(2,nx+2-j,k,l) = at1 - at2
   10 continue
      f(1,1,k,l) = .5*(f(1,1,k,l) + f(1,nx+1,k,l))
      f(1,nx+1,k,l) = sum1
      f(2,1,k,l) = 0.0
      f(2,nxh+1,k,l) = 2.0*f(2,nxh+1,k,l)
   20 continue
   30 continue
c bit-reverse array elements in x
      nrx = nxhy/nxh
      kypt = kyps
      do 70 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 60 j = 1, nxh
      j1 = (mixup(j) - 1)/nrx + 1
      if (j.ge.j1) go to 60
      do 50 k = kypi, kypt
      do 40 jj = 1, 2
      t2 = f(jj,2*j1-1,k,l)
      t3 = f(jj,2*j1,k,l)
      f(jj,2*j1-1,k,l) = f(jj,2*j-1,k,l)
      f(jj,2*j1,k,l) = f(jj,2*j,k,l)
      f(jj,2*j-1,k,l) = t2
      f(jj,2*j,k,l) = t3
   40 continue
   50 continue
   60 continue
   70 continue
c first transform in x
      nrx = nxy/nxh
      do 130 m = 1, indx1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nxhh/ns
      kmr = 2*km*nrx
      kypt = kyps
      do 120 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 110 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 100 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 90 i = kypi, kypt
      do 80 jj = 1, 2
      t2 = real(t1)*f(jj,2*j2-1,i,l) - aimag(t1)*f(jj,2*j2,i,l)
      t3 = aimag(t1)*f(jj,2*j2-1,i,l) + real(t1)*f(jj,2*j2,i,l)
      f(jj,2*j2-1,i,l) = f(jj,2*j1-1,i,l) - t2
      f(jj,2*j2,i,l) = f(jj,2*j1,i,l) - t3
      f(jj,2*j1-1,i,l) = f(jj,2*j1-1,i,l) + t2
      f(jj,2*j1,i,l) = f(jj,2*j1,i,l) + t3
   80 continue
   90 continue
  100 continue
  110 continue
  120 continue
  130 continue
c unscramble coefficients and normalize
c inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nxh
         ani = 1./float(2*nx*ny)
         kypt = kyps
         do 190 l = 1, kblok
         if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
         do 160 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 150 k = kypi, kypt
         do 140 jj = 1, 2
         t4 = f(jj,nx3-2*j,k,l)
         t5 = -f(jj,nx3-2*j+1,k,l)
         t2 = f(jj,2*j-1,k,l) + t4
         t3 = f(jj,2*j,k,l) + t5
         t6 = f(jj,2*j-1,k,l) - t4
         t5 = f(jj,2*j,k,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,k,l) = ani*(t2 + t4)
         f(jj,2*j,k,l) = ani*(t3 + t5)
         f(jj,nx3-2*j,k,l) = ani*(t2 - t4)
         f(jj,nx3-2*j+1,k,l) = ani*(t5 - t3)
  140    continue
  150    continue
  160    continue
         ani = 2.*ani
         do 180 k = kypi, kypt
         do 170 jj = 1, 2
         f(jj,nxh+1,k,l) = ani*f(jj,nxh+1,k,l)
         f(jj,nxh+2,k,l) = -ani*f(jj,nxh+2,k,l)
         t2 = ani*(f(jj,1,k,l) + f(jj,2,k,l))
         f(jj,2,k,l) = ani*(f(jj,1,k,l) - f(jj,2,k,l))
         f(jj,1,k,l) = t2
         f(jj,nx+1,k,l) = ani*f(jj,nx+1,k,l)
  170    continue
  180    continue
  190    continue
c forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nxh
         kypt = kyps
         do 250 l = 1, kblok
         if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
         do 220 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 210 k = kypi, kypt
         do 200 jj = 1, 2
         t4 = f(jj,nx3-2*j,k,l)
         t5 = -f(jj,nx3-2*j+1,k,l)
         t2 = f(jj,2*j-1,k,l) + t4
         t3 = f(jj,2*j,k,l) + t5
         t6 = f(jj,2*j-1,k,l) - t4
         t5 = f(jj,2*j,k,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,k,l) = t2 + t4
         f(jj,2*j,k,l) = t3 + t5
         f(jj,nx3-2*j,k,l) = t2 - t4
         f(jj,nx3-2*j+1,k,l) = t5 - t3
  200    continue
  210    continue
  220    continue
         do 240 k = kypi, kypt
         do 230 jj = 1, 2
         f(jj,nxh+1,k,l) = 2.0*f(jj,nxh+1,k,l)
         f(jj,nxh+2,k,l) = -2.0*f(jj,nxh+2,k,l)
         t2 = 2.0*(f(jj,1,k,l) + f(jj,2,k,l))
         f(jj,2,k,l) = 2.0*(f(jj,1,k,l) - f(jj,2,k,l))
         f(jj,1,k,l) = t2
         f(jj,nx+1,k,l) = 2.0*f(jj,nx+1,k,l)
  230    continue
  240    continue
  250    continue
      endif
c perform recursion for cosine-sine transform
      kypt = kyps
      do 280 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 270 k = kypi, kypt
      sum1 = f(1,nx+1,k,l)
      f(1,nx+1,k,l) = f(1,2,k,l)
      f(1,2,k,l) = sum1
      sum2 = .5*f(2,1,k,l)
      f(2,1,k,l) = 0.0
      f(2,2,k,l) = sum2
      do 260 j = 2, nxh
      sum1 = sum1 - f(1,2*j,k,l)
      f(1,2*j,k,l) = sum1
      sum2 = sum2 + f(2,2*j-1,k,l)
      f(2,2*j-1,k,l) = -f(2,2*j,k,l)
      f(2,2*j,k,l) = sum2
  260 continue
      f(2,nx+1,k,l) = 0.0
  270 continue
  280 continue
      return
      end
c-----------------------------------------------------------------------
      subroutine PFSCT2R2X(f,isign,mixup,sctd,indx,indy,kstrt,kyp,kypi,k
     1ypp,nxvh,kypd,kblok,nxhyd,nxyd)
c this subroutine performs the x part of 2 two dimensional fast real
c sine and cosine transforms and their inverses, for a subset of y,
c using real arithmetic, for data which is distributed in blocks
c algorithm is described in Numerical Recipies in Fortran, Second Ed.,
c by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
c [Cambridge Univ. Press, 1992], p. 508.
c for isign = (-1,1), input: all, output: f
c approximate flop count: N*(5*log2(N) + 18)/nvp
c where N = (nx/2)*ny
c indx/indy = exponent which determines length in x/y direction,
c where nx=2**indx, ny=2**indy
c if isign = -1, inverse sine-cosine transforms are performed
c f(1,n,k,i) = (1/nx*ny)*sum(f(1,j,k,i)*sin(pi*n*j/nx))
c f(2,n,k,i) = (1/nx*ny)*(.5*f(2,1,k,i) + ((-1)**n)*f(2,nx+1,k,i)
c              + sum(f(2,j,k,i)*cos(pi*n*j/nx)))
c if isign = 1, forward sine transforms are performed
c f(1,j,k,i) = sum(f(1,n,k,i)*sin(pi*n*j/nx))
c f(2,j,k,i) = 2*(.5*f(2,1,k,i) + ((-1)**j)*f(2,n+1,k,i)
c              + sum(f(2,n,k,i)*cos(pi*n*j/nx))
c mixup = array of bit reversed addresses
c sctd = sine/cosine table
c kstrt = starting data block number
c kyp = number of data values per block in y
c kypi = initial y index used
c kypp = number of y indices used
c nxvh = first dimension of f >= nx/2 + 1
c kypd = second dimension of f >= kyp + 1
c kblok = number of data blocks in y
c nxhyd = maximum of (nx/2,ny)
c nxyd = maximum of (nx,ny)
c written by viktor k. decyk, ucla
      implicit none
      integer isign, mixup, indx, indy, kstrt, kyp, kypi, kypp
      integer nxvh, kypd, kblok, nxhyd, nxyd
      real f
      complex sctd
      dimension f(2,2*nxvh,kypd,kblok)
      dimension mixup(nxhyd), sctd(nxyd)
c local data
      integer indx1, indx1y, nx, nxh, nxhh, nx3, ny, nxy, nxhy, ks, kypt
      integer i, j, k, l, m, km, kmr, nrx, j1, j2, ns, ns2, k1, k2, kyps
      integer jj
      real at1, at2, at3, t2, t3, t4, t5, t6, ani, sum1, sum2
      complex t1
      indx1 = indx - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      nxh = nx/2
      nxhh = nx/4
      nx3 = nx + 3
      ny = 2**indy
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 2
      kyps = kypi + kypp - 1
      if (kstrt.gt.ny) return
      if (isign.eq.0) return
c create auxiliary array in x
      kmr = nxy/nx
      kypt = kyps
      do 30 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 20 k = kypi, kypt
      sum1 = .5*(f(2,1,k,l) - f(2,nx+1,k,l))
      do 10 j = 2, nxh
      j1 = 1 + kmr*(j - 1)
      at3 = -aimag(sctd(j1))
      at2 = f(1,nx+2-j,k,l)
      at1 = f(1,j,k,l) + at2
      at2 = f(1,j,k,l) - at2
      at1 = at3*at1
      at2 = .5*at2
      f(1,j,k,l) = at1 + at2
      f(1,nx+2-j,k,l) = at1 - at2
      at2 = f(2,nx+2-j,k,l)
      at1 = f(2,j,k,l) + at2
      at2 = f(2,j,k,l) - at2
      sum1 = sum1 + real(sctd(j1))*at2
      at2 = at3*at2
      at1 = .5*at1
      f(2,j,k,l) = at1 - at2
      f(2,nx+2-j,k,l) = at1 + at2
   10 continue
      f(1,1,k,l) = 0.0
      f(1,nxh+1,k,l) = 2.0*f(1,nxh+1,k,l)
      f(2,1,k,l) = .5*(f(2,1,k,l) + f(2,nx+1,k,l))
      f(2,nx+1,k,l) = sum1
   20 continue
   30 continue
c bit-reverse array elements in x
      nrx = nxhy/nxh
      kypt = kyps
      do 70 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 60 j = 1, nxh
      j1 = (mixup(j) - 1)/nrx + 1
      if (j.ge.j1) go to 60
      do 50 k = kypi, kypt
      do 40 jj = 1, 2
      t2 = f(jj,2*j1-1,k,l)
      t3 = f(jj,2*j1,k,l)
      f(jj,2*j1-1,k,l) = f(jj,2*j-1,k,l)
      f(jj,2*j1,k,l) = f(jj,2*j,k,l)
      f(jj,2*j-1,k,l) = t2
      f(jj,2*j,k,l) = t3
   40 continue
   50 continue
   60 continue
   70 continue
c first transform in x
      nrx = nxy/nxh
      do 130 m = 1, indx1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nxhh/ns
      kmr = 2*km*nrx
      kypt = kyps
      do 120 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 110 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 100 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 90 i = kypi, kypt
      do 80 jj = 1, 2
      t2 = real(t1)*f(jj,2*j2-1,i,l) - aimag(t1)*f(jj,2*j2,i,l)
      t3 = aimag(t1)*f(jj,2*j2-1,i,l) + real(t1)*f(jj,2*j2,i,l)
      f(jj,2*j2-1,i,l) = f(jj,2*j1-1,i,l) - t2
      f(jj,2*j2,i,l) = f(jj,2*j1,i,l) - t3
      f(jj,2*j1-1,i,l) = f(jj,2*j1-1,i,l) + t2
      f(jj,2*j1,i,l) = f(jj,2*j1,i,l) + t3
   80 continue
   90 continue
  100 continue
  110 continue
  120 continue
  130 continue
c unscramble coefficients and normalize
c inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nxh
         ani = 1./float(2*nx*ny)
         kypt = kyps
         do 190 l = 1, kblok
         if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
         do 160 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 150 k = kypi, kypt
         do 140 jj = 1, 2
         t4 = f(jj,nx3-2*j,k,l)
         t5 = -f(jj,nx3-2*j+1,k,l)
         t2 = f(jj,2*j-1,k,l) + t4
         t3 = f(jj,2*j,k,l) + t5
         t6 = f(jj,2*j-1,k,l) - t4
         t5 = f(jj,2*j,k,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,k,l) = ani*(t2 + t4)
         f(jj,2*j,k,l) = ani*(t3 + t5)
         f(jj,nx3-2*j,k,l) = ani*(t2 - t4)
         f(jj,nx3-2*j+1,k,l) = ani*(t5 - t3)
  140    continue
  150    continue
  160    continue
         ani = 2.*ani
         do 180 k = kypi, kypt
         do 170 jj = 1, 2
         f(jj,nxh+1,k,l) = ani*f(jj,nxh+1,k,l)
         f(jj,nxh+2,k,l) = -ani*f(jj,nxh+2,k,l)
         t2 = ani*(f(jj,1,k,l) + f(jj,2,k,l))
         f(jj,2,k,l) = ani*(f(jj,1,k,l) - f(jj,2,k,l))
         f(jj,1,k,l) = t2
         f(jj,nx+1,k,l) = ani*f(jj,nx+1,k,l)
  170    continue
  180    continue
  190    continue
c forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nxh
         kypt = kyps
         do 250 l = 1, kblok
         if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
         do 220 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 210 k = kypi, kypt
         do 200 jj = 1, 2
         t4 = f(jj,nx3-2*j,k,l)
         t5 = -f(jj,nx3-2*j+1,k,l)
         t2 = f(jj,2*j-1,k,l) + t4
         t3 = f(jj,2*j,k,l) + t5
         t6 = f(jj,2*j-1,k,l) - t4
         t5 = f(jj,2*j,k,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,k,l) = t2 + t4
         f(jj,2*j,k,l) = t3 + t5
         f(jj,nx3-2*j,k,l) = t2 - t4
         f(jj,nx3-2*j+1,k,l) = t5 - t3
  200    continue
  210    continue
  220    continue
         do 240 k = kypi, kypt
         do 230 jj = 1, 2
         f(jj,nxh+1,k,l) = 2.0*f(jj,nxh+1,k,l)
         f(jj,nxh+2,k,l) = -2.0*f(jj,nxh+2,k,l)
         t2 = 2.0*(f(jj,1,k,l) + f(jj,2,k,l))
         f(jj,2,k,l) = 2.0*(f(jj,1,k,l) - f(jj,2,k,l))
         f(jj,1,k,l) = t2
         f(jj,nx+1,k,l) = 2.0*f(jj,nx+1,k,l)
  230    continue
  240    continue
  250    continue
      endif
c perform recursion for cosine-sine transform
      kypt = kyps
      do 280 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 270 k = kypi, kypt
      sum1 = .5*f(1,1,k,l)
      f(1,1,k,l) = 0.0
      f(1,2,k,l) = sum1
      sum2 = f(2,nx+1,k,l)
      f(2,nx+1,k,l) = f(2,2,k,l)
      f(2,2,k,l) = sum2
      do 260 j = 2, nxh
      sum1 = sum1 + f(1,2*j-1,k,l)
      f(1,2*j-1,k,l) = -f(1,2*j,k,l)
      f(1,2*j,k,l) = sum1
      sum2 = sum2 - f(2,2*j,k,l)
      f(2,2*j,k,l) = sum2
  260 continue
      f(1,nx+1,k,l) = 0.0
  270 continue
  280 continue
      return
      end
c-----------------------------------------------------------------------
      subroutine PFSCT2R2Y(g,isign,mixup,sctd,indx,indy,kstrt,kxp,kxpi,k
     1xpp,nyv,kxpd,jblok,nxhyd,nxyd)
c this subroutine performs the y part of 2 two dimensional fast real
c sine and cosine transforms and their inverses, for a subset of x,
c using real arithmetic, for data which is distributed in blocks
c algorithm is described in Numerical Recipies in Fortran, Second Ed.,
c by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
c [Cambridge Univ. Press, 1992], p. 508.
c for isign = (-1,1), input: all, output: f
c approximate flop count: N*(5*log2(N) + 18)/nvp
c where N = (nx/2)*ny
c indx/indy = exponent which determines length in x/y direction,
c where nx=2**indx, ny=2**indy
c if isign = -1, inverse sine-cosine transform are performed
c g(1,m,n,i) = sum(g(1,k,n,i)*sin(pi*m*k/ny))
c g(2,m,n,i) = (.5*g(2,1,n,i) + ((-1)**m)*g(2,ny+1,n,i)
c              + sum(g(2,k,n,i)*cos(pi*m*k/ny))
c if isign = 1, a forward sine-cosine transforms are performed
c g(1,k,n,i) = sum(g(1,m,n,i)*sin(pi*m*k/ny))
c g(2,k,n,i) = 2*(.5*g(2,1,n,i) + ((-1)**m)*g(2,ny+1,n,i)
c              + sum(g(2,m,n,i)*cos(pi*m*k/ny))
c mixup = array of bit reversed addresses
c sctd = sine/cosine table
c kstrt = starting data block number
c kxp = number of data values per block in x
c kxpi = initial x index used
c kxpp = number of x indices used
c nyv = first dimension of g >= ny + 1
c kxpd = second dimension of g >= kxp + 1
c jblok = number of data blocks in x
c nxhyd = maximum of (nx/2,ny)
c nxyd = maximum of (nx,ny)
c written by viktor k. decyk, ucla
      implicit none
      integer isign, mixup, indx, indy, kstrt, kxp, kxpi, kxpp
      integer nyv, kxpd, jblok, nxhyd, nxyd
      real g
      complex sctd
      dimension g(2,nyv,kxpd,jblok)
      dimension mixup(nxhyd), sctd(nxyd)
c local data
      integer indx1, indy1, indx1y, nx, ny, nyh, nyhh, ny3, nxy, nxhy
      integer i, j, k, l, m, ks, km, kmr, nry, j1, j2, ns, ns2, k1, k2
      integer kxps, kxpt, jj
      real at1, at2, at3, t2, t3, t4, t5, t6, ani, sum1, sum2
      complex t1
      indx1 = indx - 1
      indy1 = indy - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      ny = 2**indy
      nyh = ny/2
      nyhh = ny/4
      ny3 = ny + 3
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 2
      kxps = kxpi + kxpp - 1
      if (kstrt.gt.nx) return
      if (isign.eq.0) return
c create auxiliary array in y
      kmr = nxy/ny
      kxpt = kxps
      do 30 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 20 j = kxpi, kxpt
      sum1 = .5*(g(2,1,j,l) - g(2,ny+1,j,l))
      do 10 k = 2, nyh
      k1 = 1 + kmr*(k - 1)
      at3 = -aimag(sctd(k1))
      at2 = g(1,ny+2-k,j,l)
      at1 = g(1,k,j,l) + at2
      at2 = g(1,k,j,l) - at2
      at1 = at3*at1
      at2 = .5*at2
      g(1,k,j,l) = at1 + at2
      g(1,ny+2-k,j,l) = at1 - at2
      at2 = g(2,ny+2-k,j,l)
      at1 = g(2,k,j,l) + at2
      at2 = g(2,k,j,l) - at2
      sum1 = sum1 + real(sctd(k1))*at2
      at2 = at3*at2
      at1 = .5*at1
      g(2,k,j,l) = at1 - at2
      g(2,ny+2-k,j,l) = at1 + at2
   10 continue
      g(1,1,j,l) = 0.0
      g(1,nyh+1,j,l) = 2.0*g(1,nyh+1,j,l)
      g(2,1,j,l) = .5*(g(2,1,j,l) + g(2,ny+1,j,l))
      g(2,ny+1,j,l) = sum1
   20 continue
   30 continue
c bit-reverse array elements in y
      nry = nxhy/nyh
      kxpt = kxps
      do 70 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 60 k = 1, nyh
      k1 = (mixup(k) - 1)/nry + 1
      if (k.ge.k1) go to 60
      do 50 j = kxpi, kxpt
      do 40 jj = 1, 2
      t2 = g(jj,2*k1-1,j,l)
      t3 = g(jj,2*k1,j,l)
      g(jj,2*k1-1,j,l) = g(jj,2*k-1,j,l)
      g(jj,2*k1,j,l) = g(jj,2*k,j,l)
      g(jj,2*k-1,j,l) = t2
      g(jj,2*k,j,l) = t3
   40 continue
   50 continue
   60 continue
   70 continue
c first transform in y
      nry = nxy/nyh
      do 130 m = 1, indy1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nyhh/ns
      kmr = 2*km*nry
      kxpt = kxps
      do 120 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 110 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 100 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 90 i = kxpi, kxpt
      do 80 jj = 1, 2
      t2 = real(t1)*g(jj,2*j2-1,i,l) - aimag(t1)*g(jj,2*j2,i,l)
      t3 = aimag(t1)*g(jj,2*j2-1,i,l) + real(t1)*g(jj,2*j2,i,l)
      g(jj,2*j2-1,i,l) = g(jj,2*j1-1,i,l) - t2
      g(jj,2*j2,i,l) = g(jj,2*j1,i,l) - t3
      g(jj,2*j1-1,i,l) = g(jj,2*j1-1,i,l) + t2
      g(jj,2*j1,i,l) = g(jj,2*j1,i,l) + t3
   80 continue
   90 continue
  100 continue
  110 continue
  120 continue
  130 continue
c unscramble coefficients and normalize
c inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nyh
         ani = 0.5
         kxpt = kxps
         do 190 l = 1, jblok
         if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
         do 160 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 150 j = kxpi, kxpt
         do 140 jj = 1, 2
         t4 = g(jj,ny3-2*k,j,l)
         t5 = -g(jj,ny3-2*k+1,j,l)
         t2 = g(jj,2*k-1,j,l) + t4
         t3 = g(jj,2*k,j,l) + t5
         t6 = g(jj,2*k-1,j,l) - t4
         t5 = g(jj,2*k,j,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,j,l) = ani*(t2 + t4)
         g(jj,2*k,j,l) = ani*(t3 + t5)
         g(jj,ny3-2*k,j,l) = ani*(t2 - t4)
         g(jj,ny3-2*k+1,j,l) = ani*(t5 - t3)
  140    continue
  150    continue
  160    continue
         do 180 j = kxpi, kxpt
         do 170 jj = 1, 2
         g(jj,nyh+1,j,l) = g(jj,nyh+1,j,l)
         g(jj,nyh+2,j,l) = -g(jj,nyh+2,j,l)
         t2 = g(jj,1,j,l) + g(jj,2,j,l)
         g(jj,2,j,l) = g(jj,1,j,l) - g(jj,2,j,l)
         g(jj,1,j,l) = t2
         g(jj,ny+1,j,l) = g(jj,ny+1,j,l)
  170    continue
  180    continue
  190    continue
c forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nyh
         kxpt = kxps
         do 250 l = 1, jblok
         if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
         do 220 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 210 j = kxpi, kxpt
         do 200 jj = 1, 2
         t4 = g(jj,ny3-2*k,j,l)
         t5 = -g(jj,ny3-2*k+1,j,l)
         t2 = g(jj,2*k-1,j,l) + t4
         t3 = g(jj,2*k,j,l) + t5
         t6 = g(jj,2*k-1,j,l) - t4
         t5 = g(jj,2*k,j,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,j,l) = t2 + t4
         g(jj,2*k,j,l) = t3 + t5
         g(jj,ny3-2*k,j,l) = t2 - t4
         g(jj,ny3-2*k+1,j,l) = t5 - t3
  200    continue
  210    continue
  220    continue
         do 240 j = kxpi, kxpt
         do 230 jj = 1, 2
         g(jj,nyh+1,j,l) = 2.0*g(jj,nyh+1,j,l)
         g(jj,nyh+2,j,l) = -2.0*g(jj,nyh+2,j,l)
         t2 = 2.0*(g(jj,1,j,l) + g(jj,2,j,l))
         g(jj,2,j,l) = 2.0*(g(jj,1,j,l) - g(jj,2,j,l))
         g(jj,1,j,l) = t2
         g(jj,ny+1,j,l) = 2.0*g(jj,ny+1,j,l)
  230    continue
  240    continue
  250    continue
      endif
c perform recursion for sine-cosine transform
      kxpt = kxps
      do 280 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 270 j = kxpi, kxpt
      sum1 = .5*g(1,1,j,l)
      g(1,1,j,l) = 0.0
      g(1,2,j,l) = sum1
      sum2 = g(2,ny+1,j,l)
      g(2,ny+1,j,l) = g(2,2,j,l)
      g(2,2,j,l) = sum2
      do 260 k = 2, nyh
      sum1 = sum1 + g(1,2*k-1,j,l)
      g(1,2*k-1,j,l) = -g(1,2*k,j,l)
      g(1,2*k,j,l) = sum1
      sum2 = sum2 - g(2,2*k,j,l)
      g(2,2*k,j,l) = sum2
  260 continue
      g(1,ny+1,j,l) = 0.0
  270 continue
  280 continue
      return
      end
c-----------------------------------------------------------------------
      subroutine PFCST2R2Y(g,isign,mixup,sctd,indx,indy,kstrt,kxp,kxpi,k
     1xpp,nyv,kxpd,jblok,nxhyd,nxyd)
c this subroutine performs the y part of 2 two dimensional fast real
c sine and cosine transforms and their inverses, for a subset of x,
c using real arithmetic, for data which is distributed in blocks
c algorithm is described in Numerical Recipies in Fortran, Second Ed.,
c by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
c [Cambridge Univ. Press, 1992], p. 508.
c for isign = (-1,1), input: all, output: f
c approximate flop count: N*(5*log2(N) + 18)/nvp
c where N = (nx/2)*ny
c indx/indy = exponent which determines length in x/y direction,
c where nx=2**indx, ny=2**indy
c if isign = -1, inverse sine-cosine transform are performed
c g(1,m,n,i) = (.5*g(1,1,n,i) + ((-1)**m)*g(1,ny+1,n,i)
c              + sum(g(1,k,n,i)*cos(pi*m*k/ny))
c g(2,m,n,i) = sum(g(2,k,n,i)*sin(pi*m*k/ny))
c if isign = 1, a forward sine-cosine transforms are performed
c g(1,k,n,i) = 2*(.5*g(1,1,n,i) + ((-1)**m)*g(1,ny+1,n,i)
c              + sum(g(1,m,n,i)*cos(pi*m*k/ny))
c g(2,k,n,i) = sum(g(2,m,n,i)*sin(pi*m*k/ny))
c mixup = array of bit reversed addresses
c sctd = sine/cosine table
c kstrt = starting data block number
c kxp = number of data values per block in x
c kxpi = initial x index used
c kxpp = number of x indices used
c nyv = first dimension of g >= ny + 1
c kxpd = second dimension of g >= kxp + 1
c jblok = number of data blocks in x
c nxhyd = maximum of (nx/2,ny)
c nxyd = maximum of (nx,ny)
c written by viktor k. decyk, ucla
      implicit none
      integer isign, mixup, indx, indy, kstrt, kxp, kxpi, kxpp
      integer nyv, kxpd, jblok, nxhyd, nxyd
      real g
      complex sctd
      dimension g(2,nyv,kxpd,jblok)
      dimension mixup(nxhyd), sctd(nxyd)
c local data
      integer indx1, indy1, indx1y, nx, ny, nyh, nyhh, ny3, nxy, nxhy
      integer i, j, k, l, m, ks, km, kmr, nry, j1, j2, ns, ns2, k1, k2
      integer kxps, kxpt, jj
      real at1, at2, at3, t2, t3, t4, t5, t6, ani, sum1, sum2
      complex t1
      indx1 = indx - 1
      indy1 = indy - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      ny = 2**indy
      nyh = ny/2
      nyhh = ny/4
      ny3 = ny + 3
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 2
      kxps = kxpi + kxpp - 1
      if (kstrt.gt.nx) return
      if (isign.eq.0) return
c create auxiliary array in y
      kmr = nxy/ny
      kxpt = kxps
      do 30 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 20 j = kxpi, kxpt
      sum1 = .5*(g(1,1,j,l) - g(1,ny+1,j,l))
      do 10 k = 2, nyh
      k1 = 1 + kmr*(k - 1)
      at3 = -aimag(sctd(k1))
      at2 = g(1,ny+2-k,j,l)
      at1 = g(1,k,j,l) + at2
      at2 = g(1,k,j,l) - at2
      sum1 = sum1 + real(sctd(k1))*at2
      at2 = at3*at2
      at1 = .5*at1
      g(1,k,j,l) = at1 - at2
      g(1,ny+2-k,j,l) = at1 + at2
      at2 = g(2,ny+2-k,j,l)
      at1 = g(2,k,j,l) + at2
      at2 = g(2,k,j,l) - at2
      at1 = at3*at1
      at2 = .5*at2
      g(2,k,j,l) = at1 + at2
      g(2,ny+2-k,j,l) = at1 - at2
   10 continue
      g(1,1,j,l) = .5*(g(1,1,j,l) + g(1,ny+1,j,l))
      g(1,ny+1,j,l) = sum1
      g(2,1,j,l) = 0.0
      g(2,nyh+1,j,l) = 2.0*g(2,nyh+1,j,l)
   20 continue
   30 continue
c bit-reverse array elements in y
      nry = nxhy/nyh
      kxpt = kxps
      do 70 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 60 k = 1, nyh
      k1 = (mixup(k) - 1)/nry + 1
      if (k.ge.k1) go to 60
      do 50 j = kxpi, kxpt
      do 40 jj = 1, 2
      t2 = g(jj,2*k1-1,j,l)
      t3 = g(jj,2*k1,j,l)
      g(jj,2*k1-1,j,l) = g(jj,2*k-1,j,l)
      g(jj,2*k1,j,l) = g(jj,2*k,j,l)
      g(jj,2*k-1,j,l) = t2
      g(jj,2*k,j,l) = t3
   40 continue
   50 continue
   60 continue
   70 continue
c first transform in y
      nry = nxy/nyh
      do 130 m = 1, indy1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nyhh/ns
      kmr = 2*km*nry
      kxpt = kxps
      do 120 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 110 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 100 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 90 i = kxpi, kxpt
      do 80 jj = 1, 2
      t2 = real(t1)*g(jj,2*j2-1,i,l) - aimag(t1)*g(jj,2*j2,i,l)
      t3 = aimag(t1)*g(jj,2*j2-1,i,l) + real(t1)*g(jj,2*j2,i,l)
      g(jj,2*j2-1,i,l) = g(jj,2*j1-1,i,l) - t2
      g(jj,2*j2,i,l) = g(jj,2*j1,i,l) - t3
      g(jj,2*j1-1,i,l) = g(jj,2*j1-1,i,l) + t2
      g(jj,2*j1,i,l) = g(jj,2*j1,i,l) + t3
   80 continue
   90 continue
  100 continue
  110 continue
  120 continue
  130 continue
c unscramble coefficients and normalize
c inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nyh
         ani = 0.5
         kxpt = kxps
         do 190 l = 1, jblok
         if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
         do 160 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 150 j = kxpi, kxpt
         do 140 jj = 1, 2
         t4 = g(jj,ny3-2*k,j,l)
         t5 = -g(jj,ny3-2*k+1,j,l)
         t2 = g(jj,2*k-1,j,l) + t4
         t3 = g(jj,2*k,j,l) + t5
         t6 = g(jj,2*k-1,j,l) - t4
         t5 = g(jj,2*k,j,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,j,l) = ani*(t2 + t4)
         g(jj,2*k,j,l) = ani*(t3 + t5)
         g(jj,ny3-2*k,j,l) = ani*(t2 - t4)
         g(jj,ny3-2*k+1,j,l) = ani*(t5 - t3)
  140    continue
  150    continue
  160    continue
         do 180 j = kxpi, kxpt
         do 170 jj = 1, 2
         g(jj,nyh+1,j,l) = g(jj,nyh+1,j,l)
         g(jj,nyh+2,j,l) = -g(jj,nyh+2,j,l)
         t2 = g(jj,1,j,l) + g(jj,2,j,l)
         g(jj,2,j,l) = g(jj,1,j,l) - g(jj,2,j,l)
         g(jj,1,j,l) = t2
         g(jj,ny+1,j,l) = g(jj,ny+1,j,l)
  170    continue
  180    continue
  190    continue
c forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nyh
         kxpt = kxps
         do 250 l = 1, jblok
         if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
         do 220 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 210 j = kxpi, kxpt
         do 200 jj = 1, 2
         t4 = g(jj,ny3-2*k,j,l)
         t5 = -g(jj,ny3-2*k+1,j,l)
         t2 = g(jj,2*k-1,j,l) + t4
         t3 = g(jj,2*k,j,l) + t5
         t6 = g(jj,2*k-1,j,l) - t4
         t5 = g(jj,2*k,j,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,j,l) = t2 + t4
         g(jj,2*k,j,l) = t3 + t5
         g(jj,ny3-2*k,j,l) = t2 - t4
         g(jj,ny3-2*k+1,j,l) = t5 - t3
  200    continue
  210    continue
  220    continue
         do 240 j = kxpi, kxpt
         do 230 jj = 1, 2
         g(jj,nyh+1,j,l) = 2.0*g(jj,nyh+1,j,l)
         g(jj,nyh+2,j,l) = -2.0*g(jj,nyh+2,j,l)
         t2 = 2.0*(g(jj,1,j,l) + g(jj,2,j,l))
         g(jj,2,j,l) = 2.0*(g(jj,1,j,l) - g(jj,2,j,l))
         g(jj,1,j,l) = t2
         g(jj,ny+1,j,l) = 2.0*g(jj,ny+1,j,l)
  230    continue
  240    continue
  250    continue
      endif
c perform recursion for sine-cosine transform
      kxpt = kxps
      do 280 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 270 j = kxpi, kxpt
      sum1 = g(1,ny+1,j,l)
      g(1,ny+1,j,l) = g(1,2,j,l)
      g(1,2,j,l) = sum1
      sum2 = .5*g(2,1,j,l)
      g(2,1,j,l) = 0.0
      g(2,2,j,l) = sum2
      do 260 k = 2, nyh
      sum1 = sum1 - g(1,2*k,j,l)
      g(1,2*k,j,l) = sum1
      sum2 = sum2 + g(2,2*k-1,j,l)
      g(2,2*k-1,j,l) = -g(2,2*k,j,l)
      g(2,2*k,j,l) = sum2
  260 continue
      g(2,ny+1,j,l) = 0.0
  270 continue
  280 continue
      return
      end
c-----------------------------------------------------------------------
      subroutine WPFCST2R3(f,g,bs,br,isign,ntpose,mixup,sctd,ttp,indx,in
     1dy,kstrt,nxvh,nyv,kxp2,kyp,kypd,kxp2d,jblok,kblok,nxhyd,nxyd)
c wrapper function for 3 parallel real sine/sine transforms
c for the electric field with dirichlet or magnetic field with neumann
c boundary conditions
      implicit none
      integer isign, ntpose, mixup, indx, indy, kstrt, nxvh, nyv
      integer kxp2, kyp, kypd, kxp2d, jblok, kblok, nxhyd, nxyd
      real f, g, bs, br, ttp
      complex sctd
      dimension f(3,2*nxvh,kypd,kblok), g(3,nyv,kxp2d,jblok)
      dimension bs(3,kxp2+1,kyp+1,kblok), br(3,kxp2+1,kyp+1,jblok)
      dimension mixup(nxhyd), sctd(nxyd)
c local data
      integer nx, ny, kxpi, kypi
      real tf
      double precision dtime
      data kxpi, kypi /1,1/
c calculate range of indices
      nx = 2**indx
      ny = 2**indy
c inverse fourier transform
      if (isign.lt.0) then
c perform x cosine-sine transform
         call PFCSST2R3X(f,isign,mixup,sctd,indx,indy,kstrt,kyp,kypi,kyp
     1,nxvh,kypd,kblok,nxhyd,nxyd)
c transpose f array to g
         call PWTIMERA(-1,ttp,dtime)
         call PR3TPOSE(f,g,bs,br,nx,ny,kstrt,2*nxvh,nyv,kxp2,kyp,kxp2d,k
     1ypd,jblok,kblok)
         call PWTIMERA(1,ttp,dtime)
c perform y sine-cosine transform
         call PFSCST2R3Y(g,isign,mixup,sctd,indx,indy,kstrt,kxp2,kxpi,kx
     1p2,nyv,kxp2d,jblok,nxhyd,nxyd)
c transpose g array to f
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PR3TPOSE(g,f,br,bs,ny,nx,kstrt,nyv,2*nxvh,kyp,kxp2,kypd
     1,kxp2d,kblok,jblok)
            call PWTIMERA(1,tf,dtime)
         endif
c forward fourier transform
      else if (isign.gt.0) then
c transpose f array to g
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PR3TPOSE(f,g,bs,br,nx,ny,kstrt,2*nxvh,nyv,kxp2,kyp,kxp2
     1d,kypd,jblok,kblok)
            call PWTIMERA(1,tf,dtime)
         endif
c perform y sine-cosine transform
         call PFSCST2R3Y(g,isign,mixup,sctd,indx,indy,kstrt,kxp2,kxpi,kx
     1p2,nyv,kxp2d,jblok,nxhyd,nxyd)
c transpose g array to f
         call PWTIMERA(-1,ttp,dtime)
         call PR3TPOSE(g,f,br,bs,ny,nx,kstrt,nyv,2*nxvh,kyp,kxp2,kypd,kx
     1p2d,kblok,jblok)
         call PWTIMERA(1,ttp,dtime)
c perform x cosine-sine transform
         call PFCSST2R3X(f,isign,mixup,sctd,indx,indy,kstrt,kyp,kypi,kyp
     1,nxvh,kypd,kblok,nxhyd,nxyd)
      endif
      if (ntpose.eq.0) ttp = ttp + tf
      return
      end
c-----------------------------------------------------------------------
      subroutine WPFSCT2R3(f,g,bs,br,isign,ntpose,mixup,sctd,ttp,indx,in
     1dy,kstrt,nxvh,nyv,kxp2,kyp,kypd,kxp2d,jblok,kblok,nxhyd,nxyd)
c wrapper function for 3 parallel real sine/sine transforms
c for the magnetic field with dirichlet or electric field with neumann
c boundary conditions
      implicit none
      integer isign, ntpose, mixup, indx, indy, kstrt, nxvh, nyv
      integer kxp2, kyp, kypd, kxp2d, jblok, kblok, nxhyd, nxyd
      real f, g, bs, br, ttp
      complex sctd
      dimension f(3,2*nxvh,kypd,kblok), g(3,nyv,kxp2d,jblok)
      dimension bs(3,kxp2+1,kyp+1,kblok), br(3,kxp2+1,kyp+1,jblok)
      dimension mixup(nxhyd), sctd(nxyd)
c local data
      integer nx, ny, kxpi, kypi
      real tf
      double precision dtime
      data kxpi, kypi /1,1/
c calculate range of indices
      nx = 2**indx
      ny = 2**indy
c inverse fourier transform
      if (isign.lt.0) then
c perform x sine-cosine transform
         call PFSCCT2R3X(f,isign,mixup,sctd,indx,indy,kstrt,kyp,kypi,kyp
     1,nxvh,kypd,kblok,nxhyd,nxyd)
c transpose f array to g
         call PWTIMERA(-1,ttp,dtime)
         call PR3TPOSE(f,g,bs,br,nx,ny,kstrt,2*nxvh,nyv,kxp2,kyp,kxp2d,k
     1ypd,jblok,kblok)
         call PWTIMERA(1,ttp,dtime)
c perform y cosine-sine transform
         call PFCSCT2R3Y(g,isign,mixup,sctd,indx,indy,kstrt,kxp2,kxpi,kx
     1p2,nyv,kxp2d,jblok,nxhyd,nxyd)
c transpose g array to f
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PR3TPOSE(g,f,br,bs,ny,nx,kstrt,nyv,2*nxvh,kyp,kxp2,kypd
     1,kxp2d,kblok,jblok)
            call PWTIMERA(1,tf,dtime)
         endif
c forward fourier transform
      else if (isign.gt.0) then
c transpose f array to g
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PR3TPOSE(f,g,bs,br,nx,ny,kstrt,2*nxvh,nyv,kxp2,kyp,kxp2
     1d,kypd,jblok,kblok)
            call PWTIMERA(1,tf,dtime)
         endif
c perform y cosine-sine transform
         call PFCSCT2R3Y(g,isign,mixup,sctd,indx,indy,kstrt,kxp2,kxpi,kx
     1p2,nyv,kxp2d,jblok,nxhyd,nxyd)
c transpose g array to f
         call PWTIMERA(-1,ttp,dtime)
         call PR3TPOSE(g,f,br,bs,ny,nx,kstrt,nyv,2*nxvh,kyp,kxp2,kypd,kx
     1p2d,kblok,jblok)
         call PWTIMERA(1,ttp,dtime)
c perform x sine-cosine transform
         call PFSCCT2R3X(f,isign,mixup,sctd,indx,indy,kstrt,kyp,kypi,kyp
     1,nxvh,kypd,kblok,nxhyd,nxyd)
      endif
      if (ntpose.eq.0) ttp = ttp + tf
      return
      end
c-----------------------------------------------------------------------
      subroutine PFCSST2R3X(f,isign,mixup,sctd,indx,indy,kstrt,kyp,kypi,
     1kypp,nxvh,kypd,kblok,nxhyd,nxyd)
c this subroutine performs the x part of 2 two dimensional fast real
c sine and cosine transforms and their inverses, for a subset of y,
c using real arithmetic, for data which is distributed in blocks
c algorithm is described in Numerical Recipies in Fortran, Second Ed.,
c by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
c [Cambridge Univ. Press, 1992], p. 508.
c for isign = (-1,1), input: all, output: f
c approximate flop count: N*(5*log2(N) + 18)/nvp
c where N = (nx/2)*ny
c indx/indy = exponent which determines length in x/y direction,
c where nx=2**indx, ny=2**indy
c if isign = -1, inverse sine-cosine transforms are performed
c f(1,n,k,i) = (1/nx*ny)*(.5*f(1,1,k,i) + ((-1)**n)*f(1,nx+1,k,i)
c              + sum(f(1,j,k,i)*cos(pi*n*j/nx)))
c f(2:3,n,k,i) = (1/nx*ny)*sum(f(2:3,j,k,i)*sin(pi*n*j/nx))
c if isign = 1, forward sine transforms are performed
c f(1,j,k,i) = 2*(.5*f(1,1,k,i) + ((-1)**j)*f(1,n+1,k,i)
c              + sum(f(1,n,k,i)*cos(pi*n*j/nx))
c f(2:3,j,k,i) = sum(f(2:3,n,k,i)*sin(pi*n*j/nx))
c mixup = array of bit reversed addresses
c sctd = sine/cosine table
c kstrt = starting data block number
c kyp = number of data values per block in y
c kypi = initial y index used
c kypp = number of y indices used
c nxvh = first dimension of f >= nx/2 + 1
c kypd = second dimension of f >= kyp + 1
c kblok = number of data blocks in y
c nxhyd = maximum of (nx/2,ny)
c nxyd = maximum of (nx,ny)
c written by viktor k. decyk, ucla
      implicit none
      integer isign, mixup, indx, indy, kstrt, kyp, kypi, kypp
      integer nxvh, kypd, kblok, nxhyd, nxyd
      real f
      complex sctd
      dimension f(3,2*nxvh,kypd,kblok)
      dimension mixup(nxhyd), sctd(nxyd)
c local data
      integer indx1, indx1y, nx, nxh, nxhh, nx3, ny, nxy, nxhy, ks, kypt
      integer i, j, k, l, m, km, kmr, nrx, j1, j2, ns, ns2, k1, k2, kyps
      integer jj
      real at1, at2, at3, t2, t3, t4, t5, t6, ani, sum1, sum2, sum3
      complex t1
      indx1 = indx - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      nxh = nx/2
      nxhh = nx/4
      nx3 = nx + 3
      ny = 2**indy
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 2
      kyps = kypi + kypp - 1
      if (kstrt.gt.ny) return
      if (isign.eq.0) return
c create auxiliary array in x
      kmr = nxy/nx
      kypt = kyps
      do 30 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 20 k = kypi, kypt
      sum1 = .5*(f(1,1,k,l) - f(1,nx+1,k,l))
      do 10 j = 2, nxh
      j1 = 1 + kmr*(j - 1)
      at3 = -aimag(sctd(j1))
      at2 = f(1,nx+2-j,k,l)
      at1 = f(1,j,k,l) + at2
      at2 = f(1,j,k,l) - at2
      sum1 = sum1 + real(sctd(j1))*at2
      at2 = at3*at2
      at1 = .5*at1
      f(1,j,k,l) = at1 - at2
      f(1,nx+2-j,k,l) = at1 + at2
      at2 = f(2,nx+2-j,k,l)
      at1 = f(2,j,k,l) + at2
      at2 = f(2,j,k,l) - at2
      at1 = at3*at1
      at2 = .5*at2
      f(2,j,k,l) = at1 + at2
      f(2,nx+2-j,k,l) = at1 - at2
      at2 = f(3,nx+2-j,k,l)
      at1 = f(3,j,k,l) + at2
      at2 = f(3,j,k,l) - at2
      at1 = at3*at1
      at2 = .5*at2
      f(3,j,k,l) = at1 + at2
      f(3,nx+2-j,k,l) = at1 - at2
   10 continue
      f(1,1,k,l) = .5*(f(1,1,k,l) + f(1,nx+1,k,l))
      f(1,nx+1,k,l) = sum1
      f(2,1,k,l) = 0.0
      f(2,nxh+1,k,l) = 2.0*f(2,nxh+1,k,l)
      f(3,1,k,l) = 0.0
      f(3,nxh+1,k,l) = 2.0*f(3,nxh+1,k,l)
   20 continue
   30 continue
c bit-reverse array elements in x
      nrx = nxhy/nxh
      kypt = kyps
      do 70 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 60 j = 1, nxh
      j1 = (mixup(j) - 1)/nrx + 1
      if (j.ge.j1) go to 60
      do 50 k = kypi, kypt
      do 40 jj = 1, 3
      t2 = f(jj,2*j1-1,k,l)
      t3 = f(jj,2*j1,k,l)
      f(jj,2*j1-1,k,l) = f(jj,2*j-1,k,l)
      f(jj,2*j1,k,l) = f(jj,2*j,k,l)
      f(jj,2*j-1,k,l) = t2
      f(jj,2*j,k,l) = t3
   40 continue
   50 continue
   60 continue
   70 continue
c first transform in x
      nrx = nxy/nxh
      do 130 m = 1, indx1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nxhh/ns
      kmr = 2*km*nrx
      kypt = kyps
      do 120 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 110 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 100 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 90 i = kypi, kypt
      do 80 jj = 1, 3
      t2 = real(t1)*f(jj,2*j2-1,i,l) - aimag(t1)*f(jj,2*j2,i,l)
      t3 = aimag(t1)*f(jj,2*j2-1,i,l) + real(t1)*f(jj,2*j2,i,l)
      f(jj,2*j2-1,i,l) = f(jj,2*j1-1,i,l) - t2
      f(jj,2*j2,i,l) = f(jj,2*j1,i,l) - t3
      f(jj,2*j1-1,i,l) = f(jj,2*j1-1,i,l) + t2
      f(jj,2*j1,i,l) = f(jj,2*j1,i,l) + t3
   80 continue
   90 continue
  100 continue
  110 continue
  120 continue
  130 continue
c unscramble coefficients and normalize
c inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nxh
         ani = 1./float(2*nx*ny)
         kypt = kyps
         do 190 l = 1, kblok
         if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
         do 160 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 150 k = kypi, kypt
         do 140 jj = 1, 3
         t4 = f(jj,nx3-2*j,k,l)
         t5 = -f(jj,nx3-2*j+1,k,l)
         t2 = f(jj,2*j-1,k,l) + t4
         t3 = f(jj,2*j,k,l) + t5
         t6 = f(jj,2*j-1,k,l) - t4
         t5 = f(jj,2*j,k,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,k,l) = ani*(t2 + t4)
         f(jj,2*j,k,l) = ani*(t3 + t5)
         f(jj,nx3-2*j,k,l) = ani*(t2 - t4)
         f(jj,nx3-2*j+1,k,l) = ani*(t5 - t3)
  140    continue
  150    continue
  160    continue
         ani = 2.*ani
         do 180 k = kypi, kypt
         do 170 jj = 1, 3
         f(jj,nxh+1,k,l) = ani*f(jj,nxh+1,k,l)
         f(jj,nxh+2,k,l) = -ani*f(jj,nxh+2,k,l)
         t2 = ani*(f(jj,1,k,l) + f(jj,2,k,l))
         f(jj,2,k,l) = ani*(f(jj,1,k,l) - f(jj,2,k,l))
         f(jj,1,k,l) = t2
         f(jj,nx+1,k,l) = ani*f(jj,nx+1,k,l)
  170    continue
  180    continue
  190    continue
c forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nxh
         kypt = kyps
         do 250 l = 1, kblok
         if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
         do 220 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 210 k = kypi, kypt
         do 200 jj = 1, 3
         t4 = f(jj,nx3-2*j,k,l)
         t5 = -f(jj,nx3-2*j+1,k,l)
         t2 = f(jj,2*j-1,k,l) + t4
         t3 = f(jj,2*j,k,l) + t5
         t6 = f(jj,2*j-1,k,l) - t4
         t5 = f(jj,2*j,k,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,k,l) = t2 + t4
         f(jj,2*j,k,l) = t3 + t5
         f(jj,nx3-2*j,k,l) = t2 - t4
         f(jj,nx3-2*j+1,k,l) = t5 - t3
  200    continue
  210    continue
  220    continue
         do 240 k = kypi, kypt
         do 230 jj = 1, 3
         f(jj,nxh+1,k,l) = 2.0*f(jj,nxh+1,k,l)
         f(jj,nxh+2,k,l) = -2.0*f(jj,nxh+2,k,l)
         t2 = 2.0*(f(jj,1,k,l) + f(jj,2,k,l))
         f(jj,2,k,l) = 2.0*(f(jj,1,k,l) - f(jj,2,k,l))
         f(jj,1,k,l) = t2
         f(jj,nx+1,k,l) = 2.0*f(jj,nx+1,k,l)
  230    continue
  240    continue
  250    continue
      endif
c perform recursion for cosine-sine transform
      kypt = kyps
      do 280 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 270 k = kypi, kypt
      sum1 = f(1,nx+1,k,l)
      f(1,nx+1,k,l) = f(1,2,k,l)
      f(1,2,k,l) = sum1
      sum2 = .5*f(2,1,k,l)
      f(2,1,k,l) = 0.0
      f(2,2,k,l) = sum2
      sum3 = .5*f(3,1,k,l)
      f(3,1,k,l) = 0.0
      f(3,2,k,l) = sum3
      do 260 j = 2, nxh
      sum1 = sum1 - f(1,2*j,k,l)
      f(1,2*j,k,l) = sum1
      sum2 = sum2 + f(2,2*j-1,k,l)
      f(2,2*j-1,k,l) = -f(2,2*j,k,l)
      f(2,2*j,k,l) = sum2
      sum3 = sum3 + f(3,2*j-1,k,l)
      f(3,2*j-1,k,l) = -f(3,2*j,k,l)
      f(3,2*j,k,l) = sum3
  260 continue
      f(2,nx+1,k,l) = 0.0
      f(3,nx+1,k,l) = 0.0
  270 continue
  280 continue
      return
      end
c-----------------------------------------------------------------------
      subroutine PFSCCT2R3X(f,isign,mixup,sctd,indx,indy,kstrt,kyp,kypi,
     1kypp,nxvh,kypd,kblok,nxhyd,nxyd)
c this subroutine performs the x part of 3 two dimensional fast real
c sine and cosine transforms and their inverses, for a subset of y,
c using real arithmetic, for data which is distributed in blocks
c algorithm is described in Numerical Recipies in Fortran, Second Ed.,
c by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
c [Cambridge Univ. Press, 1992], p. 508.
c for isign = (-1,1), input: all, output: f
c approximate flop count: N*(5*log2(N) + 18)/nvp
c where N = (nx/2)*ny
c indx/indy = exponent which determines length in x/y direction,
c where nx=2**indx, ny=2**indy
c if isign = -1, inverse sine-cosine transforms are performed
c f(1,n,k,i) = (1/nx*ny)*sum(f(1,j,k,i)*sin(pi*n*j/nx))
c f(2:3,n,k,i) = (1/nx*ny)*(.5*f(2:3,1,k,i) + ((-1)**n)*f(2:3,nx+1,k,i)
c              + sum(f(2:3,j,k,i)*cos(pi*n*j/nx)))
c if isign = 1, forward sine transforms are performed
c f(1,j,k,i) = sum(f(1,n,k,i)*sin(pi*n*j/nx))
c f(2:3,j,k,i) = 2*(.5*f(2:3,1,k,i) + ((-1)**j)*f(2:3,n+1,k,i)
c              + sum(f(2:3,n,k,i)*cos(pi*n*j/nx))
c mixup = array of bit reversed addresses
c sctd = sine/cosine table
c kstrt = starting data block number
c kyp = number of data values per block in y
c kypi = initial y index used
c kypp = number of y indices used
c nxvh = first dimension of f >= nx/2 + 1
c kypd = second dimension of f >= kyp + 1
c kblok = number of data blocks in y
c nxhyd = maximum of (nx/2,ny)
c nxyd = maximum of (nx,ny)
c written by viktor k. decyk, ucla
      implicit none
      integer isign, mixup, indx, indy, kstrt, kyp, kypi, kypp
      integer nxvh, kypd, kblok, nxhyd, nxyd
      real f
      complex sctd
      dimension f(3,2*nxvh,kypd,kblok)
      dimension mixup(nxhyd), sctd(nxyd)
c local data
      integer indx1, indx1y, nx, nxh, nxhh, nx3, ny, nxy, nxhy, ks, kypt
      integer i, j, k, l, m, km, kmr, nrx, j1, j2, ns, ns2, k1, k2, kyps
      integer jj
      real at1, at2, at3, t2, t3, t4, t5, t6, ani, sum1, sum2, sum3
      complex t1
      indx1 = indx - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      nxh = nx/2
      nxhh = nx/4
      nx3 = nx + 3
      ny = 2**indy
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 2
      kyps = kypi + kypp - 1
      if (kstrt.gt.ny) return
      if (isign.eq.0) return
c create auxiliary array in x
      kmr = nxy/nx
      kypt = kyps
      do 30 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 20 k = kypi, kypt
      sum1 = .5*(f(2,1,k,l) - f(2,nx+1,k,l))
      sum2 = .5*(f(3,1,k,l) - f(3,nx+1,k,l))
      do 10 j = 2, nxh
      j1 = 1 + kmr*(j - 1)
      at3 = -aimag(sctd(j1))
      at2 = f(1,nx+2-j,k,l)
      at1 = f(1,j,k,l) + at2
      at2 = f(1,j,k,l) - at2
      at1 = at3*at1
      at2 = .5*at2
      f(1,j,k,l) = at1 + at2
      f(1,nx+2-j,k,l) = at1 - at2
      at2 = f(2,nx+2-j,k,l)
      at1 = f(2,j,k,l) + at2
      at2 = f(2,j,k,l) - at2
      sum1 = sum1 + real(sctd(j1))*at2
      at2 = at3*at2
      at1 = .5*at1
      f(2,j,k,l) = at1 - at2
      f(2,nx+2-j,k,l) = at1 + at2
      at2 = f(3,nx+2-j,k,l)
      at1 = f(3,j,k,l) + at2
      at2 = f(3,j,k,l) - at2
      sum2 = sum2 + real(sctd(j1))*at2
      at2 = at3*at2
      at1 = .5*at1
      f(3,j,k,l) = at1 - at2
      f(3,nx+2-j,k,l) = at1 + at2
   10 continue
      f(1,1,k,l) = 0.0
      f(1,nxh+1,k,l) = 2.0*f(1,nxh+1,k,l)
      f(2,1,k,l) = .5*(f(2,1,k,l) + f(2,nx+1,k,l))
      f(2,nx+1,k,l) = sum1
      f(3,1,k,l) = .5*(f(3,1,k,l) + f(3,nx+1,k,l))
      f(3,nx+1,k,l) = sum2
   20 continue
   30 continue
c bit-reverse array elements in x
      nrx = nxhy/nxh
      kypt = kyps
      do 70 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 60 j = 1, nxh
      j1 = (mixup(j) - 1)/nrx + 1
      if (j.ge.j1) go to 60
      do 50 k = kypi, kypt
      do 40 jj = 1, 3
      t2 = f(jj,2*j1-1,k,l)
      t3 = f(jj,2*j1,k,l)
      f(jj,2*j1-1,k,l) = f(jj,2*j-1,k,l)
      f(jj,2*j1,k,l) = f(jj,2*j,k,l)
      f(jj,2*j-1,k,l) = t2
      f(jj,2*j,k,l) = t3
   40 continue
   50 continue
   60 continue
   70 continue
c first transform in x
      nrx = nxy/nxh
      do 130 m = 1, indx1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nxhh/ns
      kmr = 2*km*nrx
      kypt = kyps
      do 120 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 110 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 100 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 90 i = kypi, kypt
      do 80 jj = 1, 3
      t2 = real(t1)*f(jj,2*j2-1,i,l) - aimag(t1)*f(jj,2*j2,i,l)
      t3 = aimag(t1)*f(jj,2*j2-1,i,l) + real(t1)*f(jj,2*j2,i,l)
      f(jj,2*j2-1,i,l) = f(jj,2*j1-1,i,l) - t2
      f(jj,2*j2,i,l) = f(jj,2*j1,i,l) - t3
      f(jj,2*j1-1,i,l) = f(jj,2*j1-1,i,l) + t2
      f(jj,2*j1,i,l) = f(jj,2*j1,i,l) + t3
   80 continue
   90 continue
  100 continue
  110 continue
  120 continue
  130 continue
c unscramble coefficients and normalize
c inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nxh
         ani = 1./float(2*nx*ny)
         kypt = kyps
         do 190 l = 1, kblok
         if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
         do 160 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 150 k = kypi, kypt
         do 140 jj = 1, 3
         t4 = f(jj,nx3-2*j,k,l)
         t5 = -f(jj,nx3-2*j+1,k,l)
         t2 = f(jj,2*j-1,k,l) + t4
         t3 = f(jj,2*j,k,l) + t5
         t6 = f(jj,2*j-1,k,l) - t4
         t5 = f(jj,2*j,k,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,k,l) = ani*(t2 + t4)
         f(jj,2*j,k,l) = ani*(t3 + t5)
         f(jj,nx3-2*j,k,l) = ani*(t2 - t4)
         f(jj,nx3-2*j+1,k,l) = ani*(t5 - t3)
  140    continue
  150    continue
  160    continue
         ani = 2.*ani
         do 180 k = kypi, kypt
         do 170 jj = 1, 3
         f(jj,nxh+1,k,l) = ani*f(jj,nxh+1,k,l)
         f(jj,nxh+2,k,l) = -ani*f(jj,nxh+2,k,l)
         t2 = ani*(f(jj,1,k,l) + f(jj,2,k,l))
         f(jj,2,k,l) = ani*(f(jj,1,k,l) - f(jj,2,k,l))
         f(jj,1,k,l) = t2
         f(jj,nx+1,k,l) = ani*f(jj,nx+1,k,l)
  170    continue
  180    continue
  190    continue
c forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nxh
         kypt = kyps
         do 250 l = 1, kblok
         if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
         do 220 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 210 k = kypi, kypt
         do 200 jj = 1, 3
         t4 = f(jj,nx3-2*j,k,l)
         t5 = -f(jj,nx3-2*j+1,k,l)
         t2 = f(jj,2*j-1,k,l) + t4
         t3 = f(jj,2*j,k,l) + t5
         t6 = f(jj,2*j-1,k,l) - t4
         t5 = f(jj,2*j,k,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,k,l) = t2 + t4
         f(jj,2*j,k,l) = t3 + t5
         f(jj,nx3-2*j,k,l) = t2 - t4
         f(jj,nx3-2*j+1,k,l) = t5 - t3
  200    continue
  210    continue
  220    continue
         do 240 k = kypi, kypt
         do 230 jj = 1, 3
         f(jj,nxh+1,k,l) = 2.0*f(jj,nxh+1,k,l)
         f(jj,nxh+2,k,l) = -2.0*f(jj,nxh+2,k,l)
         t2 = 2.0*(f(jj,1,k,l) + f(jj,2,k,l))
         f(jj,2,k,l) = 2.0*(f(jj,1,k,l) - f(jj,2,k,l))
         f(jj,1,k,l) = t2
         f(jj,nx+1,k,l) = 2.0*f(jj,nx+1,k,l)
  230    continue
  240    continue
  250    continue
      endif
c perform recursion for cosine-sine transform
      kypt = kyps
      do 280 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 270 k = kypi, kypt
      sum1 = .5*f(1,1,k,l)
      f(1,1,k,l) = 0.0
      f(1,2,k,l) = sum1
      sum2 = f(2,nx+1,k,l)
      f(2,nx+1,k,l) = f(2,2,k,l)
      f(2,2,k,l) = sum2
      sum3 = f(3,nx+1,k,l)
      f(3,nx+1,k,l) = f(3,2,k,l)
      f(3,2,k,l) = sum3
      do 260 j = 2, nxh
      sum1 = sum1 + f(1,2*j-1,k,l)
      f(1,2*j-1,k,l) = -f(1,2*j,k,l)
      f(1,2*j,k,l) = sum1
      sum2 = sum2 - f(2,2*j,k,l)
      f(2,2*j,k,l) = sum2
      sum3 = sum3 - f(3,2*j,k,l)
      f(3,2*j,k,l) = sum3
  260 continue
      f(1,nx+1,k,l) = 0.0
  270 continue
  280 continue
      return
      end
c-----------------------------------------------------------------------
      subroutine PFSCST2R3Y(g,isign,mixup,sctd,indx,indy,kstrt,kxp,kxpi,
     1kxpp,nyv,kxpd,jblok,nxhyd,nxyd)
c this subroutine performs the y part of 3 two dimensional fast real
c sine and cosine transforms and their inverses, for a subset of x,
c using real arithmetic, for data which is distributed in blocks
c algorithm is described in Numerical Recipies in Fortran, Second Ed.,
c by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
c [Cambridge Univ. Press, 1992], p. 508.
c for isign = (-1,1), input: all, output: f
c approximate flop count: N*(5*log2(N) + 18)/nvp
c where N = (nx/2)*ny
c indx/indy = exponent which determines length in x/y direction,
c where nx=2**indx, ny=2**indy
c if isign = -1, inverse sine-cosine transform are performed
c g(1,m,n,i) = sum(g(1,k,n,i)*sin(pi*m*k/ny))
c g(2,m,n,i) = (.5*g(2,1,n,i) + ((-1)**m)*g(2,ny+1,n,i)
c              + sum(g(2,k,n,i)*cos(pi*m*k/ny))
c g(3,m,n,i) = sum(g(3,k,n,i)*sin(pi*m*k/ny))
c if isign = 1, a forward sine-cosine transforms are performed
c g(1,k,n,i) = sum(g(1,m,n,i)*sin(pi*m*k/ny))
c g(2,k,n,i) = 2*(.5*g(2,1,n,i) + ((-1)**m)*g(2,ny+1,n,i)
c              + sum(g(2,m,n,i)*cos(pi*m*k/ny))
c g(3,k,n,i) = sum(g(3,m,n,i)*sin(pi*m*k/ny))
c mixup = array of bit reversed addresses
c sctd = sine/cosine table
c kstrt = starting data block number
c kxp = number of data values per block in x
c kxpi = initial x index used
c kxpp = number of x indices used
c nyv = first dimension of g >= ny + 1
c kxpd = second dimension of g >= kxp + 1
c jblok = number of data blocks in x
c nxhyd = maximum of (nx/2,ny)
c nxyd = maximum of (nx,ny)
c written by viktor k. decyk, ucla
      implicit none
      integer isign, mixup, indx, indy, kstrt, kxp, kxpi, kxpp
      integer nyv, kxpd, jblok, nxhyd, nxyd
      real g
      complex sctd
      dimension g(3,nyv,kxpd,jblok)
      dimension mixup(nxhyd), sctd(nxyd)
c local data
      integer indx1, indy1, indx1y, nx, ny, nyh, nyhh, ny3, nxy, nxhy
      integer i, j, k, l, m, ks, km, kmr, nry, j1, j2, ns, ns2, k1, k2
      integer kxps, kxpt, jj
      real at1, at2, at3, t2, t3, t4, t5, t6, ani, sum1, sum2, sum3
      complex t1
      indx1 = indx - 1
      indy1 = indy - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      ny = 2**indy
      nyh = ny/2
      nyhh = ny/4
      ny3 = ny + 3
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 2
      kxps = kxpi + kxpp - 1
      if (kstrt.gt.nx) return
      if (isign.eq.0) return
c create auxiliary array in y
      kmr = nxy/ny
      kxpt = kxps
      do 30 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 20 j = kxpi, kxpt
      sum1 = .5*(g(2,1,j,l) - g(2,ny+1,j,l))
      do 10 k = 2, nyh
      k1 = 1 + kmr*(k - 1)
      at3 = -aimag(sctd(k1))
      at2 = g(1,ny+2-k,j,l)
      at1 = g(1,k,j,l) + at2
      at2 = g(1,k,j,l) - at2
      at1 = at3*at1
      at2 = .5*at2
      g(1,k,j,l) = at1 + at2
      g(1,ny+2-k,j,l) = at1 - at2
      at2 = g(2,ny+2-k,j,l)
      at1 = g(2,k,j,l) + at2
      at2 = g(2,k,j,l) - at2
      sum1 = sum1 + real(sctd(k1))*at2
      at2 = at3*at2
      at1 = .5*at1
      g(2,k,j,l) = at1 - at2
      g(2,ny+2-k,j,l) = at1 + at2
      at2 = g(3,ny+2-k,j,l)
      at1 = g(3,k,j,l) + at2
      at2 = g(3,k,j,l) - at2
      at1 = at3*at1
      at2 = .5*at2
      g(3,k,j,l) = at1 + at2
      g(3,ny+2-k,j,l) = at1 - at2
   10 continue
      g(1,1,j,l) = 0.0
      g(1,nyh+1,j,l) = 2.0*g(1,nyh+1,j,l)
      g(2,1,j,l) = .5*(g(2,1,j,l) + g(2,ny+1,j,l))
      g(2,ny+1,j,l) = sum1
      g(3,1,j,l) = 0.0
      g(3,nyh+1,j,l) = 2.0*g(3,nyh+1,j,l)
   20 continue
   30 continue
c bit-reverse array elements in y
      nry = nxhy/nyh
      kxpt = kxps
      do 70 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 60 k = 1, nyh
      k1 = (mixup(k) - 1)/nry + 1
      if (k.ge.k1) go to 60
      do 50 j = kxpi, kxpt
      do 40 jj = 1, 3
      t2 = g(jj,2*k1-1,j,l)
      t3 = g(jj,2*k1,j,l)
      g(jj,2*k1-1,j,l) = g(jj,2*k-1,j,l)
      g(jj,2*k1,j,l) = g(jj,2*k,j,l)
      g(jj,2*k-1,j,l) = t2
      g(jj,2*k,j,l) = t3
   40 continue
   50 continue
   60 continue
   70 continue
c first transform in y
      nry = nxy/nyh
      do 130 m = 1, indy1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nyhh/ns
      kmr = 2*km*nry
      kxpt = kxps
      do 120 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 110 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 100 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 90 i = kxpi, kxpt
      do 80 jj = 1, 3
      t2 = real(t1)*g(jj,2*j2-1,i,l) - aimag(t1)*g(jj,2*j2,i,l)
      t3 = aimag(t1)*g(jj,2*j2-1,i,l) + real(t1)*g(jj,2*j2,i,l)
      g(jj,2*j2-1,i,l) = g(jj,2*j1-1,i,l) - t2
      g(jj,2*j2,i,l) = g(jj,2*j1,i,l) - t3
      g(jj,2*j1-1,i,l) = g(jj,2*j1-1,i,l) + t2
      g(jj,2*j1,i,l) = g(jj,2*j1,i,l) + t3
   80 continue
   90 continue
  100 continue
  110 continue
  120 continue
  130 continue
c unscramble coefficients and normalize
c inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nyh
         ani = 0.5
         kxpt = kxps
         do 190 l = 1, jblok
         if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
         do 160 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 150 j = kxpi, kxpt
         do 140 jj = 1, 3
         t4 = g(jj,ny3-2*k,j,l)
         t5 = -g(jj,ny3-2*k+1,j,l)
         t2 = g(jj,2*k-1,j,l) + t4
         t3 = g(jj,2*k,j,l) + t5
         t6 = g(jj,2*k-1,j,l) - t4
         t5 = g(jj,2*k,j,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,j,l) = ani*(t2 + t4)
         g(jj,2*k,j,l) = ani*(t3 + t5)
         g(jj,ny3-2*k,j,l) = ani*(t2 - t4)
         g(jj,ny3-2*k+1,j,l) = ani*(t5 - t3)
  140    continue
  150    continue
  160    continue
         do 180 j = kxpi, kxpt
         do 170 jj = 1, 3
         g(jj,nyh+1,j,l) = g(jj,nyh+1,j,l)
         g(jj,nyh+2,j,l) = -g(jj,nyh+2,j,l)
         t2 = g(jj,1,j,l) + g(jj,2,j,l)
         g(jj,2,j,l) = g(jj,1,j,l) - g(jj,2,j,l)
         g(jj,1,j,l) = t2
         g(jj,ny+1,j,l) = g(jj,ny+1,j,l)
  170    continue
  180    continue
  190    continue
c forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nyh
         kxpt = kxps
         do 250 l = 1, jblok
         if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
         do 220 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 210 j = kxpi, kxpt
         do 200 jj = 1, 3
         t4 = g(jj,ny3-2*k,j,l)
         t5 = -g(jj,ny3-2*k+1,j,l)
         t2 = g(jj,2*k-1,j,l) + t4
         t3 = g(jj,2*k,j,l) + t5
         t6 = g(jj,2*k-1,j,l) - t4
         t5 = g(jj,2*k,j,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,j,l) = t2 + t4
         g(jj,2*k,j,l) = t3 + t5
         g(jj,ny3-2*k,j,l) = t2 - t4
         g(jj,ny3-2*k+1,j,l) = t5 - t3
  200    continue
  210    continue
  220    continue
         do 240 j = kxpi, kxpt
         do 230 jj = 1, 3
         g(jj,nyh+1,j,l) = 2.0*g(jj,nyh+1,j,l)
         g(jj,nyh+2,j,l) = -2.0*g(jj,nyh+2,j,l)
         t2 = 2.0*(g(jj,1,j,l) + g(jj,2,j,l))
         g(jj,2,j,l) = 2.0*(g(jj,1,j,l) - g(jj,2,j,l))
         g(jj,1,j,l) = t2
         g(jj,ny+1,j,l) = 2.0*g(jj,ny+1,j,l)
  230    continue
  240    continue
  250    continue
      endif
c perform recursion for sine-cosine transform
      kxpt = kxps
      do 280 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 270 j = kxpi, kxpt
      sum1 = .5*g(1,1,j,l)
      g(1,1,j,l) = 0.0
      g(1,2,j,l) = sum1
      sum2 = g(2,ny+1,j,l)
      g(2,ny+1,j,l) = g(2,2,j,l)
      g(2,2,j,l) = sum2
      sum3 = .5*g(3,1,j,l)
      g(3,1,j,l) = 0.0
      g(3,2,j,l) = sum3
      do 260 k = 2, nyh
      sum1 = sum1 + g(1,2*k-1,j,l)
      g(1,2*k-1,j,l) = -g(1,2*k,j,l)
      g(1,2*k,j,l) = sum1
      sum2 = sum2 - g(2,2*k,j,l)
      g(2,2*k,j,l) = sum2
      sum3 = sum3 + g(3,2*k-1,j,l)
      g(3,2*k-1,j,l) = -g(3,2*k,j,l)
      g(3,2*k,j,l) = sum3
  260 continue
      g(1,ny+1,j,l) = 0.0
      g(3,ny+1,j,l) = 0.0
  270 continue
  280 continue
      return
      end
c-----------------------------------------------------------------------
      subroutine PFCSCT2R3Y(g,isign,mixup,sctd,indx,indy,kstrt,kxp,kxpi,
     1kxpp,nyv,kxpd,jblok,nxhyd,nxyd)
c this subroutine performs the y part of 3 two dimensional fast real
c sine and cosine transforms and their inverses, for a subset of x,
c using real arithmetic, for data which is distributed in blocks
c algorithm is described in Numerical Recipies in Fortran, Second Ed.,
c by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
c [Cambridge Univ. Press, 1992], p. 508.
c for isign = (-1,1), input: all, output: f
c approximate flop count: N*(5*log2(N) + 18)/nvp
c where N = (nx/2)*ny
c indx/indy = exponent which determines length in x/y direction,
c where nx=2**indx, ny=2**indy
c if isign = -1, inverse sine-cosine transform are performed
c g(1,m,n,i) = (.5*g(1,1,n,i) + ((-1)**m)*g(1,ny+1,n,i)
c              + sum(g(1,k,n,i)*cos(pi*m*k/ny))
c g(2,m,n,i) = sum(g(2,k,n,i)*sin(pi*m*k/ny))
c g(3,m,n,i) = (.5*g(3,1,n,i) + ((-1)**m)*g(3,ny+1,n,i)
c              + sum(g(3,k,n,i)*cos(pi*m*k/ny))
c if isign = 1, a forward sine-cosine transforms are performed
c g(1,k,n,i) = 2*(.5*g(1,1,n,i) + ((-1)**m)*g(1,ny+1,n,i)
c              + sum(g(1,m,n,i)*cos(pi*m*k/ny))
c g(2,k,n,i) = sum(g(2,m,n,i)*sin(pi*m*k/ny))
c g(3,k,n,i) = 2*(.5*g(3,1,n,i) + ((-1)**m)*g(3,ny+1,n,i)
c              + sum(g(3,m,n,i)*cos(pi*m*k/ny))
c mixup = array of bit reversed addresses
c sctd = sine/cosine table
c kstrt = starting data block number
c kxp = number of data values per block in x
c kxpi = initial x index used
c kxpp = number of x indices used
c nyv = first dimension of g >= ny + 1
c kxpd = second dimension of g >= kxp + 1
c jblok = number of data blocks in x
c nxhyd = maximum of (nx/2,ny)
c nxyd = maximum of (nx,ny)
c written by viktor k. decyk, ucla
      implicit none
      integer isign, mixup, indx, indy, kstrt, kxp, kxpi, kxpp
      integer nyv, kxpd, jblok, nxhyd, nxyd
      real g
      complex sctd
      dimension g(3,nyv,kxpd,jblok)
      dimension mixup(nxhyd), sctd(nxyd)
c local data
      integer indx1, indy1, indx1y, nx, ny, nyh, nyhh, ny3, nxy, nxhy
      integer i, j, k, l, m, ks, km, kmr, nry, j1, j2, ns, ns2, k1, k2
      integer kxps, kxpt, jj
      real at1, at2, at3, t2, t3, t4, t5, t6, ani, sum1, sum2, sum3
      complex t1
      indx1 = indx - 1
      indy1 = indy - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      ny = 2**indy
      nyh = ny/2
      nyhh = ny/4
      ny3 = ny + 3
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 2
      kxps = kxpi + kxpp - 1
      if (kstrt.gt.nx) return
      if (isign.eq.0) return
c create auxiliary array in y
      kmr = nxy/ny
      kxpt = kxps
      do 30 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 20 j = kxpi, kxpt
      sum1 = .5*(g(1,1,j,l) - g(1,ny+1,j,l))
      sum2 = .5*(g(3,1,j,l) - g(3,ny+1,j,l))
      do 10 k = 2, nyh
      k1 = 1 + kmr*(k - 1)
      at3 = -aimag(sctd(k1))
      at2 = g(1,ny+2-k,j,l)
      at1 = g(1,k,j,l) + at2
      at2 = g(1,k,j,l) - at2
      sum1 = sum1 + real(sctd(k1))*at2
      at2 = at3*at2
      at1 = .5*at1
      g(1,k,j,l) = at1 - at2
      g(1,ny+2-k,j,l) = at1 + at2
      at2 = g(2,ny+2-k,j,l)
      at1 = g(2,k,j,l) + at2
      at2 = g(2,k,j,l) - at2
      at1 = at3*at1
      at2 = .5*at2
      g(2,k,j,l) = at1 + at2
      g(2,ny+2-k,j,l) = at1 - at2
      at2 = g(3,ny+2-k,j,l)
      at1 = g(3,k,j,l) + at2
      at2 = g(3,k,j,l) - at2
      sum2 = sum2 + real(sctd(k1))*at2
      at2 = at3*at2
      at1 = .5*at1
      g(3,k,j,l) = at1 - at2
      g(3,ny+2-k,j,l) = at1 + at2
   10 continue
      g(1,1,j,l) = .5*(g(1,1,j,l) + g(1,ny+1,j,l))
      g(1,ny+1,j,l) = sum1
      g(2,1,j,l) = 0.0
      g(2,nyh+1,j,l) = 2.0*g(2,nyh+1,j,l)
      g(3,1,j,l) = .5*(g(3,1,j,l) + g(3,ny+1,j,l))
      g(3,ny+1,j,l) = sum2
   20 continue
   30 continue
c bit-reverse array elements in y
      nry = nxhy/nyh
      kxpt = kxps
      do 70 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 60 k = 1, nyh
      k1 = (mixup(k) - 1)/nry + 1
      if (k.ge.k1) go to 60
      do 50 j = kxpi, kxpt
      do 40 jj = 1, 3
      t2 = g(jj,2*k1-1,j,l)
      t3 = g(jj,2*k1,j,l)
      g(jj,2*k1-1,j,l) = g(jj,2*k-1,j,l)
      g(jj,2*k1,j,l) = g(jj,2*k,j,l)
      g(jj,2*k-1,j,l) = t2
      g(jj,2*k,j,l) = t3
   40 continue
   50 continue
   60 continue
   70 continue
c first transform in y
      nry = nxy/nyh
      do 130 m = 1, indy1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nyhh/ns
      kmr = 2*km*nry
      kxpt = kxps
      do 120 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 110 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 100 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 90 i = kxpi, kxpt
      do 80 jj = 1, 3
      t2 = real(t1)*g(jj,2*j2-1,i,l) - aimag(t1)*g(jj,2*j2,i,l)
      t3 = aimag(t1)*g(jj,2*j2-1,i,l) + real(t1)*g(jj,2*j2,i,l)
      g(jj,2*j2-1,i,l) = g(jj,2*j1-1,i,l) - t2
      g(jj,2*j2,i,l) = g(jj,2*j1,i,l) - t3
      g(jj,2*j1-1,i,l) = g(jj,2*j1-1,i,l) + t2
      g(jj,2*j1,i,l) = g(jj,2*j1,i,l) + t3
   80 continue
   90 continue
  100 continue
  110 continue
  120 continue
  130 continue
c unscramble coefficients and normalize
c inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nyh
         ani = 0.5
         kxpt = kxps
         do 190 l = 1, jblok
         if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
         do 160 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 150 j = kxpi, kxpt
         do 140 jj = 1, 3
         t4 = g(jj,ny3-2*k,j,l)
         t5 = -g(jj,ny3-2*k+1,j,l)
         t2 = g(jj,2*k-1,j,l) + t4
         t3 = g(jj,2*k,j,l) + t5
         t6 = g(jj,2*k-1,j,l) - t4
         t5 = g(jj,2*k,j,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,j,l) = ani*(t2 + t4)
         g(jj,2*k,j,l) = ani*(t3 + t5)
         g(jj,ny3-2*k,j,l) = ani*(t2 - t4)
         g(jj,ny3-2*k+1,j,l) = ani*(t5 - t3)
  140    continue
  150    continue
  160    continue
         do 180 j = kxpi, kxpt
         do 170 jj = 1, 3
         g(jj,nyh+1,j,l) = g(jj,nyh+1,j,l)
         g(jj,nyh+2,j,l) = -g(jj,nyh+2,j,l)
         t2 = g(jj,1,j,l) + g(jj,2,j,l)
         g(jj,2,j,l) = g(jj,1,j,l) - g(jj,2,j,l)
         g(jj,1,j,l) = t2
         g(jj,ny+1,j,l) = g(jj,ny+1,j,l)
  170    continue
  180    continue
  190    continue
c forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nyh
         kxpt = kxps
         do 250 l = 1, jblok
         if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
         do 220 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 210 j = kxpi, kxpt
         do 200 jj = 1, 3
         t4 = g(jj,ny3-2*k,j,l)
         t5 = -g(jj,ny3-2*k+1,j,l)
         t2 = g(jj,2*k-1,j,l) + t4
         t3 = g(jj,2*k,j,l) + t5
         t6 = g(jj,2*k-1,j,l) - t4
         t5 = g(jj,2*k,j,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,j,l) = t2 + t4
         g(jj,2*k,j,l) = t3 + t5
         g(jj,ny3-2*k,j,l) = t2 - t4
         g(jj,ny3-2*k+1,j,l) = t5 - t3
  200    continue
  210    continue
  220    continue
         do 240 j = kxpi, kxpt
         do 230 jj = 1, 3
         g(jj,nyh+1,j,l) = 2.0*g(jj,nyh+1,j,l)
         g(jj,nyh+2,j,l) = -2.0*g(jj,nyh+2,j,l)
         t2 = 2.0*(g(jj,1,j,l) + g(jj,2,j,l))
         g(jj,2,j,l) = 2.0*(g(jj,1,j,l) - g(jj,2,j,l))
         g(jj,1,j,l) = t2
         g(jj,ny+1,j,l) = 2.0*g(jj,ny+1,j,l)
  230    continue
  240    continue
  250    continue
      endif
c perform recursion for sine-cosine transform
      kxpt = kxps
      do 280 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 270 j = kxpi, kxpt
      sum1 = g(1,ny+1,j,l)
      g(1,ny+1,j,l) = g(1,2,j,l)
      g(1,2,j,l) = sum1
      sum2 = .5*g(2,1,j,l)
      g(2,1,j,l) = 0.0
      g(2,2,j,l) = sum2
      sum3 = g(3,ny+1,j,l)
      g(3,ny+1,j,l) = g(3,2,j,l)
      g(3,2,j,l) = sum3
      do 260 k = 2, nyh
      sum1 = sum1 - g(1,2*k,j,l)
      g(1,2*k,j,l) = sum1
      sum2 = sum2 + g(2,2*k-1,j,l)
      g(2,2*k-1,j,l) = -g(2,2*k,j,l)
      g(2,2*k,j,l) = sum2
      sum3 = sum3 - g(3,2*k,j,l)
      g(3,2*k,j,l) = sum3
  260 continue
      g(2,ny+1,j,l) = 0.0
  270 continue
  280 continue
      return
      end
c-----------------------------------------------------------------------
      subroutine PFSSCT2R3Y(g,isign,mixup,sctd,indx,indy,kstrt,kxp,kxpi,
     1kxpp,nyv,kxpd,jblok,nxhyd,nxyd)
c this subroutine performs the y part of 3 two dimensional fast real
c sine and cosine transforms and their inverses, for a subset of x,
c using real arithmetic, for data which is distributed in blocks
c algorithm is described in Numerical Recipies in Fortran, Second Ed.,
c by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
c [Cambridge Univ. Press, 1992], p. 508.
c for isign = (-1,1), input: all, output: f
c approximate flop count: N*(5*log2(N) + 18)/nvp
c where N = (nx/2)*ny
c indx/indy = exponent which determines length in x/y direction,
c where nx=2**indx, ny=2**indy
c if isign = -1, inverse sine-cosine transform are performed
c g(1,m,n,i) = (.5*g(1,1,n,i) + ((-1)**m)*g(1,ny+1,n,i)
c              + sum(g(1,k,n,i)*cos(pi*m*k/ny))
c g(2,m,n,i) = sum(g(2,k,n,i)*sin(pi*m*k/ny))
c g(3,m,n,i) = (.5*g(3,1,n,i) + ((-1)**m)*g(3,ny+1,n,i)
c              + sum(g(3,k,n,i)*cos(pi*m*k/ny))
c if isign = 1, a forward sine-cosine transforms are performed
c g(1,k,n,i) = 2*(.5*g(1,1,n,i) + ((-1)**m)*g(1,ny+1,n,i)
c              + sum(g(1,m,n,i)*cos(pi*m*k/ny))
c g(2,k,n,i) = sum(g(2,m,n,i)*sin(pi*m*k/ny))
c g(3,k,n,i) = 2*(.5*g(3,1,n,i) + ((-1)**m)*g(3,ny+1,n,i)
c              + sum(g(3,m,n,i)*cos(pi*m*k/ny))
c mixup = array of bit reversed addresses
c sctd = sine/cosine table
c kstrt = starting data block number
c kxp = number of data values per block in x
c kxpi = initial x index used
c kxpp = number of x indices used
c nyv = first dimension of g >= ny + 1
c kxpd = second dimension of g >= kxp + 1
c jblok = number of data blocks in x
c nxhyd = maximum of (nx/2,ny)
c nxyd = maximum of (nx,ny)
c written by viktor k. decyk, ucla
      implicit none
      integer isign, mixup, indx, indy, kstrt, kxp, kxpi, kxpp
      integer nyv, kxpd, jblok, nxhyd, nxyd
      real g
      complex sctd
      dimension g(3,nyv,kxpd,jblok)
      dimension mixup(nxhyd), sctd(nxyd)
c local data
      integer indx1, indy1, indx1y, nx, ny, nyh, nyhh, ny3, nxy, nxhy
      integer i, j, k, l, m, ks, km, kmr, nry, j1, j2, ns, ns2, k1, k2
      integer kxps, kxpt, jj
      real at1, at2, at3, t2, t3, t4, t5, t6, ani, sum1, sum2, sum3
      complex t1
      indx1 = indx - 1
      indy1 = indy - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      ny = 2**indy
      nyh = ny/2
      nyhh = ny/4
      ny3 = ny + 3
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 2
      kxps = kxpi + kxpp - 1
      if (kstrt.gt.nx) return
      if (isign.eq.0) return
c create auxiliary array in y
      kmr = nxy/ny
      kxpt = kxps
      do 30 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 20 j = kxpi, kxpt
      sum2 = .5*(g(3,1,j,l) - g(3,ny+1,j,l))
      do 10 k = 2, nyh
      k1 = 1 + kmr*(k - 1)
      at3 = -aimag(sctd(k1))
      at2 = g(1,ny+2-k,j,l)
      at1 = g(1,k,j,l) + at2
      at2 = g(1,k,j,l) - at2
      at1 = at3*at1
      at2 = .5*at2
      g(1,k,j,l) = at1 + at2
      g(1,ny+2-k,j,l) = at1 - at2
      at2 = g(2,ny+2-k,j,l)
      at1 = g(2,k,j,l) + at2
      at2 = g(2,k,j,l) - at2
      at1 = at3*at1
      at2 = .5*at2
      g(2,k,j,l) = at1 + at2
      g(2,ny+2-k,j,l) = at1 - at2
      at2 = g(3,ny+2-k,j,l)
      at1 = g(3,k,j,l) + at2
      at2 = g(3,k,j,l) - at2
      sum2 = sum2 + real(sctd(k1))*at2
      at2 = at3*at2
      at1 = .5*at1
      g(3,k,j,l) = at1 - at2
      g(3,ny+2-k,j,l) = at1 + at2
   10 continue
      g(1,1,j,l) = 0.0
      g(1,nyh+1,j,l) = 2.0*g(1,nyh+1,j,l)
      g(2,1,j,l) = 0.0
      g(2,nyh+1,j,l) = 2.0*g(2,nyh+1,j,l)
      g(3,1,j,l) = .5*(g(3,1,j,l) + g(3,ny+1,j,l))
      g(3,ny+1,j,l) = sum2
   20 continue
   30 continue
c bit-reverse array elements in y
      nry = nxhy/nyh
      kxpt = kxps
      do 70 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 60 k = 1, nyh
      k1 = (mixup(k) - 1)/nry + 1
      if (k.ge.k1) go to 60
      do 50 j = kxpi, kxpt
      do 40 jj = 1, 3
      t2 = g(jj,2*k1-1,j,l)
      t3 = g(jj,2*k1,j,l)
      g(jj,2*k1-1,j,l) = g(jj,2*k-1,j,l)
      g(jj,2*k1,j,l) = g(jj,2*k,j,l)
      g(jj,2*k-1,j,l) = t2
      g(jj,2*k,j,l) = t3
   40 continue
   50 continue
   60 continue
   70 continue
c first transform in y
      nry = nxy/nyh
      do 130 m = 1, indy1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nyhh/ns
      kmr = 2*km*nry
      kxpt = kxps
      do 120 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 110 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 100 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 90 i = kxpi, kxpt
      do 80 jj = 1, 3
      t2 = real(t1)*g(jj,2*j2-1,i,l) - aimag(t1)*g(jj,2*j2,i,l)
      t3 = aimag(t1)*g(jj,2*j2-1,i,l) + real(t1)*g(jj,2*j2,i,l)
      g(jj,2*j2-1,i,l) = g(jj,2*j1-1,i,l) - t2
      g(jj,2*j2,i,l) = g(jj,2*j1,i,l) - t3
      g(jj,2*j1-1,i,l) = g(jj,2*j1-1,i,l) + t2
      g(jj,2*j1,i,l) = g(jj,2*j1,i,l) + t3
   80 continue
   90 continue
  100 continue
  110 continue
  120 continue
  130 continue
c unscramble coefficients and normalize
c inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nyh
         ani = 0.5
         kxpt = kxps
         do 190 l = 1, jblok
         if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
         do 160 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 150 j = kxpi, kxpt
         do 140 jj = 1, 3
         t4 = g(jj,ny3-2*k,j,l)
         t5 = -g(jj,ny3-2*k+1,j,l)
         t2 = g(jj,2*k-1,j,l) + t4
         t3 = g(jj,2*k,j,l) + t5
         t6 = g(jj,2*k-1,j,l) - t4
         t5 = g(jj,2*k,j,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,j,l) = ani*(t2 + t4)
         g(jj,2*k,j,l) = ani*(t3 + t5)
         g(jj,ny3-2*k,j,l) = ani*(t2 - t4)
         g(jj,ny3-2*k+1,j,l) = ani*(t5 - t3)
  140    continue
  150    continue
  160    continue
         do 180 j = kxpi, kxpt
         do 170 jj = 1, 3
         g(jj,nyh+1,j,l) = g(jj,nyh+1,j,l)
         g(jj,nyh+2,j,l) = -g(jj,nyh+2,j,l)
         t2 = g(jj,1,j,l) + g(jj,2,j,l)
         g(jj,2,j,l) = g(jj,1,j,l) - g(jj,2,j,l)
         g(jj,1,j,l) = t2
         g(jj,ny+1,j,l) = g(jj,ny+1,j,l)
  170    continue
  180    continue
  190    continue
c forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nyh
         kxpt = kxps
         do 250 l = 1, jblok
         if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
         do 220 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 210 j = kxpi, kxpt
         do 200 jj = 1, 3
         t4 = g(jj,ny3-2*k,j,l)
         t5 = -g(jj,ny3-2*k+1,j,l)
         t2 = g(jj,2*k-1,j,l) + t4
         t3 = g(jj,2*k,j,l) + t5
         t6 = g(jj,2*k-1,j,l) - t4
         t5 = g(jj,2*k,j,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,j,l) = t2 + t4
         g(jj,2*k,j,l) = t3 + t5
         g(jj,ny3-2*k,j,l) = t2 - t4
         g(jj,ny3-2*k+1,j,l) = t5 - t3
  200    continue
  210    continue
  220    continue
         do 240 j = kxpi, kxpt
         do 230 jj = 1, 3
         g(jj,nyh+1,j,l) = 2.0*g(jj,nyh+1,j,l)
         g(jj,nyh+2,j,l) = -2.0*g(jj,nyh+2,j,l)
         t2 = 2.0*(g(jj,1,j,l) + g(jj,2,j,l))
         g(jj,2,j,l) = 2.0*(g(jj,1,j,l) - g(jj,2,j,l))
         g(jj,1,j,l) = t2
         g(jj,ny+1,j,l) = 2.0*g(jj,ny+1,j,l)
  230    continue
  240    continue
  250    continue
      endif
c perform recursion for sine-cosine transform
      kxpt = kxps
      do 280 l = 1, jblok
      if ((kxps+kxp*(l+ks)).eq.nx) kxpt = kxps + 1
      do 270 j = kxpi, kxpt
      sum1 = .5*g(1,1,j,l)
      g(1,1,j,l) = 0.0
      g(1,2,j,l) = sum1
      sum2 = .5*g(2,1,j,l)
      g(2,1,j,l) = 0.0
      g(2,2,j,l) = sum2
      sum3 = g(3,ny+1,j,l)
      g(3,ny+1,j,l) = g(3,2,j,l)
      g(3,2,j,l) = sum3
      do 260 k = 2, nyh
      sum1 = sum1 + g(1,2*k-1,j,l)
      g(1,2*k-1,j,l) = -g(1,2*k,j,l)
      g(1,2*k,j,l) = sum1
      sum2 = sum2 + g(2,2*k-1,j,l)
      g(2,2*k-1,j,l) = -g(2,2*k,j,l)
      g(2,2*k,j,l) = sum2
      sum3 = sum3 - g(3,2*k,j,l)
      g(3,2*k,j,l) = sum3
  260 continue
      g(1,ny+1,j,l) = 0.0
      g(2,ny+1,j,l) = 0.0
  270 continue
  280 continue
      return
      end
c-----------------------------------------------------------------------
      subroutine PFSSCT2R3X(f,isign,mixup,sctd,indx,indy,kstrt,kyp,kypi,
     1kypp,nxvh,kypd,kblok,nxhyd,nxyd)
c this subroutine performs the x part of 3 two dimensional fast real
c sine and cosine transforms and their inverses, for a subset of y,
c using real arithmetic, for data which is distributed in blocks
c algorithm is described in Numerical Recipies in Fortran, Second Ed.,
c by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
c [Cambridge Univ. Press, 1992], p. 508.
c for isign = (-1,1), input: all, output: f
c approximate flop count: N*(5*log2(N) + 18)/nvp
c where N = (nx/2)*ny
c indx/indy = exponent which determines length in x/y direction,
c where nx=2**indx, ny=2**indy
c if isign = -1, inverse sine-cosine transforms are performed
c f(1,n,k,i) = (1/nx*ny)*sum(f(1,j,k,i)*sin(pi*n*j/nx))
c f(2:3,n,k,i) = (1/nx*ny)*(.5*f(2:3,1,k,i) + ((-1)**n)*f(2:3,nx+1,k,i)
c              + sum(f(2:3,j,k,i)*cos(pi*n*j/nx)))
c if isign = 1, forward sine transforms are performed
c f(1,j,k,i) = sum(f(1,n,k,i)*sin(pi*n*j/nx))
c f(2:3,j,k,i) = 2*(.5*f(2:3,1,k,i) + ((-1)**j)*f(2:3,n+1,k,i)
c              + sum(f(2:3,n,k,i)*cos(pi*n*j/nx))
c mixup = array of bit reversed addresses
c sctd = sine/cosine table
c kstrt = starting data block number
c kyp = number of data values per block in y
c kypi = initial y index used
c kypp = number of y indices used
c nxvh = first dimension of f >= nx/2 + 1
c kypd = second dimension of f >= kyp + 1
c kblok = number of data blocks in y
c nxhyd = maximum of (nx/2,ny)
c nxyd = maximum of (nx,ny)
c written by viktor k. decyk, ucla
      implicit none
      integer isign, mixup, indx, indy, kstrt, kyp, kypi, kypp
      integer nxvh, kypd, kblok, nxhyd, nxyd
      real f
      complex sctd
      dimension f(3,2*nxvh,kypd,kblok)
      dimension mixup(nxhyd), sctd(nxyd)
c local data
      integer indx1, indx1y, nx, nxh, nxhh, nx3, ny, nxy, nxhy, ks, kypt
      integer i, j, k, l, m, km, kmr, nrx, j1, j2, ns, ns2, k1, k2, kyps
      integer jj
      real at1, at2, at3, t2, t3, t4, t5, t6, ani, sum1, sum2, sum3
      complex t1
      indx1 = indx - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      nxh = nx/2
      nxhh = nx/4
      nx3 = nx + 3
      ny = 2**indy
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 2
      kyps = kypi + kypp - 1
      if (kstrt.gt.ny) return
      if (isign.eq.0) return
c create auxiliary array in x
      kmr = nxy/nx
      kypt = kyps
      do 30 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 20 k = kypi, kypt
      sum2 = .5*(f(3,1,k,l) - f(3,nx+1,k,l))
      do 10 j = 2, nxh
      j1 = 1 + kmr*(j - 1)
      at3 = -aimag(sctd(j1))
      at2 = f(1,nx+2-j,k,l)
      at1 = f(1,j,k,l) + at2
      at2 = f(1,j,k,l) - at2
      at1 = at3*at1
      at2 = .5*at2
      f(1,j,k,l) = at1 + at2
      f(1,nx+2-j,k,l) = at1 - at2
      at2 = f(2,nx+2-j,k,l)
      at1 = f(2,j,k,l) + at2
      at2 = f(2,j,k,l) - at2
      at1 = at3*at1
      at2 = .5*at2
      f(2,j,k,l) = at1 + at2
      f(2,nx+2-j,k,l) = at1 - at2
      at2 = f(3,nx+2-j,k,l)
      at1 = f(3,j,k,l) + at2
      at2 = f(3,j,k,l) - at2
      sum2 = sum2 + real(sctd(j1))*at2
      at2 = at3*at2
      at1 = .5*at1
      f(3,j,k,l) = at1 - at2
      f(3,nx+2-j,k,l) = at1 + at2
   10 continue
      f(1,1,k,l) = 0.0
      f(1,nxh+1,k,l) = 2.0*f(1,nxh+1,k,l)
      f(2,1,k,l) = 0.0
      f(2,nxh+1,k,l) = 2.0*f(2,nxh+1,k,l)
      f(3,1,k,l) = .5*(f(3,1,k,l) + f(3,nx+1,k,l))
      f(3,nx+1,k,l) = sum2
   20 continue
   30 continue
c bit-reverse array elements in x
      nrx = nxhy/nxh
      kypt = kyps
      do 70 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 60 j = 1, nxh
      j1 = (mixup(j) - 1)/nrx + 1
      if (j.ge.j1) go to 60
      do 50 k = kypi, kypt
      do 40 jj = 1, 3
      t2 = f(jj,2*j1-1,k,l)
      t3 = f(jj,2*j1,k,l)
      f(jj,2*j1-1,k,l) = f(jj,2*j-1,k,l)
      f(jj,2*j1,k,l) = f(jj,2*j,k,l)
      f(jj,2*j-1,k,l) = t2
      f(jj,2*j,k,l) = t3
   40 continue
   50 continue
   60 continue
   70 continue
c first transform in x
      nrx = nxy/nxh
      do 130 m = 1, indx1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nxhh/ns
      kmr = 2*km*nrx
      kypt = kyps
      do 120 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 110 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 100 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 90 i = kypi, kypt
      do 80 jj = 1, 3
      t2 = real(t1)*f(jj,2*j2-1,i,l) - aimag(t1)*f(jj,2*j2,i,l)
      t3 = aimag(t1)*f(jj,2*j2-1,i,l) + real(t1)*f(jj,2*j2,i,l)
      f(jj,2*j2-1,i,l) = f(jj,2*j1-1,i,l) - t2
      f(jj,2*j2,i,l) = f(jj,2*j1,i,l) - t3
      f(jj,2*j1-1,i,l) = f(jj,2*j1-1,i,l) + t2
      f(jj,2*j1,i,l) = f(jj,2*j1,i,l) + t3
   80 continue
   90 continue
  100 continue
  110 continue
  120 continue
  130 continue
c unscramble coefficients and normalize
c inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nxh
         ani = 1./float(2*nx*ny)
         kypt = kyps
         do 190 l = 1, kblok
         if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
         do 160 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 150 k = kypi, kypt
         do 140 jj = 1, 3
         t4 = f(jj,nx3-2*j,k,l)
         t5 = -f(jj,nx3-2*j+1,k,l)
         t2 = f(jj,2*j-1,k,l) + t4
         t3 = f(jj,2*j,k,l) + t5
         t6 = f(jj,2*j-1,k,l) - t4
         t5 = f(jj,2*j,k,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,k,l) = ani*(t2 + t4)
         f(jj,2*j,k,l) = ani*(t3 + t5)
         f(jj,nx3-2*j,k,l) = ani*(t2 - t4)
         f(jj,nx3-2*j+1,k,l) = ani*(t5 - t3)
  140    continue
  150    continue
  160    continue
         ani = 2.*ani
         do 180 k = kypi, kypt
         do 170 jj = 1, 3
         f(jj,nxh+1,k,l) = ani*f(jj,nxh+1,k,l)
         f(jj,nxh+2,k,l) = -ani*f(jj,nxh+2,k,l)
         t2 = ani*(f(jj,1,k,l) + f(jj,2,k,l))
         f(jj,2,k,l) = ani*(f(jj,1,k,l) - f(jj,2,k,l))
         f(jj,1,k,l) = t2
         f(jj,nx+1,k,l) = ani*f(jj,nx+1,k,l)
  170    continue
  180    continue
  190    continue
c forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nxh
         kypt = kyps
         do 250 l = 1, kblok
         if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
         do 220 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 210 k = kypi, kypt
         do 200 jj = 1, 3
         t4 = f(jj,nx3-2*j,k,l)
         t5 = -f(jj,nx3-2*j+1,k,l)
         t2 = f(jj,2*j-1,k,l) + t4
         t3 = f(jj,2*j,k,l) + t5
         t6 = f(jj,2*j-1,k,l) - t4
         t5 = f(jj,2*j,k,l) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,k,l) = t2 + t4
         f(jj,2*j,k,l) = t3 + t5
         f(jj,nx3-2*j,k,l) = t2 - t4
         f(jj,nx3-2*j+1,k,l) = t5 - t3
  200    continue
  210    continue
  220    continue
         do 240 k = kypi, kypt
         do 230 jj = 1, 3
         f(jj,nxh+1,k,l) = 2.0*f(jj,nxh+1,k,l)
         f(jj,nxh+2,k,l) = -2.0*f(jj,nxh+2,k,l)
         t2 = 2.0*(f(jj,1,k,l) + f(jj,2,k,l))
         f(jj,2,k,l) = 2.0*(f(jj,1,k,l) - f(jj,2,k,l))
         f(jj,1,k,l) = t2
         f(jj,nx+1,k,l) = 2.0*f(jj,nx+1,k,l)
  230    continue
  240    continue
  250    continue
      endif
c perform recursion for cosine-sine transform
      kypt = kyps
      do 280 l = 1, kblok
      if ((kyps+kyp*(l+ks)).eq.ny) kypt = kyps + 1
      do 270 k = kypi, kypt
      sum1 = .5*f(1,1,k,l)
      f(1,1,k,l) = 0.0
      f(1,2,k,l) = sum1
      sum2 = .5*f(2,1,k,l)
      f(2,1,k,l) = 0.0
      f(2,2,k,l) = sum2
      sum3 = f(3,nx+1,k,l)
      f(3,nx+1,k,l) = f(3,2,k,l)
      f(3,2,k,l) = sum3
      do 260 j = 2, nxh
      sum1 = sum1 + f(1,2*j-1,k,l)
      f(1,2*j-1,k,l) = -f(1,2*j,k,l)
      f(1,2*j,k,l) = sum1
      sum2 = sum2 + f(2,2*j-1,k,l)
      f(2,2*j-1,k,l) = -f(2,2*j,k,l)
      f(2,2*j,k,l) = sum2
      sum3 = sum3 - f(3,2*j,k,l)
      f(3,2*j,k,l) = sum3
  260 continue
      f(1,nx+1,k,l) = 0.0
      f(2,nx+1,k,l) = 0.0
  270 continue
  280 continue
      return
      end
c-----------------------------------------------------------------------
      subroutine WPFS3T2R3(f,g,bs,br,isign,ntpose,mixup,sctd,ttp,indx,in
     1dy,kstrt,nxvh,nyv,kxp2,kyp,kypd,kxp2d,jblok,kblok,nxhyd,nxyd)
c wrapper function for 3 parallel real sine/sine transforms
c for the electric field with dirichlet or magnetic field with neumann
c boundary conditions
      implicit none
      integer isign, ntpose, mixup, indx, indy, kstrt, nxvh, nyv
      integer kxp2, kyp, kypd, kxp2d, jblok, kblok, nxhyd, nxyd
      real f, g, bs, br, ttp
      complex sctd
      dimension f(3,2*nxvh,kypd,kblok), g(3,nyv,kxp2d,jblok)
      dimension bs(3,kxp2+1,kyp+1,kblok), br(3,kxp2+1,kyp+1,jblok)
      dimension mixup(nxhyd), sctd(nxyd)
c local data
      integer nx, ny, kxpi, kypi
      real tf
      double precision dtime
      data kxpi, kypi /1,1/
c calculate range of indices
      nx = 2**indx
      ny = 2**indy
c inverse fourier transform
      if (isign.lt.0) then
c perform x cosine-sine transform
         call PFSSCT2R3X(f,isign,mixup,sctd,indx,indy,kstrt,kyp,kypi,kyp
     1,nxvh,kypd,kblok,nxhyd,nxyd)
c transpose f array to g
         call PWTIMERA(-1,ttp,dtime)
         call PR3TPOSE(f,g,bs,br,nx,ny,kstrt,2*nxvh,nyv,kxp2,kyp,kxp2d,k
     1ypd,jblok,kblok)
         call PWTIMERA(1,ttp,dtime)
c perform y sine-cosine transform
         call PFSSCT2R3Y(g,isign,mixup,sctd,indx,indy,kstrt,kxp2,kxpi,kx
     1p2,nyv,kxp2d,jblok,nxhyd,nxyd)
c transpose g array to f
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PR3TPOSE(g,f,br,bs,ny,nx,kstrt,nyv,2*nxvh,kyp,kxp2,kypd
     1,kxp2d,kblok,jblok)
            call PWTIMERA(1,tf,dtime)
         endif
c forward fourier transform
      else if (isign.gt.0) then
c transpose f array to g
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PR3TPOSE(f,g,bs,br,nx,ny,kstrt,2*nxvh,nyv,kxp2,kyp,kxp2
     1d,kypd,jblok,kblok)
            call PWTIMERA(1,tf,dtime)
         endif
c perform y sine-cosine transform
         call PFSSCT2R3Y(g,isign,mixup,sctd,indx,indy,kstrt,kxp2,kxpi,kx
     1p2,nyv,kxp2d,jblok,nxhyd,nxyd)
c transpose g array to f
         call PWTIMERA(-1,ttp,dtime)
         call PR3TPOSE(g,f,br,bs,ny,nx,kstrt,nyv,2*nxvh,kyp,kxp2,kypd,kx
     1p2d,kblok,jblok)
         call PWTIMERA(1,ttp,dtime)
c perform x cosine-sine transform
         call PFSSCT2R3X(f,isign,mixup,sctd,indx,indy,kstrt,kyp,kypi,kyp
     1,nxvh,kypd,kblok,nxhyd,nxyd)
      endif
      if (ntpose.eq.0) ttp = ttp + tf
      return
      end
c-----------------------------------------------------------------------
      subroutine PRTPOSE(f,g,s,t,nx,ny,kstrt,nxv,nyv,kxp,kyp,kxpd,kypd,j
     1blok,kblok)
c this subroutine performs a transpose of a matrix f, distributed in y,
c to a matrix g, distributed in x, that is,
c g(k+kyp*(m-1),j,l) = f(j+kxp*(l-1),k,m), where
c 1 <= j <= kxp, 1 <= k <= kyp, 1 <= l <= nx/kxp, 1 <= m <= ny/kyp
c and where indices l and m can be distributed across processors.
c includes an extra guard cell for last row and column
c this subroutine sends and receives one message at a time, either
c synchronously or asynchronously. it uses a minimum of system resources
c f = real input array
c g = real output array
c s, t = real scratch arrays
c nx/ny = number of points in x/y
c kstrt = starting data block number
c nxv = first dimension of f >= nx+1
c nyv = first dimension of g >= ny+1
c kypd = second dimension of f >= kyp+1
c kxpd = second dimension of g >= kxp+1
c kxp/kyp = number of data values per block in x/y
c jblok/kblok = number of data blocks in x/y
      implicit none
      integer nx, ny, kstrt, nxv, nyv, kxp, kyp, kxpd, kypd
      integer jblok, kblok
      real f, g, s, t
      dimension f(nxv,kypd,kblok), g(nyv,kxpd,jblok)
      dimension s(kxp+1,kyp+1,kblok), t(kxp+1,kyp+1,jblok)
c common block for parallel processing
      integer nproc, lgrp, lstat, mreal, mint, mcplx, mdouble, lworld
c lstat = length of status array
      parameter(lstat=10)
c lgrp = current communicator
c mreal = default datatype for reals
      common /PPARMS/ nproc, lgrp, mreal, mint, mcplx, mdouble, lworld
c local data
      integer ks, kxb, kyb, kxp1, kyp1, kxpt, kypt
      integer jkblok, kxym, mtr, ntr, mntr
      integer l, i, joff, koff, k, j
      integer ir0, is0, ii, ir, is, ierr, msid, istatus
      dimension istatus(lstat)
      ks = kstrt - 2
      kxb = nx/kxp
      kyb = ny/kyp
c set constants to receive extra guard cells
      kxp1 = kxp + 1
      kyp1 = kyp + 1
      kxpt = kxp
      if (kstrt.eq.kxb) kxpt = kxp1
c this segment is used for shared memory computers
c     if (kstrt.gt.nx) return
c     kypt = kyp
c     do 40 l = 1, jblok
c     joff = kxp*(l + ks)
c     if ((l+ks).eq.(kxb-1)) kxpt = kxp1
c     do 30 i = 1, kyb
c     koff = kyp*(i - 1)
c     if (i.eq.kyb) kypt = kyp1
c     do 20 k = 1, kypt
c     do 10 j = 1, kxpt
c     g(k+koff,j,l) = f(j+joff,k,i)
c  10 continue
c  20 continue
c  30 continue
c  40 continue
c this segment is used for mpi computers
      jkblok = max0(jblok,kblok)
      kxym = min0(kxb,kyb)
      mtr = kyb/kxym
      ntr = kxb/kxym
      mntr = max0(mtr,ntr)
      do 70 l = 1, jkblok
      do 60 i = 1, kxym
      ir0 = iand(kxym-1,ieor(l+ks,i-1)) + 1
      is0 = ir0
      do 50 ii = 1, mntr
c post receive
      if ((kstrt.le.nx).and.(ii.le.mtr)) then
         ir = ir0 + kxym*(ii - 1)
         kypt = kyp
         if (ir.eq.kyb) kypt = kyp1
         call MPI_IRECV(t(1,1,l),kxp1*kyp1,mreal,ir-1,ir+kxym+1,lgrp,msi
     1d,ierr)
      endif
c send data
      if ((kstrt.le.ny).and.(ii.le.ntr)) then
         is = is0 + kxym*(ii - 1)
         joff = kxp*(is - 1)
         do 20 k = 1, kyp1
         do 10 j = 1, kxp1
         s(j,k,l) = f(j+joff,k,l)
   10    continue
   20    continue
         call MPI_SEND(s(1,1,l),kxp1*kyp1,mreal,is-1,l+ks+kxym+2,lgrp,ie
     1rr)
      endif
c receive data
      if ((kstrt.le.nx).and.(ii.le.mtr)) then
         koff = kyp*(ir - 1)
         call MPI_WAIT(msid,istatus,ierr)
         do 40 k = 1, kypt
         do 30 j = 1, kxpt
         g(k+koff,j,l) = t(j,k,l)
   30    continue
   40    continue
      endif
   50 continue
   60 continue
   70 continue
      return
      end
c-----------------------------------------------------------------------
      subroutine PR2TPOSE(f,g,s,t,nx,ny,kstrt,nxv,nyv,kxp,kyp,kxpd,kypd,
     1jblok,kblok)
c this subroutine performs a transpose of a matrix f, distributed in y,
c to a matrix g, distributed in x, that is,
c g(1:2,k+kyp*(m-1),j,l) = f(1:2,j+kxp*(l-1),k,m), where
c 1 <= j <= kxp, 1 <= k <= kyp, 1 <= l <= nx/kxp, 1 <= m <= ny/kyp
c and where indices l and m can be distributed across processors.
c includes an extra guard cell for last row and column
c this subroutine sends and receives one message at a time, either
c synchronously or asynchronously. it uses a minimum of system resources
c f = real input array
c g = real output array
c s, t = real scratch arrays
c nx/ny = number of points in x/y
c kstrt = starting data block number
c nxv = first dimension of f >= nx+1
c nyv = first dimension of g >= ny+1
c kypd = second dimension of f >= kyp+1
c kxpd = second dimension of g >= kxp+1
c kxp/kyp = number of data values per block in x/y
c jblok/kblok = number of data blocks in x/y
      implicit none
      integer nx, ny, kstrt, nxv, nyv, kxp, kyp, kxpd, kypd
      integer jblok, kblok
      real f, g, s, t
      dimension f(2,nxv,kypd,kblok), g(2,nyv,kxpd,jblok)
      dimension s(2,kxp+1,kyp+1,kblok), t(2,kxp+1,kyp+1,jblok)
c common block for parallel processing
      integer nproc, lgrp, lstat, mreal, mint, mcplx, mdouble, lworld
c lstat = length of status array
      parameter(lstat=10)
c lgrp = current communicator
c mreal = default datatype for reals
      common /PPARMS/ nproc, lgrp, mreal, mint, mcplx, mdouble, lworld
c local data
      integer ks, kxb, kyb, kxp1, kyp1, kxpt, kypt
      integer jkblok, kxym, mtr, ntr, mntr
      integer l, i, joff, koff, k, j
      integer ir0, is0, ii, ir, is, ierr, msid, istatus
      dimension istatus(lstat)
      ks = kstrt - 2
      kxb = nx/kxp
      kyb = ny/kyp
c set constants to receive extra guard cells
      kxp1 = kxp + 1
      kyp1 = kyp + 1
      kxpt = kxp
      if (kstrt.eq.kxb) kxpt = kxp1
c this segment is used for shared memory computers
c     if (kstrt.gt.nx) return
c     kypt = kyp
c     do 40 l = 1, jblok
c     joff = kxp*(l + ks)
c     if ((l+ks).eq.(kxb-1)) kxpt = kxp1
c     do 30 i = 1, kyb
c     koff = kyp*(i - 1)
c     if (i.eq.kyb) kypt = kyp1
c     do 20 k = 1, kypt
c     do 10 j = 1, kxpt
c     g(1,k+koff,j,l) = f(1,j+joff,k,i)
c     g(2,k+koff,j,l) = f(2,j+joff,k,i)
c  10 continue
c  20 continue
c  30 continue
c  40 continue
c this segment is used for mpi computers
      jkblok = max0(jblok,kblok)
      kxym = min0(kxb,kyb)
      mtr = kyb/kxym
      ntr = kxb/kxym
      mntr = max0(mtr,ntr)
      do 70 l = 1, jkblok
      do 60 i = 1, kxym
      ir0 = iand(kxym-1,ieor(l+ks,i-1)) + 1
      is0 = ir0
      do 50 ii = 1, mntr
c post receive
      if ((kstrt.le.nx).and.(ii.le.mtr)) then
         ir = ir0 + kxym*(ii - 1)
         kypt = kyp
         if (ir.eq.kyb) kypt = kyp1
         call MPI_IRECV(t(1,1,1,l),2*kxp1*kyp1,mreal,ir-1,ir+kxym+1,lgrp
     1,msid,ierr)
      endif
c send data
      if ((kstrt.le.ny).and.(ii.le.ntr)) then
         is = is0 + kxym*(ii - 1)
         joff = kxp*(is - 1)
         do 20 k = 1, kyp1
         do 10 j = 1, kxp1
         s(1,j,k,l) = f(1,j+joff,k,l)
         s(2,j,k,l) = f(2,j+joff,k,l)
   10    continue
   20    continue
         call MPI_SEND(s(1,1,1,l),2*kxp1*kyp1,mreal,is-1,l+ks+kxym+2,lgr
     1p,ierr)
      endif
c receive data
      if ((kstrt.le.nx).and.(ii.le.mtr)) then
         koff = kyp*(ir - 1)
         call MPI_WAIT(msid,istatus,ierr)
         do 40 k = 1, kypt
         do 30 j = 1, kxpt
         g(1,k+koff,j,l) = t(1,j,k,l)
         g(2,k+koff,j,l) = t(2,j,k,l)
   30    continue
   40    continue
      endif
   50 continue
   60 continue
   70 continue
      return
      end
c-----------------------------------------------------------------------
      subroutine PR3TPOSE(f,g,s,t,nx,ny,kstrt,nxv,nyv,kxp,kyp,kxpd,kypd,
     1jblok,kblok)
c this subroutine performs a transpose of a matrix f, distributed in y,
c to a matrix g, distributed in x, that is,
c g(1:3,k+kyp*(m-1),j,l) = f(1:3,j+kxp*(l-1),k,m), where
c 1 <= j <= kxp, 1 <= k <= kyp, 1 <= l <= nx/kxp, 1 <= m <= ny/kyp
c and where indices l and m can be distributed across processors.
c includes an extra guard cell for last row and column
c this subroutine sends and receives one message at a time, either
c synchronously or asynchronously. it uses a minimum of system resources
c f = real input array
c g = real output array
c s, t = real scratch arrays
c nx/ny = number of points in x/y
c kstrt = starting data block number
c nxv = first dimension of f >= nx+1
c nyv = first dimension of g >= ny+1
c kypd = second dimension of f >= kyp+1
c kxpd = second dimension of g >= kxp+1
c kxp/kyp = number of data values per block in x/y
c jblok/kblok = number of data blocks in x/y
      implicit none
      integer nx, ny, kstrt, nxv, nyv, kxp, kyp, kxpd, kypd
      integer jblok, kblok
      real f, g, s, t
      dimension f(3,nxv,kypd,kblok), g(3,nyv,kxpd,jblok)
      dimension s(3,kxp+1,kyp+1,kblok), t(3,kxp+1,kyp+1,jblok)
c common block for parallel processing
      integer nproc, lgrp, lstat, mreal, mint, mcplx, mdouble, lworld
c lstat = length of status array
      parameter(lstat=10)
c lgrp = current communicator
c mreal = default datatype for reals
      common /PPARMS/ nproc, lgrp, mreal, mint, mcplx, mdouble, lworld
c local data
      integer ks, kxb, kyb, kxp1, kyp1, kxpt, kypt
      integer jkblok, kxym, mtr, ntr, mntr
      integer l, i, joff, koff, k, j
      integer ir0, is0, ii, ir, is, ierr, msid, istatus
      dimension istatus(lstat)
      ks = kstrt - 2
      kxb = nx/kxp
      kyb = ny/kyp
c set constants to receive extra guard cells
      kxp1 = kxp + 1
      kyp1 = kyp + 1
      kxpt = kxp
      if (kstrt.eq.kxb) kxpt = kxp1
c this segment is used for shared memory computers
c     if (kstrt.gt.nx) return
c     kypt = kyp
c     do 40 l = 1, jblok
c     joff = kxp*(l + ks)
c     if ((l+ks).eq.(kxb-1)) kxpt = kxp1
c     do 30 i = 1, kyb
c     koff = kyp*(i - 1)
c     if (i.eq.kyb) kypt = kyp1
c     do 20 k = 1, kypt
c     do 10 j = 1, kxpt
c     g(1,k+koff,j,l) = f(1,j+joff,k,i)
c     g(2,k+koff,j,l) = f(2,j+joff,k,i)
c     g(3,k+koff,j,l) = f(3,j+joff,k,i)
c  10 continue
c  20 continue
c  30 continue
c  40 continue
c this segment is used for mpi computers
      jkblok = max0(jblok,kblok)
      kxym = min0(kxb,kyb)
      mtr = kyb/kxym
      ntr = kxb/kxym
      mntr = max0(mtr,ntr)
      do 70 l = 1, jkblok
      do 60 i = 1, kxym
      ir0 = iand(kxym-1,ieor(l+ks,i-1)) + 1
      is0 = ir0
      do 50 ii = 1, mntr
c post receive
      if ((kstrt.le.nx).and.(ii.le.mtr)) then
         ir = ir0 + kxym*(ii - 1)
         kypt = kyp
         if (ir.eq.kyb) kypt = kyp1
         call MPI_IRECV(t(1,1,1,l),3*kxp1*kyp1,mreal,ir-1,ir+kxym+1,lgrp
     1,msid,ierr)
      endif
c send data
      if ((kstrt.le.ny).and.(ii.le.ntr)) then
         is = is0 + kxym*(ii - 1)
         joff = kxp*(is - 1)
         do 20 k = 1, kyp1
         do 10 j = 1, kxp1
         s(1,j,k,l) = f(1,j+joff,k,l)
         s(2,j,k,l) = f(2,j+joff,k,l)
         s(3,j,k,l) = f(3,j+joff,k,l)
   10    continue
   20    continue
         call MPI_SEND(s(1,1,1,l),3*kxp1*kyp1,mreal,is-1,l+ks+kxym+2,lgr
     1p,ierr)
      endif
c receive data
      if ((kstrt.le.nx).and.(ii.le.mtr)) then
         koff = kyp*(ir - 1)
         call MPI_WAIT(msid,istatus,ierr)
         do 40 k = 1, kypt
         do 30 j = 1, kxpt
         g(1,k+koff,j,l) = t(1,j,k,l)
         g(2,k+koff,j,l) = t(2,j,k,l)
         g(3,k+koff,j,l) = t(3,j,k,l)
   30    continue
   40    continue
      endif
   50 continue
   60 continue
   70 continue
      return
      end
c-----------------------------------------------------------------------
      subroutine PWTIMERA(icntrl,time,dtime)
c this subroutine performs local wall clock timing
c input: icntrl, dtime
c icntrl = (-1,0,1) = (initialize,ignore,read) clock
c clock should be initialized before it is read!
c time = elapsed time in seconds
c dtime = current time
c written for mpi
      implicit none
      integer icntrl
      real time
      double precision dtime
c local data
      double precision jclock
      double precision MPI_WTIME
      external MPI_WTIME
c initialize clock
      if (icntrl.eq.(-1)) then
         dtime = MPI_WTIME()
c read clock and write time difference from last clock initialization
      else if (icntrl.eq.1) then
         jclock = dtime
         dtime = MPI_WTIME()
         time = real(dtime - jclock)
      endif
      return
      end
c-----------------------------------------------------------------------
      subroutine PDIVFD2(f,df,nx,ny,kstrt,ndim,nyv,kxp2,j2blok)
c this subroutine calculates the divergence in fourier space
c with dirichlet boundary conditions (zero potential)
c intended for calculating the charge density from the electric field
c input: all except df, output: df
c approximate flop count is: 6*nxc*nyc
c where nxc = (nx/2-1)/nvp, nyc = ny - 1, and nvp = number of procs
c the divergence is calculated using the equation:
c df(kx,ky) = sqrt(-1)*(kx*fx(kx,ky)+ky*fy(kx,ky))
c where kx = 2pi*j/nx, ky = 2pi*k/ny, and j,k = fourier mode numbers,
c except for df(kx=pi) = df(ky=pi) = df(kx=0,ky=0) = 0.
c nx/ny = system length in x/y direction
c ndim = number of field arrays, must be >= 2
c kstrt = starting data block number
c nyv = first dimension of field arrays, must be >= ny+1
c kxp2 = number of data values per block
c j2blok = number of data blocks
      real f, df
      dimension f(ndim,nyv,kxp2+1,j2blok), df(nyv,kxp2+1,j2blok)
      if (ndim.lt.2) return
      ks = kstrt - 2
      ny1 = ny + 1
      dnx = 6.28318530717959/float(nx + nx)
      dny = 6.28318530717959/float(ny + ny)
c calculate the divergence
      if (kstrt.gt.nx) return
      do 50 l = 1, j2blok
c mode numbers 0 < kx < nx and 0 < ky < ny
      joff = kxp2*(l + ks) - 1
      do 20 j = 1, kxp2
      dkx = dnx*float(j + joff)
      if ((j+joff).gt.0) then
         do 10 k = 2, ny
         dky = dny*float(k - 1)
         df(k,j,l) = -(dkx*f(1,k,j,l) + dky*f(2,k,j,l))
   10    continue
      endif
c mode numbers ky = 0, ny
      df(1,j,l) = 0.
      df(ny+1,j,l) = 0.
   20 continue
c mode numbers kx = 0, nx
      if ((l+ks).eq.0) then
         do 30 k = 2, ny
         df(k,1,l) = 0.
   30    continue
      endif
      do 40 k = 1, ny1
      df(k,kxp2+1,l) = 0.
   40 continue
   50 continue
      return
      end
c-----------------------------------------------------------------------
      subroutine PGRADFD2(df,f,nx,ny,kstrt,ndim,nyv,kxp2,j2blok)
c this subroutine calculates the gradient in fourier space
c with dirichlet boundary conditions (zero potential)
c intended for calculating the electric field from the potential
c input: all except f, output: f
c approximate flop count is: 4*nxc*nyc
c where nxc = (nx/2-1)/nvp, nyc = ny - 1, and nvp = number of procs
c the gradient is calculated using the equations:
c fx(kx,ky) = sqrt(-1)*kx*df(kx,ky)
c fy(kx,ky) = sqrt(-1)*ky*df(kx,ky)
c where kx = 2pi*j/nx, ky = 2pi*k/ny, and j,k = fourier mode numbers,
c except for fx(kx=pi) = fy(kx=pi) = 0, fx(ky=pi) = fy(ky=pi) = 0,
c and fx(kx=0,ky=0) = fy(kx=0,ky=0) = 0.
c nx/ny = system length in x/y direction
c ndim = number of field arrays, must be >= 2
c kstrt = starting data block number
c nyv = first dimension of field arrays, must be >= ny+1
c kxp2 = number of data values per block
c j2blok = number of data blocks
      real df, f
      dimension df(nyv,kxp2+1,j2blok), f(ndim,nyv,kxp2+1,j2blok)
      ks = kstrt - 2
      ny1 = ny + 1
      dnx = 6.28318530717959/float(nx + nx)
      dny = 6.28318530717959/float(ny + ny)
c calculate the gradient
      if (kstrt.gt.nx) return
      do 50 l = 1, j2blok
c mode numbers 0 < kx < nx and 0 < ky < ny
      joff = kxp2*(l + ks) - 1
      do 20 j = 1, kxp2
      dkx = dnx*float(j + joff)
      if ((j+joff).gt.0) then
         do 10 k = 2, ny
         dky = dny*float(k - 1)
         f(1,k,j,l) = dkx*df(k,j,l)
         f(2,k,j,l) = dky*df(k,j,l)
   10    continue
      endif
c mode numbers ky = 0, ny
      f(1,1,j,l) = 0.
      f(2,1,j,l) = 0.
      f(1,ny+1,j,l) = 0.
      f(2,ny+1,j,l) = 0.
   20 continue
c mode numbers kx = 0, nx
      if ((l+ks).eq.0) then
         do 30 k = 2, ny
         f(1,k,1,l) = 0.
         f(2,k,1,l) = 0.
   30    continue
      endif
      do 40 k = 1, ny1
      f(1,k,kxp2+1,l) = 0.
      f(2,k,kxp2+1,l) = 0.
   40 continue
   50 continue
      return
      end
c-----------------------------------------------------------------------
      subroutine PCURLFD22(f,g,nx,ny,kstrt,nyv,kxp2,j2blok)
c this subroutine calculates the curl in fourier space
c with dirichlet boundary conditions (zero potential)
c intended for calculating the magnetic field from the vector potential
c input: all except g, output: g
c approximate flop count is: 32*nxc*nyc
c where nxc = (nx/2-1)/nvp, nyc = ny - 1, and nvp = number of procs
c the curl is calculated using the equations:
c g(kx,ky) = sqrt(-1)*(kx*fy(kx,ky)-ky*fx(kx,ky))
c where kx = 2pi*j/nx, ky = 2pi*k/ny, and j,k = fourier mode numbers,
c nx/ny = system length in x/y direction
c kstrt = starting data block number
c nyv = first dimension of field arrays, must be >= ny+1
c kxp2 = number of data values per block
c j2blok = number of data blocks
      real f, g
      dimension f(2,nyv,kxp2+1,j2blok), g(nyv,kxp2+1,j2blok)
      ks = kstrt - 2
      ny1 = ny + 1
      dnx = 6.28318530717959/float(nx + nx)
      dny = 6.28318530717959/float(ny + ny)
c calculate the curl
      if (kstrt.gt.nx) return
      do 50 l = 1, j2blok
c mode numbers 0 < kx < nx and 0 < ky < ny
      joff = kxp2*(l + ks) - 1
      do 20 j = 1, kxp2
      dkx = dnx*float(j + joff)
      if ((j+joff).gt.0) then
         do 10 k = 2, ny
         dky = dny*float(k - 1)
         g(k,j,l) = dkx*f(2,k,j,l) - dky*f(1,k,j,l)
   10    continue
c mode numbers ky = 0, ny
         g(1,j,l) = dkx*f(2,1,j,l)
      endif
      g(ny+1,j,l) = 0.
   20 continue
c mode numbers kx = 0, nx
      if ((l+ks).eq.0) then
         do 30 k = 2, ny
         dky = dny*float(k - 1)
         g(k,1,l) = -dky*f(1,k,1,l)
   30    continue
         g(1,1,l) = 0.
      endif
      do 40 k = 1, ny1
      g(k,kxp2+1,l) = 0.
   40 continue
   50 continue
      return
      end
c-----------------------------------------------------------------------
      subroutine PCURLFD2(f,g,nx,ny,kstrt,nyv,kxp2,j2blok)
c this subroutine calculates the curl in fourier space
c with dirichlet boundary conditions (zero potential)
c intended for calculating the magnetic field from the vector potential
c input: all except g, output: g
c approximate flop count is: 8*nxc*nyc
c where nxc = (nx/2-1)/nvp, nyc = ny - 1, and nvp = number of procs
c the curl is calculated using the equations:
c gx(kx,ky) = sqrt(-1)*ky*fz(kx,ky)
c gy(kx,ky) = -sqrt(-1)*kx*fz(kx,ky)
c gz(kx,ky) = sqrt(-1)*(kx*fy(kx,ky)-ky*fx(kx,ky))
c where kx = 2pi*j/nx, ky = 2pi*k/ny, and j,k = fourier mode numbers,
c except for gx(kx=pi) = gy(kx=pi) = 0, gx(ky=pi) = gy(ky=pi) = 0,
c and gx(kx=0,ky=0) = gy(kx=0,ky=0) = 0.
c nx/ny = system length in x/y direction
c kstrt = starting data block number
c nyv = first dimension of field arrays, must be >= ny+1
c kxp2 = number of data values per block
c j2blok = number of data blocks
      real f, g
      dimension f(3,nyv,kxp2+1,j2blok), g(3,nyv,kxp2+1,j2blok)
      ks = kstrt - 2
      ny1 = ny + 1
      dnx = 6.28318530717959/float(nx + nx)
      dny = 6.28318530717959/float(ny + ny)
c calculate the curl
      if (kstrt.gt.nx) return
      do 50 l = 1, j2blok
c mode numbers 0 < kx < nx and 0 < ky < ny
      joff = kxp2*(l + ks) - 1
      do 20 j = 1, kxp2
      dkx = dnx*float(j + joff)
      if ((j+joff).gt.0) then
         do 10 k = 2, ny
         dky = dny*float(k - 1)
         g(1,k,j,l) = dky*f(3,k,j,l)
         g(2,k,j,l) = -dkx*f(3,k,j,l)
         g(3,k,j,l) = dkx*f(2,k,j,l) - dky*f(1,k,j,l)
   10    continue
c mode numbers ky = 0, ny
         g(1,1,j,l) = 0.
         g(2,1,j,l) = 0.
         g(3,1,j,l) = dkx*f(2,1,j,l)
      endif
      g(1,ny+1,j,l) = 0.
      g(2,ny+1,j,l) = 0.
      g(3,ny+1,j,l) = 0.
   20 continue
c mode numbers kx = 0, nx
      if ((l+ks).eq.0) then
         do 30 k = 2, ny
         dky = dny*float(k - 1)
         g(1,k,1,l) = 0.
         g(2,k,1,l) = 0.
         g(3,k,1,l) = -dky*f(1,k,1,l)
   30    continue
         g(1,1,1,l) = 0.
         g(2,1,1,l) = 0.
         g(3,1,1,l) = 0.
      endif
      do 40 k = 1, ny1
      g(1,k,kxp2+1,l) = 0.
      g(2,k,kxp2+1,l) = 0.
      g(3,k,kxp2+1,l) = 0.
   40 continue
   50 continue
      return
      end
c-----------------------------------------------------------------------
      subroutine WPPFSST2RM(f,g,bs,br,isign,ntpose,mixup,sctd,ttp,indx, &
     &indy,kstrt,nvp,nxvh,nyv,kxp2,kyp,kypd,kxp2d,nxhyd,nxyd)
! wrapper function for parallel real sine/sine transform
      implicit none
      integer isign, ntpose, indx, indy, kstrt, nvp, nxvh, nyv
      integer kxp2, kyp, kypd, kxp2d, nxhyd, nxyd, mixup
      real f, g, bs, br, ttp
      complex sctd
      dimension f(2*nxvh,kypd), g(nyv,kxp2d)
      dimension bs(kxp2+1,kyp+1), br(kxp2+1,kyp+1)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer nx, ny, kxpi, kypi, ks, kxp2p, kypp, kxb2, kyb
      real tf
      double precision dtime
      data kxpi, kypi /1,1/
! calculate range of indices
      nx = 2**indx
      ny = 2**indy
! ks = processor id
      ks = kstrt - 1
! kxp2p = actual size used in x direction
      kxp2p = min(kxp2,max(0,nx-kxp2*ks))
! kypp = actual size used in y direction
      kypp = min(kyp,max(0,ny-kyp*ks))
! kxb2 = minimum number of processors needed in x direction
      kxb2 = (nx - 1)/kxp2 + 1
! kyb = minimum number of processors needed in y direction
      kyb = (ny - 1)/kyp + 1
! add extra word for last processor in x
      if (ks==(kxb2-1)) kxp2p = kxp2p + 1
! add extra word for last processor in y
      if (ks==(kyb-1)) kypp = kypp + 1
! inverse fourier transform
      if (isign.lt.0) then
! perform x sine transform
         call PPFST2RMXX(f,isign,mixup,sctd,indx,indy,kstrt,kypi,kypp,  &
     &nxvh,kypd,nxhyd,nxyd)
! transpose f array to g
         call PWTIMERA(-1,ttp,dtime)
         call PPRTPOSE(f,g,bs,br,nx,ny,kxp2,kyp,kstrt,nvp,2*nxvh,nyv,   &
     &kxp2d,kypd)
         call PWTIMERA(1,ttp,dtime)
! perform y sine transform
         call PPFST2RMXY(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,kxp2p, &
     &nyv,kxp2d,nxhyd,nxyd)
! transpose g array to f
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PPRTPOSE(g,f,br,bs,ny,nx,kyp,kxp2,kstrt,nvp,nyv,2*nxvh,&
     &kypd,kxp2d)
            call PWTIMERA(1,tf,dtime)
         endif
! forward fourier transform
      else if (isign.gt.0) then
! transpose f array to g
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PPRTPOSE(f,g,bs,br,nx,ny,kxp2,kyp,kstrt,nvp,2*nxvh,nyv,&
     &kxp2d,kypd)
            call PWTIMERA(1,tf,dtime)
         endif
! perform y sine transform
         call PPFST2RMXY(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,kxp2p, &
     &nyv,kxp2d,nxhyd,nxyd)
! transpose g array to f
         call PWTIMERA(-1,ttp,dtime)
         call PPRTPOSE(g,f,br,bs,ny,nx,kyp,kxp2,kstrt,nvp,nyv,2*nxvh,   &
     &kypd,kxp2d)
         call PWTIMERA(1,ttp,dtime)
! perform x sine transform
         call PPFST2RMXX(f,isign,mixup,sctd,indx,indy,kstrt,kypi,kypp,  &
     &nxvh,kypd,nxhyd,nxyd)
      endif
      if (ntpose.eq.0) ttp = ttp + tf
      return
      end
!-----------------------------------------------------------------------
      subroutine WPPFSCT2RM(f,g,bs,br,isign,ntpose,mixup,sctd,ttp,indx, &
     &indy,kstrt,nvp,nxvh,nyv,kxp2,kyp,kypd,kxp2d,nxhyd,nxyd)
! wrapper function for parallel real sine/cosine transform
      implicit none
      integer isign, ntpose, indx, indy, kstrt, nvp, nxvh, nyv
      integer kxp2, kyp, kypd, kxp2d, nxhyd, nxyd, mixup
      real f, g, bs, br, ttp
      complex sctd
      dimension f(2*nxvh,kypd), g(nyv,kxp2d)
      dimension bs(kxp2+1,kyp+1), br(kxp2+1,kyp+1)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer nx, ny, kxpi, kypi, ks, kxp2p, kypp, kxb2, kyb
      real tf
      double precision dtime
      data kxpi, kypi /1,1/
! calculate range of indices
      nx = 2**indx
      ny = 2**indy
! ks = processor id
      ks = kstrt - 1
! kxp2p = actual size used in x direction
      kxp2p = min(kxp2,max(0,nx-kxp2*ks))
! kypp = actual size used in y direction
      kypp = min(kyp,max(0,ny-kyp*ks))
! kxb2 = minimum number of processors needed in x direction
      kxb2 = (nx - 1)/kxp2 + 1
! kyb = minimum number of processors needed in y direction
      kyb = (ny - 1)/kyp + 1
! add extra word for last processor in x
      if (ks==(kxb2-1)) kxp2p = kxp2p + 1
! add extra word for last processor in y
      if (ks==(kyb-1)) kypp = kypp + 1
! inverse fourier transform
      if (isign.lt.0) then
! perform x sine transform
         call PPFST2RMXX(f,isign,mixup,sctd,indx,indy,kstrt,kypi,kypp,  &
     &nxvh,kypd,nxhyd,nxyd)
! transpose f array to g
         call PWTIMERA(-1,ttp,dtime)
         call PPRTPOSE(f,g,bs,br,nx,ny,kxp2,kyp,kstrt,nvp,2*nxvh,nyv,   &
     &kxp2d,kypd)
         call PWTIMERA(1,ttp,dtime)
! perform y cosine transform
         call PPFCT2RMXY(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,kxp2p, &
     &nyv,kxp2d,nxhyd,nxyd)
! transpose g array to f
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PPRTPOSE(g,f,br,bs,ny,nx,kyp,kxp2,kstrt,nvp,nyv,2*nxvh,&
     &kypd,kxp2d)
            call PWTIMERA(1,tf,dtime)
         endif
! forward fourier transform
      else if (isign.gt.0) then
! transpose f array to g
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PPRTPOSE(f,g,bs,br,nx,ny,kxp2,kyp,kstrt,nvp,2*nxvh,nyv,&
     &kxp2d,kypd)
            call PWTIMERA(1,tf,dtime)
         endif
! perform y cosine transform
         call PPFCT2RMXY(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,kxp2p, &
     &nyv,kxp2d,nxhyd,nxyd)
! transpose g array to f
         call PWTIMERA(-1,ttp,dtime)
         call PPRTPOSE(g,f,br,bs,ny,nx,kyp,kxp2,kstrt,nvp,nyv,2*nxvh,   &
     &kypd,kxp2d)
         call PWTIMERA(1,ttp,dtime)
! perform x sine transform
         call PPFST2RMXX(f,isign,mixup,sctd,indx,indy,kstrt,kypi,kypp,  &
     &nxvh,kypd,nxhyd,nxyd)
      endif
      if (ntpose.eq.0) ttp = ttp + tf
      return
      end
!-----------------------------------------------------------------------
      subroutine WPPFCST2RM(f,g,bs,br,isign,ntpose,mixup,sctd,ttp,indx, &
     &indy,kstrt,nvp,nxvh,nyv,kxp2,kyp,kypd,kxp2d,nxhyd,nxyd)
! wrapper function for parallel real cosine/sine transform
      implicit none
      integer isign, ntpose, indx, indy, kstrt, nvp, nxvh, nyv
      integer kxp2, kyp, kypd, kxp2d, nxhyd, nxyd, mixup
      real f, g, bs, br, ttp
      complex sctd
      dimension f(2*nxvh,kypd), g(nyv,kxp2d)
      dimension bs(kxp2+1,kyp+1), br(kxp2+1,kyp+1)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer nx, ny, kxpi, kypi, ks, kxp2p, kypp, kxb2, kyb
      real tf
      double precision dtime
      data kxpi, kypi /1,1/
! calculate range of indices
      nx = 2**indx
      ny = 2**indy
! ks = processor id
      ks = kstrt - 1
! kxp2p = actual size used in x direction
      kxp2p = min(kxp2,max(0,nx-kxp2*ks))
! kypp = actual size used in y direction
      kypp = min(kyp,max(0,ny-kyp*ks))
! kxb2 = minimum number of processors needed in x direction
      kxb2 = (nx - 1)/kxp2 + 1
! kyb = minimum number of processors needed in y direction
      kyb = (ny - 1)/kyp + 1
! add extra word for last processor in x
      if (ks==(kxb2-1)) kxp2p = kxp2p + 1
! add extra word for last processor in y
      if (ks==(kyb-1)) kypp = kypp + 1
! inverse fourier transform
      if (isign.lt.0) then
! perform x cosine transform
         call PPFCT2RMXX(f,isign,mixup,sctd,indx,indy,kstrt,kypi,kypp,  &
     &nxvh,kypd,nxhyd,nxyd)
! transpose f array to g
         call PWTIMERA(-1,ttp,dtime)
         call PPRTPOSE(f,g,bs,br,nx,ny,kxp2,kyp,kstrt,nvp,2*nxvh,nyv,   &
     &kxp2d,kypd)
         call PWTIMERA(1,ttp,dtime)
! perform y sine transform
         call PPFST2RMXY(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,kxp2p, &
     &nyv,kxp2d,nxhyd,nxyd)
! transpose g array to f
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PPRTPOSE(g,f,br,bs,ny,nx,kyp,kxp2,kstrt,nvp,nyv,2*nxvh,&
     &kypd,kxp2d)
            call PWTIMERA(1,tf,dtime)
         endif
! forward fourier transform
      else if (isign.gt.0) then
! transpose f array to g
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PPRTPOSE(f,g,bs,br,nx,ny,kxp2,kyp,kstrt,nvp,2*nxvh,nyv,&
     &kxp2d,kypd)
            call PWTIMERA(1,tf,dtime)
         endif
! perform y sine transform
         call PPFST2RMXY(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,kxp2p, &
     &nyv,kxp2d,nxhyd,nxyd)
! transpose g array to f
         call PWTIMERA(-1,ttp,dtime)
         call PPRTPOSE(g,f,br,bs,ny,nx,kyp,kxp2,kstrt,nvp,nyv,2*nxvh,   &
     &kypd,kxp2d)
         call PWTIMERA(1,ttp,dtime)
! perform x cosine transform
         call PPFCT2RMXX(f,isign,mixup,sctd,indx,indy,kstrt,kypi,kypp,  &
     &nxvh,kypd,nxhyd,nxyd)
      endif
      if (ntpose.eq.0) ttp = ttp + tf
      return
      end
!-----------------------------------------------------------------------
      subroutine WPPFCCT2RM(f,g,bs,br,isign,ntpose,mixup,sctd,ttp,indx, &
     &indy,kstrt,nvp,nxvh,nyv,kxp2,kyp,kypd,kxp2d,nxhyd,nxyd)
! wrapper function for parallel real cosine/cosine transform
      implicit none
      integer isign, ntpose, indx, indy, kstrt, nvp, nxvh, nyv
      integer kxp2, kyp, kypd, kxp2d, nxhyd, nxyd, mixup
      real f, g, bs, br, ttp
      complex sctd
      dimension f(2*nxvh,kypd), g(nyv,kxp2d)
      dimension bs(kxp2+1,kyp+1), br(kxp2+1,kyp+1)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer nx, ny, kxpi, kypi, ks, kxp2p, kypp, kxb2, kyb
      real tf
      double precision dtime
      data kxpi, kypi /1,1/
! calculate range of indices
      nx = 2**indx
      ny = 2**indy
! ks = processor id
      ks = kstrt - 1
! kxp2p = actual size used in x direction
      kxp2p = min(kxp2,max(0,nx-kxp2*ks))
! kypp = actual size used in y direction
      kypp = min(kyp,max(0,ny-kyp*ks))
! kxb2 = minimum number of processors needed in x direction
      kxb2 = (nx - 1)/kxp2 + 1
! kyb = minimum number of processors needed in y direction
      kyb = (ny - 1)/kyp + 1
! add extra word for last processor in x
      if (ks==(kxb2-1)) kxp2p = kxp2p + 1
! add extra word for last processor in y
      if (ks==(kyb-1)) kypp = kypp + 1
! inverse fourier transform
      if (isign.lt.0) then
! perform x cosine transform
         call PPFCT2RMXX(f,isign,mixup,sctd,indx,indy,kstrt,kypi,kypp,  &
     &nxvh,kypd,nxhyd,nxyd)
! transpose f array to g
         call PWTIMERA(-1,ttp,dtime)
         call PPRTPOSE(f,g,bs,br,nx,ny,kxp2,kyp,kstrt,nvp,2*nxvh,nyv,   &
     &kxp2d,kypd)
         call PWTIMERA(1,ttp,dtime)
! perform y cosine transform
         call PPFCT2RMXY(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,kxp2p, &
     &nyv,kxp2d,nxhyd,nxyd)
! transpose g array to f
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PPRTPOSE(g,f,br,bs,ny,nx,kyp,kxp2,kstrt,nvp,nyv,2*nxvh,&
     &kypd,kxp2d)
            call PWTIMERA(1,tf,dtime)
         endif
! forward fourier transform
      else if (isign.gt.0) then
! transpose f array to g
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PPRTPOSE(f,g,bs,br,nx,ny,kxp2,kyp,kstrt,nvp,2*nxvh,nyv,&
     &kxp2d,kypd)
            call PWTIMERA(1,tf,dtime)
         endif
! perform y cosine transform
         call PPFCT2RMXY(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,kxp2p, &
     &nyv,kxp2d,nxhyd,nxyd)
! transpose g array to f
         call PWTIMERA(-1,ttp,dtime)
         call PPRTPOSE(g,f,br,bs,ny,nx,kyp,kxp2,kstrt,nvp,nyv,2*nxvh,   &
     &kypd,kxp2d)
         call PWTIMERA(1,ttp,dtime)
! perform x cosine transform
         call PPFCT2RMXX(f,isign,mixup,sctd,indx,indy,kstrt,kypi,kypp,  &
     &nxvh,kypd,nxhyd,nxyd)
      endif
      if (ntpose.eq.0) ttp = ttp + tf
      return
      end
!-----------------------------------------------------------------------
      subroutine PPFST2RMXX(f,isign,mixup,sctd,indx,indy,kstrt,kypi,kypp&
     &,nxvh,kypd,nxhyd,nxyd)
! this subroutine performs the x part of a two dimensional fast real
! sine transform and its inverse, for a subset of y,
! using real arithmetic, with OpenMP,
! for data which is distributed in blocks
! algorithm is described in Numerical Recipies in Fortran, Second Ed.,
! by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
! [Cambridge Univ. Press, 1992], p. 508.
! for isign = (-1,1), input: all, output: f
! approximate flop count: N*(5*log2(N) + 18)/nvp
! where N = (nx/2)*ny
! indx/indy = exponent which determines length in x/y direction,
! where nx=2**indx, ny=2**indy
! if isign = -1, an inverse sine transform is performed
! f(n,k) = (1/nx*ny)*sum(f(j,k)*sin(pi*n*j/nx))
! if isign = 1, a forward sine transform is performed
! f(j,k) = sum(f(n,k)*sin(pi*n*j/nx))
! mixup = array of bit reversed addresses
! sctd = sine/cosine table
! kstrt = starting data block number
! kypi = initial y index used
! kypp = number of y indices used
! nxvh = first dimension of f >= nx/2 + 1
! kypd = second dimension of f >= kyp + 1
! nxhyd = maximum of (nx/2,ny)
! nxyd = maximum of (nx,ny)
! written by viktor k. decyk, ucla
      implicit none
      integer isign, indx, indy, kstrt, kypi, kypp, nxvh, kypd
      integer nxhyd, nxyd, mixup
      real f
      complex sctd
      dimension f(2*nxvh,kypd)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer indx1, indx1y, nx, nxh, nxhh, nx3, ny, nxy, nxhy, ks
      integer i, j, k, m, km, kmr, nrx, j1, j2, ns, ns2, k1, k2, kyps
      integer nrxb
      real at1, at2, t2, t3, t4, t5, t6, ani
      complex t1
      double precision sum1
      indx1 = indx - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      nxh = nx/2
      nxhh = nx/4
      nx3 = nx + 3
      ny = 2**indy
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 1
      kyps = kypi + kypp - 1
      if (kstrt.gt.ny) return
      if (isign.eq.0) return
      ani = 0.5/(real(nx)*real(ny))
      nrxb = nxhy/nxh
      nrx = nxy/nxh
!$OMP PARALLEL DO
!$OMP& PRIVATE(i,j,k,m,ns,ns2,km,kmr,k1,k2,j1,j2,at1,at2,t2,t3,t4,t5,t6,
!$OMP& t1,sum1)
      do 90 i = kypi, kyps
! create auxiliary array in x
      kmr = nxy/nx
      do 10 j = 2, nxh
      j1 = 1 + kmr*(j - 1)
      at2 = f(nx+2-j,i)
      at1 = f(j,i) + at2
      at2 = f(j,i) - at2
      at1 = -aimag(sctd(j1))*at1
      at2 = 0.5*at2
      f(j,i) = at1 + at2
      f(nx+2-j,i) = at1 - at2
   10 continue
      f(1,i) = 0.0
      f(nxh+1,i) = 2.0*f(nxh+1,i)
! bit-reverse array elements in x
      do 20 j = 1, nxh
      j1 = (mixup(j) - 1)/nrxb + 1
      if (j.lt.j1) then
         t2 = f(2*j1-1,i)
         t3 = f(2*j1,i)
         f(2*j1-1,i) = f(2*j-1,i)
         f(2*j1,i) = f(2*j,i)
         f(2*j-1,i) = t2
         f(2*j,i) = t3
      endif
   20 continue
! then transform in x
      do 50 m = 1, indx1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nxhh/ns
      kmr = 2*km*nrx
      do 40 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 30 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      t2 = real(t1)*f(2*j2-1,i) - aimag(t1)*f(2*j2,i)
      t3 = aimag(t1)*f(2*j2-1,i) + real(t1)*f(2*j2,i)
      f(2*j2-1,i) = f(2*j1-1,i) - t2
      f(2*j2,i) = f(2*j1,i) - t3
      f(2*j1-1,i) = f(2*j1-1,i) + t2
      f(2*j1,i) = f(2*j1,i) + t3
   30 continue
   40 continue
   50 continue
! unscramble coefficients and normalize
! inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nxh
         do 60 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         t4 = f(nx3-2*j,i)
         t5 = -f(nx3-2*j+1,i)
         t2 = f(2*j-1,i) + t4
         t3 = f(2*j,i) + t5
         t6 = f(2*j-1,i) - t4
         t5 = f(2*j,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(2*j-1,i) = ani*(t2 + t4)
         f(2*j,i) = ani*(t3 + t5)
         f(nx3-2*j,i) = ani*(t2 - t4)
         f(nx3-2*j+1,i) = ani*(t5 - t3)
   60    continue
         f(nxh+1,i) = 2.0*ani*f(nxh+1,i)
         f(nxh+2,i) = -2.0*ani*f(nxh+2,i)
         t2 = 2.0*ani*(f(1,i) + f(2,i))
         f(2,i) = 2.0*ani*(f(1,i) - f(2,i))
         f(1,i) = t2
         f(nx+1,i) = 2.0*ani*f(nx+1,i)
! forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nxh
         do 70 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         t4 = f(nx3-2*j,i)
         t5 = -f(nx3-2*j+1,i)
         t2 = f(2*j-1,i) + t4
         t3 = f(2*j,i) + t5
         t6 = f(2*j-1,i) - t4
         t5 = f(2*j,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(2*j-1,i) = t2 + t4
         f(2*j,i) = t3 + t5
         f(nx3-2*j,i) = t2 - t4
         f(nx3-2*j+1,i) = t5 - t3
   70    continue
         f(nxh+1,i) = 2.0*f(nxh+1,i)
         f(nxh+2,i) = -2.0*f(nxh+2,i)
         t2 = 2.0*(f(1,i) + f(2,i))
         f(2,i) = 2.0*(f(1,i) - f(2,i))
         f(1,i) = t2
         f(nx+1,i) = 2.0*f(nx+1,i)
      endif
! perform recursion for sine transform
      sum1 = 0.5*f(1,i)
      f(1,i) = 0.0
      f(2,i) = sum1
      do 80 j = 2, nxh
      sum1 = sum1 + f(2*j-1,i)
      f(2*j-1,i) = -f(2*j,i)
      f(2*j,i) = sum1
   80 continue
      f(nx+1,i) = 0.0
   90 continue
!$OMP END PARALLEL DO
      return
      end
!-----------------------------------------------------------------------
      subroutine PPFCT2RMXX(f,isign,mixup,sctd,indx,indy,kstrt,kypi,kypp&
     &,nxvh,kypd,nxhyd,nxyd)
! this subroutine performs the x part of a two dimensional fast real
! cosine transform and its inverse, for a subset of y,
! using real arithmetic, with OpenMP,
! for data which is distributed in blocks
! algorithm is described in Numerical Recipies in Fortran, Second Ed.,
! by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
! [Cambridge Univ. Press, 1992], p. 508.
! for isign = (-1,1), input: all, output: f
! approximate flop count: N*(5*log2(N) + 18)/nvp
! where N = (nx/2)*ny
! indx/indy = exponent which determines length in x/y direction,
! where nx=2**indx, ny=2**indy
! if isign = -1, an inverse cosine transform is performed
! f(n,k) = (1/nx*ny)*(.5*f(1,k) + ((-1)**n)*f(nx+1,k)
!            + sum(f(j,k)*cos(pi*n*j/nx)))
! if isign = 1, a forward cosine transform is performed
! f(j,k) = 2*(.5*f(1,k) + ((-1)**j)*f(n+1,k) + sum(f(n,k)*
!            cos(pi*n*j/nx))
! mixup = array of bit reversed addresses
! sctd = sine/cosine table
! kstrt = starting data block number
! kypi = initial y index used
! kypp = number of y indices used
! nxvh = first dimension of f >= nx/2 + 1
! kypd = second dimension of f >= kyp+1
! nxhyd = maximum of (nx/2,ny)
! nxyd = maximum of (nx,ny)
! written by viktor k. decyk, ucla
      implicit none
      integer isign, indx, indy, kstrt, kypi, kypp, nxvh, kypd
      integer nxhyd, nxyd, mixup
      real f
      complex sctd
      dimension f(2*nxvh,kypd)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer indx1, indx1y, nx, nxh, nxhh, nx3, ny, nxy, nxhy, ks
      integer i, j, k, m, km, kmr, nrx, j1, j2, ns, ns2, k1, k2, kyps
      integer nrxb
      real at1, at2, t2, t3, t4, t5, t6, ani
      double precision sum1
      complex t1
      indx1 = indx - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      nxh = nx/2
      nxhh = nx/4
      nx3 = nx + 3
      ny = 2**indy
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 1
      kyps = kypi + kypp - 1
      if (kstrt.gt.ny) return
      if (isign.eq.0) return
      ani = 0.5/(real(nx)*real(ny))
      nrxb = nxhy/nxh
      nrx = nxy/nxh
!$OMP PARALLEL DO
!$OMP& PRIVATE(i,j,k,m,ns,ns2,km,kmr,k1,k2,j1,j2,at1,at2,t2,t3,t4,t5,t6,
!$OMP& t1,sum1)
      do 90 i = kypi, kyps
! create auxiliary array in x
      kmr = nxy/nx
      sum1 = 0.5*(f(1,i) - f(nx+1,i))
      do 10 j = 2, nxh
      j1 = 1 + kmr*(j - 1)
      at2 = f(nx+2-j,i)
      at1 = f(j,i) + at2
      at2 = f(j,i) - at2
      sum1 = sum1 + real(sctd(j1))*at2
      at2 = -aimag(sctd(j1))*at2
      at1 = 0.5*at1
      f(j,i) = at1 - at2
      f(nx+2-j,i) = at1 + at2
   10 continue
      f(1,i) = 0.5*(f(1,i) + f(nx+1,i))
      f(nx+1,i) = sum1
! bit-reverse array elements in x
      do 20 j = 1, nxh
      j1 = (mixup(j) - 1)/nrxb + 1
      if (j.lt.j1) then
         t2 = f(2*j1-1,i)
         t3 = f(2*j1,i)
         f(2*j1-1,i) = f(2*j-1,i)
         f(2*j1,i) = f(2*j,i)
         f(2*j-1,i) = t2
         f(2*j,i) = t3
      endif
   20 continue
! then transform in x
      do 50 m = 1, indx1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nxhh/ns
      kmr = 2*km*nrx
      do 40 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 30 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      t2 = real(t1)*f(2*j2-1,i) - aimag(t1)*f(2*j2,i)
      t3 = aimag(t1)*f(2*j2-1,i) + real(t1)*f(2*j2,i)
      f(2*j2-1,i) = f(2*j1-1,i) - t2
      f(2*j2,i) = f(2*j1,i) - t3
      f(2*j1-1,i) = f(2*j1-1,i) + t2
      f(2*j1,i) = f(2*j1,i) + t3
   30 continue
   40 continue
   50 continue
! unscramble coefficients and normalize
! inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nxh
         do 60 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         t4 = f(nx3-2*j,i)
         t5 = -f(nx3-2*j+1,i)
         t2 = f(2*j-1,i) + t4
         t3 = f(2*j,i) + t5
         t6 = f(2*j-1,i) - t4
         t5 = f(2*j,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(2*j-1,i) = ani*(t2 + t4)
         f(2*j,i) = ani*(t3 + t5)
         f(nx3-2*j,i) = ani*(t2 - t4)
         f(nx3-2*j+1,i) = ani*(t5 - t3)
   60    continue
         f(nxh+1,i) = 2.0*ani*f(nxh+1,i)
         f(nxh+2,i) = -2.0*ani*f(nxh+2,i)
         t2 = 2.0*ani*(f(1,i) + f(2,i))
         f(2,i) = 2.0*ani*(f(1,i) - f(2,i))
         f(1,i) = t2
         f(nx+1,i) = 2.0*ani*f(nx+1,i)
! forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nxh
         do 70 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         t4 = f(nx3-2*j,i)
         t5 = -f(nx3-2*j+1,i)
         t2 = f(2*j-1,i) + t4
         t3 = f(2*j,i) + t5
         t6 = f(2*j-1,i) - t4
         t5 = f(2*j,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(2*j-1,i) = t2 + t4
         f(2*j,i) = t3 + t5
         f(nx3-2*j,i) = t2 - t4
         f(nx3-2*j+1,i) = t5 - t3
   70    continue
         f(nxh+1,i) = 2.0*f(nxh+1,i)
         f(nxh+2,i) = -2.0*f(nxh+2,i)
         t2 = 2.0*(f(1,i) + f(2,i))
         f(2,i) = 2.0*(f(1,i) - f(2,i))
         f(1,i) = t2
         f(nx+1,i) = 2.0*f(nx+1,i)
      endif
! perform recursion for cosine transform
      sum1 = f(nx+1,i)
      f(nx+1,i) = f(2,i)
      f(2,i) = sum1
      do 80 j = 2, nxh
      sum1 = sum1 - f(2*j,i)
      f(2*j,i) = sum1
   80 continue
   90 continue
!$OMP END PARALLEL DO
      return
      end
!-----------------------------------------------------------------------
      subroutine PPFST2RMXY(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,kxpp&
     &,nyv,kxpd,nxhyd,nxyd)
! this subroutine performs the y part of a two dimensional fast real
! sine transform and its inverse, for a subset of x,
! using real arithmetic, with OpenMP,
! for data which is distributed in blocks
! algorithm is described in Numerical Recipies in Fortran, Second Ed.,
! by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
! [Cambridge Univ. Press, 1992], p. 508.
! for isign = (-1,1), input: all, output: f
! approximate flop count: N*(5*log2(N) + 18)/nvp
! where N = (nx/2)*ny
! indx/indy = exponent which determines length in x/y direction,
! where nx=2**indx, ny=2**indy
! if isign = -1, an inverse sine transform is performed
! g(m,n) = sum(g(k,n)*sin(pi*m*k/ny))
! if isign = 1, a forward sine transform is performed
! g(k,n) = sum(g(m,n)*sin(pi*m*k/ny))
! mixup = array of bit reversed addresses
! sctd = sine/cosine table
! kstrt = starting data block number
! kxpi = initial x index used
! kxpp = number of x indices used
! nyv = first dimension of g >= ny + 1
! kxpd = second dimension of g >= kxp + 1
! nxhyd = maximum of (nx/2,ny)
! nxyd = maximum of (nx,ny)
! written by viktor k. decyk, ucla
      implicit none
      integer isign, indx, indy, kstrt, kxpi, kxpp, nyv, kxpd
      integer nxhyd, nxyd, mixup
      real g
      complex sctd
      dimension g(nyv,kxpd)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer indx1, indy1, indx1y, nx, ny, nyh, nyhh, ny3, nxy, nxhy
      integer i, j, k, m, ks, km, kmr, nry, j1, j2, ns, ns2, k1, k2
      integer kxps, nryb
      real at1, at2, t2, t3, t4, t5, t6
      complex t1
      double precision sum1
      indx1 = indx - 1
      indy1 = indy - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      ny = 2**indy
      nyh = ny/2
      nyhh = ny/4
      ny3 = ny + 3
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 1
      kxps = kxpi + kxpp - 1
      if (kstrt.gt.nx) return
      if (isign.eq.0) return
      nryb = nxhy/nyh
      nry = nxy/nyh
!$OMP PARALLEL DO
!$OMP& PRIVATE(i,j,k,m,ns,ns2,km,kmr,k1,k2,j1,j2,at1,at2,t2,t3,t4,t5,t6,
!$OMP& t1,sum1)
      do 90 i = kxpi, kxps
! create auxiliary array in y
      kmr = nxy/ny
      do 10 k = 2, nyh
      k1 = 1 + kmr*(k - 1)
      at2 = g(ny+2-k,i)
      at1 = g(k,i) + at2
      at2 = g(k,i) - at2
      at1 = -aimag(sctd(k1))*at1
      at2 = 0.5*at2
      g(k,i) = at1 + at2
      g(ny+2-k,i) = at1 - at2
   10 continue
      g(1,i) = 0.0
      g(nyh+1,i) = 2.0*g(nyh+1,i)
! bit-reverse array elements in y
      do 20 k = 1, nyh
      k1 = (mixup(k) - 1)/nryb + 1
      if (k.lt.k1) then
         t2 = g(2*k1-1,i)
         t3 = g(2*k1,i)
         g(2*k1-1,i) = g(2*k-1,i)
         g(2*k1,i) = g(2*k,i)
         g(2*k-1,i) = t2
         g(2*k,i) = t3
      endif
   20 continue
! then transform in y
      do 50 m = 1, indy1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nyhh/ns
      kmr = 2*km*nry
      do 40 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 30 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      t2 = real(t1)*g(2*j2-1,i) - aimag(t1)*g(2*j2,i)
      t3 = aimag(t1)*g(2*j2-1,i) + real(t1)*g(2*j2,i)
      g(2*j2-1,i) = g(2*j1-1,i) - t2
      g(2*j2,i) = g(2*j1,i) - t3
      g(2*j1-1,i) = g(2*j1-1,i) + t2
      g(2*j1,i) = g(2*j1,i) + t3
   30 continue
   40 continue
   50 continue
! unscramble coefficients and normalize
! inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nyh
         do 60 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         t4 = g(ny3-2*k,i)
         t5 = -g(ny3-2*k+1,i)
         t2 = g(2*k-1,i) + t4
         t3 = g(2*k,i) + t5
         t6 = g(2*k-1,i) - t4
         t5 = g(2*k,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(2*k-1,i) = 0.5*(t2 + t4)
         g(2*k,i) = 0.5*(t3 + t5)
         g(ny3-2*k,i) = 0.5*(t2 - t4)
         g(ny3-2*k+1,i) = 0.5*(t5 - t3)
   60    continue
         g(nyh+1,i) = g(nyh+1,i)
         g(nyh+2,i) = -g(nyh+2,i)
         t2 = g(1,i) + g(2,i)
         g(2,i) = g(1,i) - g(2,i)
         g(1,i) = t2
         g(ny+1,i) = g(ny+1,i)
! forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nyh
         do 70 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         t4 = g(ny3-2*k,i)
         t5 = -g(ny3-2*k+1,i)
         t2 = g(2*k-1,i) + t4
         t3 = g(2*k,i) + t5
         t6 = g(2*k-1,i) - t4
         t5 = g(2*k,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(2*k-1,i) = t2 + t4
         g(2*k,i) = t3 + t5
         g(ny3-2*k,i) = t2 - t4
         g(ny3-2*k+1,i) = t5 - t3
   70    continue
         g(nyh+1,i) = 2.0*g(nyh+1,i)
         g(nyh+2,i) = -2.0*g(nyh+2,i)
         t2 = 2.0*(g(1,i) + g(2,i))
         g(2,i) = 2.0*(g(1,i) - g(2,i))
         g(1,i) = t2
         g(ny+1,i) = 2.0*g(ny+1,i)
      endif
! perform recursion for sine transform
      sum1 = 0.5*g(1,i)
      g(1,i) = 0.0
      g(2,i) = sum1
      do 80 k = 2, nyh
      sum1 = sum1 + g(2*k-1,i)
      g(2*k-1,i) = -g(2*k,i)
      g(2*k,i) = sum1
   80 continue
      g(ny+1,i) = 0.0
   90 continue
!$OMP END PARALLEL DO
      return
      end
!-----------------------------------------------------------------------
      subroutine PPFCT2RMXY(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,kxpp&
     &,nyv,kxpd,nxhyd,nxyd)
! this subroutine performs the y part of a two dimensional fast real
! cosine transform and its inverse, for a subset of x,
! using real arithmetic, with OpenMP,
! for data which is distributed in blocks
! algorithm is described in Numerical Recipies in Fortran, Second Ed.,
! by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
! [Cambridge Univ. Press, 1992], p. 508.
! for isign = (-1,1), input: all, output: f
! approximate flop count: N*(5*log2(N) + 18)/nvp
! where N = (nx/2)*ny
! indx/indy = exponent which determines length in x/y direction,
! where nx=2**indx, ny=2**indy
! if isign = -1, an inverse cosine transform is performed
! g(m,n) = (.5*g(1,n) + ((-1)**m)*g(ny+1,n)
!            + sum(g(k,n)*cos(pi*m*k/ny))
! if isign = 1, a forward cosine transform is performed
! g(k,n) = 2*(.5*g(1,n) + ((-1)**m)*g(ny+1,n) + sum(g(m,n)*
!            cos(pi*m*k/ny))
! mixup = array of bit reversed addresses
! sctd = sine/cosine table
! kstrt = starting data block number
! kxpi = initial x index used
! kxpp = number of x indices used
! nyv = first dimension of g >= ny + 1
! kxpd = second dimension of g >= kxp + 1
! nxhyd = maximum of (nx/2,ny)
! nxyd = maximum of (nx,ny)
! written by viktor k. decyk, ucla
      implicit none
      integer isign, indx, indy, kstrt, kxpi, kxpp, nyv, kxpd
      integer nxhyd, nxyd, mixup
      real g
      complex sctd
      dimension g(nyv,kxpd)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer indx1, indy1, indx1y, nx, ny, nyh, nyhh, ny3, nxy, nxhy
      integer i, j, k, m, ks, km, kmr, nry, j1, j2, ns, ns2, k1, k2
      integer kxps, nryb
      real at1, at2, t2, t3, t4, t5, t6
      complex t1
      double precision sum1
      indx1 = indx - 1
      indy1 = indy - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      ny = 2**indy
      nyh = ny/2
      nyhh = ny/4
      ny3 = ny + 3
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 1
      kxps = kxpi + kxpp - 1
      if (kstrt.gt.nx) return
      if (isign.eq.0) return
      nryb = nxhy/nyh
      nry = nxy/nyh
!$OMP PARALLEL DO
!$OMP& PRIVATE(i,j,k,m,ns,ns2,km,kmr,k1,k2,j1,j2,at1,at2,t2,t3,t4,t5,t6,
!$OMP& t1,sum1)
      do 90 i = kxpi, kxps
! create auxiliary array in y
      kmr = nxy/ny
      sum1 = 0.5*(g(1,i) - g(ny+1,i))
      do 10 k = 2, nyh
      k1 = 1 + kmr*(k - 1)
      at2 = g(ny+2-k,i)
      at1 = g(k,i) + at2
      at2 = g(k,i) - at2
      sum1 = sum1 + real(sctd(k1))*at2
      at2 = -aimag(sctd(k1))*at2
      at1 = 0.5*at1
      g(k,i) = at1 - at2
      g(ny+2-k,i) = at1 + at2
   10 continue
      g(1,i) = 0.5*(g(1,i) + g(ny+1,i))
      g(ny+1,i) = sum1
! bit-reverse array elements in y
      do 20 k = 1, nyh
      k1 = (mixup(k) - 1)/nryb + 1
      if (k.lt.k1) then
         t2 = g(2*k1-1,i)
         t3 = g(2*k1,i)
         g(2*k1-1,i) = g(2*k-1,i)
         g(2*k1,i) = g(2*k,i)
         g(2*k-1,i) = t2
         g(2*k,i) = t3
      endif
   20 continue
! then transform in y
      do 50 m = 1, indy1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nyhh/ns
      kmr = 2*km*nry
      do 40 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 30 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      t2 = real(t1)*g(2*j2-1,i) - aimag(t1)*g(2*j2,i)
      t3 = aimag(t1)*g(2*j2-1,i) + real(t1)*g(2*j2,i)
      g(2*j2-1,i) = g(2*j1-1,i) - t2
      g(2*j2,i) = g(2*j1,i) - t3
      g(2*j1-1,i) = g(2*j1-1,i) + t2
      g(2*j1,i) = g(2*j1,i) + t3
   30 continue
   40 continue
   50 continue
! unscramble coefficients and normalize
! inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nyh
         do 60 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         t4 = g(ny3-2*k,i)
         t5 = -g(ny3-2*k+1,i)
         t2 = g(2*k-1,i) + t4
         t3 = g(2*k,i) + t5
         t6 = g(2*k-1,i) - t4
         t5 = g(2*k,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(2*k-1,i) = 0.5*(t2 + t4)
         g(2*k,i) = 0.5*(t3 + t5)
         g(ny3-2*k,i) = 0.5*(t2 - t4)
         g(ny3-2*k+1,i) = 0.5*(t5 - t3)
   60    continue
         g(nyh+1,i) = g(nyh+1,i)
         g(nyh+2,i) = -g(nyh+2,i)
         t2 = g(1,i) + g(2,i)
         g(2,i) = g(1,i) - g(2,i)
         g(1,i) = t2
         g(ny+1,i) = g(ny+1,i)
! forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nyh
         do 70 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         t4 = g(ny3-2*k,i)
         t5 = -g(ny3-2*k+1,i)
         t2 = g(2*k-1,i) + t4
         t3 = g(2*k,i) + t5
         t6 = g(2*k-1,i) - t4
         t5 = g(2*k,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(2*k-1,i) = t2 + t4
         g(2*k,i) = t3 + t5
         g(ny3-2*k,i) = t2 - t4
         g(ny3-2*k+1,i) = t5 - t3
   70    continue
         g(nyh+1,i) = 2.0*g(nyh+1,i)
         g(nyh+2,i) = -2.0*g(nyh+2,i)
         t2 = 2.0*(g(1,i) + g(2,i))
         g(2,i) = 2.0*(g(1,i) - g(2,i))
         g(1,i) = t2
         g(ny+1,i) = 2.0*g(ny+1,i)
      endif
! perform recursion for cosine transform
      sum1 = g(ny+1,i)
      g(ny+1,i) = g(2,i)
      g(2,i) = sum1
      do 80 k = 2, nyh
      sum1 = sum1 - g(2*k,i)
      g(2*k,i) = sum1
   80 continue
   90 continue
!$OMP END PARALLEL DO
      return
      end
!-----------------------------------------------------------------------
      subroutine WPPFCST2RM2(f,g,bs,br,isign,ntpose,mixup,sctd,ttp,indx,&
     &indy,kstrt,nvp,nxvh,nyv,kxp2,kyp,kypd,kxp2d,nxhyd,nxyd)
! wrapper function for 2 parallel real cosine/sine transforms
! for the electric field with dirichlet or magnetic field with neumann
! boundary conditions
! x component has a cosine/sine transform in x and y, respectively
! y component has a sine/cosine transform in x and y, respectively
      implicit none
      integer isign, ntpose, indx, indy, kstrt, nvp, nxvh, nyv
      integer kxp2, kyp, kypd, kxp2d, nxhyd, nxyd, mixup
      real f, g, bs, br, ttp
      complex sctd
      dimension f(2,2*nxvh,kypd), g(2,nyv,kxp2d)
      dimension bs(2,kxp2+1,kyp+1), br(2,kxp2+1,kyp+1)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer nx, ny, kxpi, kypi, ks, kxp2p, kypp, kxb2, kyb
      real tf
      double precision dtime
      data kxpi, kypi /1,1/
! calculate range of indices
      nx = 2**indx
      ny = 2**indy
! ks = processor id
      ks = kstrt - 1
! kxp2p = actual size used in x direction
      kxp2p = min(kxp2,max(0,nx-kxp2*ks))
! kypp = actual size used in y direction
      kypp = min(kyp,max(0,ny-kyp*ks))
! kxb2 = minimum number of processors needed in x direction
      kxb2 = (nx - 1)/kxp2 + 1
! kyb = minimum number of processors needed in y direction
      kyb = (ny - 1)/kyp + 1
! add extra word for last processor in x
      if (ks==(kxb2-1)) kxp2p = kxp2p + 1
! add extra word for last processor in y
      if (ks==(kyb-1)) kypp = kypp + 1
! inverse fourier transform
      if (isign.lt.0) then
! perform x cosine-sine transform
         call PPFCST2RM2X(f,isign,mixup,sctd,indx,indy,kstrt,kypi,kypp, &
     &nxvh,kypd,nxhyd,nxyd)
! transpose f array to g
         call PWTIMERA(-1,ttp,dtime)
         call PPRNTPOSE(f,g,bs,br,nx,ny,kxp2,kyp,kstrt,nvp,2,2*nxvh,nyv,&
     &kxp2d,kypd)
         call PWTIMERA(1,ttp,dtime)
! perform y sine-cosine transform
         call PPFSCT2RM2Y(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,kxp2p,&
     &nyv,kxp2d,nxhyd,nxyd)
! transpose g array to f
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PPRNTPOSE(g,f,br,bs,ny,nx,kyp,kxp2,kstrt,nvp,2,nyv,    &
     &2*nxvh,kypd,kxp2d)
            call PWTIMERA(1,tf,dtime)
         endif
! forward fourier transform
      else if (isign.gt.0) then
! transpose f array to g
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PPRNTPOSE(f,g,bs,br,nx,ny,kxp2,kyp,kstrt,nvp,2,2*nxvh, &
     &nyv,kxp2d,kypd)
            call PWTIMERA(1,tf,dtime)
         endif
! perform y sine-cosine transform
         call PPFSCT2RM2Y(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,kxp2p,&
     &nyv,kxp2d,nxhyd,nxyd)
! transpose g array to f
         call PWTIMERA(-1,ttp,dtime)
         call PPRNTPOSE(g,f,br,bs,ny,nx,kyp,kxp2,kstrt,nvp,2,nyv,2*nxvh,&
     &kypd,kxp2d)
         call PWTIMERA(1,ttp,dtime)
! perform x cosine-sine transform
         call PPFCST2RM2X(f,isign,mixup,sctd,indx,indy,kstrt,kypi,kypp, &
     &nxvh,kypd,nxhyd,nxyd)
      endif
      if (ntpose.eq.0) ttp = ttp + tf
      return
      end
!-----------------------------------------------------------------------
      subroutine WPPFSCT2RM2(f,g,bs,br,isign,ntpose,mixup,sctd,ttp,indx,&
     &indy,kstrt,nvp,nxvh,nyv,kxp2,kyp,kypd,kxp2d,nxhyd,nxyd)
! wrapper function for 2 parallel real sine/cosine transforms
! for the magnetic field with dirichlet or electric field with neumann
! boundary conditions
! x component has a sine/cosine transform in x and y, respectively
! y component has a cosine/sine transform in x and y, respectively
      implicit none
      integer isign, ntpose, indx, indy, kstrt, nvp, nxvh, nyv
      integer kxp2, kyp, kypd, kxp2d, nxhyd, nxyd, mixup
      real f, g, bs, br, ttp
      complex sctd
      dimension f(2,2*nxvh,kypd), g(2,nyv,kxp2d)
      dimension bs(2,kxp2+1,kyp+1), br(2,kxp2+1,kyp+1)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer nx, ny, kxpi, kypi, ks, kxp2p, kypp, kxb2, kyb
      real tf
      double precision dtime
      data kxpi, kypi /1,1/
! calculate range of indices
      nx = 2**indx
      ny = 2**indy
! ks = processor id
      ks = kstrt - 1
! kxp2p = actual size used in x direction
      kxp2p = min(kxp2,max(0,nx-kxp2*ks))
! kypp = actual size used in y direction
      kypp = min(kyp,max(0,ny-kyp*ks))
! kxb2 = minimum number of processors needed in x direction
      kxb2 = (nx - 1)/kxp2 + 1
! kyb = minimum number of processors needed in y direction
      kyb = (ny - 1)/kyp + 1
! add extra word for last processor in x
      if (ks==(kxb2-1)) kxp2p = kxp2p + 1
! add extra word for last processor in y
      if (ks==(kyb-1)) kypp = kypp + 1
! inverse fourier transform
      if (isign.lt.0) then
! perform x sine-cosine transform
         call PPFSCT2RM2X(f,isign,mixup,sctd,indx,indy,kstrt,kypi,kypp, &
     &nxvh,kypd,nxhyd,nxyd)
! transpose f array to g
         call PWTIMERA(-1,ttp,dtime)
         call PPRNTPOSE(f,g,bs,br,nx,ny,kxp2,kyp,kstrt,nvp,2,2*nxvh,nyv,&
     &kxp2d,kypd)
         call PWTIMERA(1,ttp,dtime)
! perform y cosine-sine transform
         call PPFCST2RM2Y(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,kxp2p,&
     &nyv,kxp2d,nxhyd,nxyd)
! transpose g array to f
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PPRNTPOSE(g,f,br,bs,ny,nx,kyp,kxp2,kstrt,nvp,2,nyv,    &
     &2*nxvh,kypd,kxp2d)
            call PWTIMERA(1,tf,dtime)
         endif
! forward fourier transform
      else if (isign.gt.0) then
! transpose f array to g
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PPRNTPOSE(f,g,bs,br,nx,ny,kxp2,kyp,kstrt,nvp,2,2*nxvh, &
     &nyv,kxp2d,kypd)
            call PWTIMERA(1,tf,dtime)
         endif
! perform y cosine-sine transform
         call PPFCST2RM2Y(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,kxp2p,&
     &nyv,kxp2d,nxhyd,nxyd)
! transpose g array to f
         call PWTIMERA(-1,ttp,dtime)
         call PPRNTPOSE(g,f,br,bs,ny,nx,kyp,kxp2,kstrt,nvp,2,nyv,2*nxvh,&
     &kypd,kxp2d)
         call PWTIMERA(1,ttp,dtime)
! perform x sine-cosine transform
         call PPFSCT2RM2X(f,isign,mixup,sctd,indx,indy,kstrt,kypi,kypp, &
     &nxvh,kypd,nxhyd,nxyd)
      endif
      if (ntpose.eq.0) ttp = ttp + tf
      return
      end
!-----------------------------------------------------------------------
      subroutine PPFCST2RM2X(f,isign,mixup,sctd,indx,indy,kstrt,kypi,   &
     &kypp,nxvh,kypd,nxhyd,nxyd)
! this subroutine performs the x part of 2 two dimensional fast real
! sine and cosine transforms and their inverses, for a subset of y,
! using real arithmetic, with OpenMP,
! for data which is distributed in blocks
! x component has a cosine transform, y component a sine transform
! algorithm is described in Numerical Recipies in Fortran, Second Ed.,
! by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
! [Cambridge Univ. Press, 1992], p. 508.
! for isign = (-1,1), input: all, output: f
! approximate flop count: N*(5*log2(N) + 18)/nvp
! where N = (nx/2)*ny
! indx/indy = exponent which determines length in x/y direction,
! where nx=2**indx, ny=2**indy
! if isign = -1, inverse cosine-sine transforms are performed
! f(1,n,k) = (1/nx*ny)*(.5*f(1,1,k) + ((-1)**n)*f(1,nx+1,k)
!              + sum(f(1,j,k)*cos(pi*n*j/nx)))
! f(2,n,k) = (1/nx*ny)*sum(f(2,j,k)*sin(pi*n*j/nx))
! if isign = 1, forward cosine-sine transforms are performed
! f(1,j,k) = 2*(.5*f(1,1,k) + ((-1)**j)*f(1,n+1,k)
!              + sum(f(1,n,k)*cos(pi*n*j/nx))
! f(2,j,k) = sum(f(2,n,k)*sin(pi*n*j/nx))
! mixup = array of bit reversed addresses
! sctd = sine/cosine table
! kstrt = starting data block number
! kypi = initial y index used
! kypp = number of y indices used
! nxvh = second dimension of f >= nx/2 + 1
! kypd = third dimension of f >= kyp + 1
! nxhyd = maximum of (nx/2,ny)
! nxyd = maximum of (nx,ny)
! written by viktor k. decyk, ucla
      implicit none
      integer isign, indx, indy, kstrt, kypi, kypp
      integer nxvh, kypd, nxhyd, nxyd, mixup
      real f
      complex sctd
      dimension f(2,2*nxvh,kypd)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer indx1, indx1y, nx, nxh, nxhh, nx3, ny, nxy, nxhy, ks
      integer i, j, k, m, km, kmr, nrx, j1, j2, ns, ns2, k1, k2, kyps
      integer nrxb, jj
      real at1, at2, at3, t2, t3, t4, t5, t6, ani
      complex t1
      double precision sum1, sum2
      indx1 = indx - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      nxh = nx/2
      nxhh = nx/4
      nx3 = nx + 3
      ny = 2**indy
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 1
      kyps = kypi + kypp - 1
      if (kstrt.gt.ny) return
      if (isign.eq.0) return
      ani = 0.5/(real(nx)*real(ny))
      nrxb = nxhy/nxh
      nrx = nxy/nxh
!$OMP PARALLEL DO
!$OMP& PRIVATE(i,j,k,m,jj,ns,ns2,km,kmr,k1,k2,j1,j2,at1,at2,at3,t2,t3,t4
!$OMP& ,t5,t6,t1,sum1,sum2)
      do 150 i = kypi, kyps
! create auxiliary array in x
      kmr = nxy/nx
      sum1 = 0.5*(f(1,1,i) - f(1,nx+1,i))
      do 10 j = 2, nxh
      j1 = 1 + kmr*(j - 1)
      at3 = -aimag(sctd(j1))
      at2 = f(1,nx+2-j,i)
      at1 = f(1,j,i) + at2
      at2 = f(1,j,i) - at2
      sum1 = sum1 + real(sctd(j1))*at2
      at2 = at3*at2
      at1 = 0.5*at1
      f(1,j,i) = at1 - at2
      f(1,nx+2-j,i) = at1 + at2
      at2 = f(2,nx+2-j,i)
      at1 = f(2,j,i) + at2
      at2 = f(2,j,i) - at2
      at1 = at3*at1
      at2 = 0.5*at2
      f(2,j,i) = at1 + at2
      f(2,nx+2-j,i) = at1 - at2
   10 continue
      f(1,1,i) = 0.5*(f(1,1,i) + f(1,nx+1,i))
      f(1,nx+1,i) = sum1
      f(2,1,i) = 0.0
      f(2,nxh+1,i) = 2.0*f(2,nxh+1,i)
! bit-reverse array elements in x
      do 30 j = 1, nxh
      j1 = (mixup(j) - 1)/nrxb + 1
      if (j.lt.j1) then
         do 20 jj = 1, 2
         t2 = f(jj,2*j1-1,i)
         t3 = f(jj,2*j1,i)
         f(jj,2*j1-1,i) = f(jj,2*j-1,i)
         f(jj,2*j1,i) = f(jj,2*j,i)
         f(jj,2*j-1,i) = t2
         f(jj,2*j,i) = t3
   20    continue
      endif
   30 continue
! then transform in x
      do 70 m = 1, indx1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nxhh/ns
      kmr = 2*km*nrx
      do 60 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 50 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 40 jj = 1, 2
      t2 = real(t1)*f(jj,2*j2-1,i) - aimag(t1)*f(jj,2*j2,i)
      t3 = aimag(t1)*f(jj,2*j2-1,i) + real(t1)*f(jj,2*j2,i)
      f(jj,2*j2-1,i) = f(jj,2*j1-1,i) - t2
      f(jj,2*j2,i) = f(jj,2*j1,i) - t3
      f(jj,2*j1-1,i) = f(jj,2*j1-1,i) + t2
      f(jj,2*j1,i) = f(jj,2*j1,i) + t3
   40 continue
   50 continue
   60 continue
   70 continue
! unscramble coefficients and normalize
! inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nxh
         do 90 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 80 jj = 1, 2
         t4 = f(jj,nx3-2*j,i)
         t5 = -f(jj,nx3-2*j+1,i)
         t2 = f(jj,2*j-1,i) + t4
         t3 = f(jj,2*j,i) + t5
         t6 = f(jj,2*j-1,i) - t4
         t5 = f(jj,2*j,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,i) = ani*(t2 + t4)
         f(jj,2*j,i) = ani*(t3 + t5)
         f(jj,nx3-2*j,i) = ani*(t2 - t4)
         f(jj,nx3-2*j+1,i) = ani*(t5 - t3)
   80    continue
   90    continue
         do 100 jj = 1, 2
         f(jj,nxh+1,i) = 2.0*ani*f(jj,nxh+1,i)
         f(jj,nxh+2,i) = -2.0*ani*f(jj,nxh+2,i)
         t2 = 2.0*ani*(f(jj,1,i) + f(jj,2,i))
         f(jj,2,i) = 2.0*ani*(f(jj,1,i) - f(jj,2,i))
         f(jj,1,i) = t2
         f(jj,nx+1,i) = 2.0*ani*f(jj,nx+1,i)
  100    continue
! forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nxh
         do 120 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 110 jj = 1, 2
         t4 = f(jj,nx3-2*j,i)
         t5 = -f(jj,nx3-2*j+1,i)
         t2 = f(jj,2*j-1,i) + t4
         t3 = f(jj,2*j,i) + t5
         t6 = f(jj,2*j-1,i) - t4
         t5 = f(jj,2*j,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,i) = t2 + t4
         f(jj,2*j,i) = t3 + t5
         f(jj,nx3-2*j,i) = t2 - t4
         f(jj,nx3-2*j+1,i) = t5 - t3
  110    continue
  120    continue
         do 130 jj = 1, 2
         f(jj,nxh+1,i) = 2.0*f(jj,nxh+1,i)
         f(jj,nxh+2,i) = -2.0*f(jj,nxh+2,i)
         t2 = 2.0*(f(jj,1,i) + f(jj,2,i))
         f(jj,2,i) = 2.0*(f(jj,1,i) - f(jj,2,i))
         f(jj,1,i) = t2
         f(jj,nx+1,i) = 2.0*f(jj,nx+1,i)
  130    continue
      endif
! perform recursion for cosine-sine transform
      sum1 = f(1,nx+1,i)
      f(1,nx+1,i) = f(1,2,i)
      f(1,2,i) = sum1
      sum2 = 0.5*f(2,1,i)
      f(2,1,i) = 0.0
      f(2,2,i) = sum2
      do 140 j = 2, nxh
      sum1 = sum1 - f(1,2*j,i)
      f(1,2*j,i) = sum1
      sum2 = sum2 + f(2,2*j-1,i)
      f(2,2*j-1,i) = -f(2,2*j,i)
      f(2,2*j,i) = sum2
  140 continue
      f(2,nx+1,i) = 0.0
  150 continue
!$OMP END PARALLEL DO
      return
      end
!-----------------------------------------------------------------------
      subroutine PPFSCT2RM2X(f,isign,mixup,sctd,indx,indy,kstrt,kypi,   &
     &kypp,nxvh,kypd,nxhyd,nxyd)
! this subroutine performs the x part of 2 two dimensional fast real
! sine and cosine transforms and their inverses, for a subset of y,
! using real arithmetic, with OpenMP,
! for data which is distributed in blocks
! x component has a sine transform, y component a cosine transform
! algorithm is described in Numerical Recipies in Fortran, Second Ed.,
! by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
! [Cambridge Univ. Press, 1992], p. 508.
! for isign = (-1,1), input: all, output: f
! approximate flop count: N*(5*log2(N) + 18)/nvp
! where N = (nx/2)*ny
! indx/indy = exponent which determines length in x/y direction,
! where nx=2**indx, ny=2**indy
! if isign = -1, inverse sine-cosine transforms are performed
! f(1,n,k) = (1/nx*ny)*sum(f(1,j,k)*sin(pi*n*j/nx))
! f(2,n,k) = (1/nx*ny)*(.5*f(2,1,k) + ((-1)**n)*f(2,nx+1,k)
!              + sum(f(2,j,k)*cos(pi*n*j/nx)))
! if isign = 1, forward sine-cosine transforms are performed
! f(1,j,k) = sum(f(1,n,k)*sin(pi*n*j/nx))
! f(2,j,k) = 2*(.5*f(2,1,k) + ((-1)**j)*f(2,n+1,k)
!              + sum(f(2,n,k)*cos(pi*n*j/nx))
! mixup = array of bit reversed addresses
! sctd = sine/cosine table
! kstrt = starting data block number
! kypi = initial y index used
! kypp = number of y indices used
! nxvh = second dimension of f >= nx/2 + 1
! kypd = third dimension of f >= kyp + 1
! nxhyd = maximum of (nx/2,ny)
! nxyd = maximum of (nx,ny)
! written by viktor k. decyk, ucla
      implicit none
      integer isign, indx, indy, kstrt, kypi, kypp
      integer nxvh, kypd, nxhyd, nxyd, mixup
      real f
      complex sctd
      dimension f(2,2*nxvh,kypd)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer indx1, indx1y, nx, nxh, nxhh, nx3, ny, nxy, nxhy, ks
      integer i, j, k, m, km, kmr, nrx, j1, j2, ns, ns2, k1, k2, kyps
      integer nrxb, jj
      real at1, at2, at3, t2, t3, t4, t5, t6, ani
      complex t1
      double precision sum1, sum2
      indx1 = indx - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      nxh = nx/2
      nxhh = nx/4
      nx3 = nx + 3
      ny = 2**indy
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 1
      kyps = kypi + kypp - 1
      if (kstrt.gt.ny) return
      if (isign.eq.0) return
      ani = 0.5/(real(nx)*real(ny))
      nrxb = nxhy/nxh
      nrx = nxy/nxh
!$OMP PARALLEL DO
!$OMP& PRIVATE(i,j,k,m,jj,ns,ns2,km,kmr,k1,k2,j1,j2,at1,at2,at3,t2,t3,t4
!$OMP& ,t5,t6,t1,sum1,sum2)
      do 150 i = kypi, kyps
! create auxiliary array in x
      kmr = nxy/nx
      sum1 = 0.5*(f(2,1,i) - f(2,nx+1,i))
      do 10 j = 2, nxh
      j1 = 1 + kmr*(j - 1)
      at3 = -aimag(sctd(j1))
      at2 = f(1,nx+2-j,i)
      at1 = f(1,j,i) + at2
      at2 = f(1,j,i) - at2
      at1 = at3*at1
      at2 = 0.5*at2
      f(1,j,i) = at1 + at2
      f(1,nx+2-j,i) = at1 - at2
      at2 = f(2,nx+2-j,i)
      at1 = f(2,j,i) + at2
      at2 = f(2,j,i) - at2
      sum1 = sum1 + real(sctd(j1))*at2
      at2 = at3*at2
      at1 = 0.5*at1
      f(2,j,i) = at1 - at2
      f(2,nx+2-j,i) = at1 + at2
   10 continue
      f(1,1,i) = 0.0
      f(1,nxh+1,i) = 2.0*f(1,nxh+1,i)
      f(2,1,i) = 0.5*(f(2,1,i) + f(2,nx+1,i))
      f(2,nx+1,i) = sum1
! bit-reverse array elements in x
      do 30 j = 1, nxh
      j1 = (mixup(j) - 1)/nrxb + 1
      if (j.lt.j1) then
         do 20 jj = 1, 2
         t2 = f(jj,2*j1-1,i)
         t3 = f(jj,2*j1,i)
         f(jj,2*j1-1,i) = f(jj,2*j-1,i)
         f(jj,2*j1,i) = f(jj,2*j,i)
         f(jj,2*j-1,i) = t2
         f(jj,2*j,i) = t3
   20    continue
      endif
   30 continue
! then transform in x
      do 70 m = 1, indx1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nxhh/ns
      kmr = 2*km*nrx
      do 60 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 50 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 40 jj = 1, 2
      t2 = real(t1)*f(jj,2*j2-1,i) - aimag(t1)*f(jj,2*j2,i)
      t3 = aimag(t1)*f(jj,2*j2-1,i) + real(t1)*f(jj,2*j2,i)
      f(jj,2*j2-1,i) = f(jj,2*j1-1,i) - t2
      f(jj,2*j2,i) = f(jj,2*j1,i) - t3
      f(jj,2*j1-1,i) = f(jj,2*j1-1,i) + t2
      f(jj,2*j1,i) = f(jj,2*j1,i) + t3
   40 continue
   50 continue
   60 continue
   70 continue
! unscramble coefficients and normalize
! inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nxh
         do 90 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 80 jj = 1, 2
         t4 = f(jj,nx3-2*j,i)
         t5 = -f(jj,nx3-2*j+1,i)
         t2 = f(jj,2*j-1,i) + t4
         t3 = f(jj,2*j,i) + t5
         t6 = f(jj,2*j-1,i) - t4
         t5 = f(jj,2*j,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,i) = ani*(t2 + t4)
         f(jj,2*j,i) = ani*(t3 + t5)
         f(jj,nx3-2*j,i) = ani*(t2 - t4)
         f(jj,nx3-2*j+1,i) = ani*(t5 - t3)
   80    continue
   90    continue
         do 100 jj = 1, 2
         f(jj,nxh+1,i) = 2.0*ani*f(jj,nxh+1,i)
         f(jj,nxh+2,i) = -2.0*ani*f(jj,nxh+2,i)
         t2 = 2.0*ani*(f(jj,1,i) + f(jj,2,i))
         f(jj,2,i) = 2.0*ani*(f(jj,1,i) - f(jj,2,i))
         f(jj,1,i) = t2
         f(jj,nx+1,i) = 2.0*ani*f(jj,nx+1,i)
  100    continue
! forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nxh
         do 120 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 110 jj = 1, 2
         t4 = f(jj,nx3-2*j,i)
         t5 = -f(jj,nx3-2*j+1,i)
         t2 = f(jj,2*j-1,i) + t4
         t3 = f(jj,2*j,i) + t5
         t6 = f(jj,2*j-1,i) - t4
         t5 = f(jj,2*j,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,i) = t2 + t4
         f(jj,2*j,i) = t3 + t5
         f(jj,nx3-2*j,i) = t2 - t4
         f(jj,nx3-2*j+1,i) = t5 - t3
  110    continue
  120    continue
         do 130 jj = 1, 2
         f(jj,nxh+1,i) = 2.0*f(jj,nxh+1,i)
         f(jj,nxh+2,i) = -2.0*f(jj,nxh+2,i)
         t2 = 2.0*(f(jj,1,i) + f(jj,2,i))
         f(jj,2,i) = 2.0*(f(jj,1,i) - f(jj,2,i))
         f(jj,1,i) = t2
         f(jj,nx+1,i) = 2.0*f(jj,nx+1,i)
  130    continue
      endif
! perform recursion for cosine-sine transform
      sum1 = 0.5*f(1,1,i)
      f(1,1,i) = 0.0
      f(1,2,i) = sum1
      sum2 = f(2,nx+1,i)
      f(2,nx+1,i) = f(2,2,i)
      f(2,2,i) = sum2
      do 140 j = 2, nxh
      sum1 = sum1 + f(1,2*j-1,i)
      f(1,2*j-1,i) = -f(1,2*j,i)
      f(1,2*j,i) = sum1
      sum2 = sum2 - f(2,2*j,i)
      f(2,2*j,i) = sum2
  140 continue
      f(1,nx+1,i) = 0.0
  150 continue
!$OMP END PARALLEL DO
      return
      end
!-----------------------------------------------------------------------
      subroutine PPFSCT2RM2Y(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,   &
     &kxpp,nyv,kxpd,nxhyd,nxyd)
! this subroutine performs the y part of 2 two dimensional fast real
! sine and cosine transforms and their inverses, for a subset of x,
! using real arithmetic, with OpenMP,
! for data which is distributed in blocks
! x component has a sine transform, y component a cosine transform
! algorithm is described in Numerical Recipies in Fortran, Second Ed.,
! by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
! [Cambridge Univ. Press, 1992], p. 508.
! for isign = (-1,1), input: all, output: f
! approximate flop count: N*(5*log2(N) + 18)/nvp
! where N = (nx/2)*ny
! indx/indy = exponent which determines length in x/y direction,
! where nx=2**indx, ny=2**indy
! if isign = -1, inverse sine-cosine transform are performed
! g(1,m,n) = sum(g(1,k,n)*sin(pi*m*k/ny))
! g(2,m,n) = (.5*g(2,1,n) + ((-1)**m)*g(2,ny+1,n)
!              + sum(g(2,k,n)*cos(pi*m*k/ny))
! if isign = 1, a forward sine-cosine transforms are performed
! g(1,k,n) = sum(g(1,m,n)*sin(pi*m*k/ny))
! g(2,k,n) = 2*(.5*g(2,1,n) + ((-1)**m)*g(2,ny+1,n)
!              + sum(g(2,m,n)*cos(pi*m*k/ny))
! mixup = array of bit reversed addresses
! sctd = sine/cosine table
! kstrt = starting data block number
! kxpi = initial x index used
! kxpp = number of x indices used
! nyv = second dimension of g >= ny + 1
! kxpd = third dimension of g >= kxp + 1
! nxhyd = maximum of (nx/2,ny)
! nxyd = maximum of (nx,ny)
! written by viktor k. decyk, ucla
      implicit none
      integer isign, indx, indy, kstrt, kxpi, kxpp
      integer nyv, kxpd, nxhyd, nxyd, mixup
      real g
      complex sctd
      dimension g(2,nyv,kxpd)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer indx1, indy1, indx1y, nx, ny, nyh, nyhh, ny3, nxy, nxhy
      integer i, j, k, m, ks, km, kmr, nry, j1, j2, ns, ns2, k1, k2
      integer kxps, nryb, jj
      real at1, at2, at3, t2, t3, t4, t5, t6
      complex t1
      double precision sum1, sum2
      indx1 = indx - 1
      indy1 = indy - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      ny = 2**indy
      nyh = ny/2
      nyhh = ny/4
      ny3 = ny + 3
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 1
      kxps = kxpi + kxpp - 1
      if (kstrt.gt.nx) return
      if (isign.eq.0) return
      nryb = nxhy/nyh
      nry = nxy/nyh
!$OMP PARALLEL DO
!$OMP& PRIVATE(i,j,k,m,jj,ns,ns2,km,kmr,k1,k2,j1,j2,at1,at2,at3,t2,t3,t4
!$OMP& ,t5,t6,t1,sum1,sum2)
      do 150 i = kxpi, kxps
! create auxiliary array in y
      kmr = nxy/ny
      sum1 = 0.5*(g(2,1,i) - g(2,ny+1,i))
      do 10 k = 2, nyh
      k1 = 1 + kmr*(k - 1)
      at3 = -aimag(sctd(k1))
      at2 = g(1,ny+2-k,i)
      at1 = g(1,k,i) + at2
      at2 = g(1,k,i) - at2
      at1 = at3*at1
      at2 = 0.5*at2
      g(1,k,i) = at1 + at2
      g(1,ny+2-k,i) = at1 - at2
      at2 = g(2,ny+2-k,i)
      at1 = g(2,k,i) + at2
      at2 = g(2,k,i) - at2
      sum1 = sum1 + real(sctd(k1))*at2
      at2 = at3*at2
      at1 = 0.5*at1
      g(2,k,i) = at1 - at2
      g(2,ny+2-k,i) = at1 + at2
   10 continue
      g(1,1,i) = 0.0
      g(1,nyh+1,i) = 2.0*g(1,nyh+1,i)
      g(2,1,i) = 0.5*(g(2,1,i) + g(2,ny+1,i))
      g(2,ny+1,i) = sum1
! bit-reverse array elements in y
      do 30 k = 1, nyh
      k1 = (mixup(k) - 1)/nryb + 1
      if (k.lt.k1) then
         do 20 jj = 1, 2
         t2 = g(jj,2*k1-1,i)
         t3 = g(jj,2*k1,i)
         g(jj,2*k1-1,i) = g(jj,2*k-1,i)
         g(jj,2*k1,i) = g(jj,2*k,i)
         g(jj,2*k-1,i) = t2
         g(jj,2*k,i) = t3
   20    continue
      endif
   30 continue
! then transform in y
      do 70 m = 1, indy1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nyhh/ns
      kmr = 2*km*nry
      do 60 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 50 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 40 jj = 1, 2
      t2 = real(t1)*g(jj,2*j2-1,i) - aimag(t1)*g(jj,2*j2,i)
      t3 = aimag(t1)*g(jj,2*j2-1,i) + real(t1)*g(jj,2*j2,i)
      g(jj,2*j2-1,i) = g(jj,2*j1-1,i) - t2
      g(jj,2*j2,i) = g(jj,2*j1,i) - t3
      g(jj,2*j1-1,i) = g(jj,2*j1-1,i) + t2
      g(jj,2*j1,i) = g(jj,2*j1,i) + t3
   40 continue
   50 continue
   60 continue
   70 continue
! unscramble coefficients and normalize
! inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nyh
         do 90 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 80 jj = 1, 2
         t4 = g(jj,ny3-2*k,i)
         t5 = -g(jj,ny3-2*k+1,i)
         t2 = g(jj,2*k-1,i) + t4
         t3 = g(jj,2*k,i) + t5
         t6 = g(jj,2*k-1,i) - t4
         t5 = g(jj,2*k,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,i) = 0.5*(t2 + t4)
         g(jj,2*k,i) = 0.5*(t3 + t5)
         g(jj,ny3-2*k,i) = 0.5*(t2 - t4)
         g(jj,ny3-2*k+1,i) = 0.5*(t5 - t3)
   80    continue
   90    continue
         do 100 jj = 1, 2
         g(jj,nyh+1,i) = g(jj,nyh+1,i)
         g(jj,nyh+2,i) = -g(jj,nyh+2,i)
         t2 = g(jj,1,i) + g(jj,2,i)
         g(jj,2,i) = g(jj,1,i) - g(jj,2,i)
         g(jj,1,i) = t2
         g(jj,ny+1,i) = g(jj,ny+1,i)
  100    continue
! forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nyh
         do 120 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 110 jj = 1, 2
         t4 = g(jj,ny3-2*k,i)
         t5 = -g(jj,ny3-2*k+1,i)
         t2 = g(jj,2*k-1,i) + t4
         t3 = g(jj,2*k,i) + t5
         t6 = g(jj,2*k-1,i) - t4
         t5 = g(jj,2*k,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,i) = t2 + t4
         g(jj,2*k,i) = t3 + t5
         g(jj,ny3-2*k,i) = t2 - t4
         g(jj,ny3-2*k+1,i) = t5 - t3
  110    continue
  120    continue
         do 130 jj = 1, 2
         g(jj,nyh+1,i) = 2.0*g(jj,nyh+1,i)
         g(jj,nyh+2,i) = -2.0*g(jj,nyh+2,i)
         t2 = 2.0*(g(jj,1,i) + g(jj,2,i))
         g(jj,2,i) = 2.0*(g(jj,1,i) - g(jj,2,i))
         g(jj,1,i) = t2
         g(jj,ny+1,i) = 2.0*g(jj,ny+1,i)
  130    continue
      endif
! perform recursion for sine-cosine transform
      sum1 = 0.5*g(1,1,i)
      g(1,1,i) = 0.0
      g(1,2,i) = sum1
      sum2 = g(2,ny+1,i)
      g(2,ny+1,i) = g(2,2,i)
      g(2,2,i) = sum2
      do 140 k = 2, nyh
      sum1 = sum1 + g(1,2*k-1,i)
      g(1,2*k-1,i) = -g(1,2*k,i)
      g(1,2*k,i) = sum1
      sum2 = sum2 - g(2,2*k,i)
      g(2,2*k,i) = sum2
  140 continue
      g(1,ny+1,i) = 0.0
  150 continue
!$OMP END PARALLEL DO
      return
      end
!-----------------------------------------------------------------------
      subroutine PPFCST2RM2Y(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,   &
     &kxpp,nyv,kxpd,nxhyd,nxyd)
! this subroutine performs the y part of 2 two dimensional fast real
! sine and cosine transforms and their inverses, for a subset of x,
! using real arithmetic, with OpenMP,
! for data which is distributed in blocks
! x component has a cosine transform, y component a sine transform
! algorithm is described in Numerical Recipies in Fortran, Second Ed.,
! by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
! [Cambridge Univ. Press, 1992], p. 508.
! for isign = (-1,1), input: all, output: f
! approximate flop count: N*(5*log2(N) + 18)/nvp
! where N = (nx/2)*ny
! indx/indy = exponent which determines length in x/y direction,
! where nx=2**indx, ny=2**indy
! if isign = -1, inverse cosine-sine transform are performed
! g(1,m,n) = (.5*g(1,1,n) + ((-1)**m)*g(1,ny+1,n)
!              + sum(g(1,k,n)*cos(pi*m*k/ny))
! g(2,m,n) = sum(g(2,k,n)*sin(pi*m*k/ny))
! if isign = 1, a forward cosine-sine transforms are performed
! g(1,k,n) = 2*(.5*g(1,1,n) + ((-1)**m)*g(1,ny+1,n)
!              + sum(g(1,m,n)*cos(pi*m*k/ny))
! g(2,k,n) = sum(g(2,m,n)*sin(pi*m*k/ny))
! mixup = array of bit reversed addresses
! sctd = sine/cosine table
! kstrt = starting data block number
! kxpi = initial x index used
! kxpp = number of x indices used
! nyv = second dimension of g >= ny + 1
! kxpd = third dimension of g >= kxp + 1
! nxhyd = maximum of (nx/2,ny)
! nxyd = maximum of (nx,ny)
! written by viktor k. decyk, ucla
      implicit none
      integer isign, indx, indy, kstrt, kxpi, kxpp
      integer nyv, kxpd, nxhyd, nxyd, mixup
      real g
      complex sctd
      dimension g(2,nyv,kxpd)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer indx1, indy1, indx1y, nx, ny, nyh, nyhh, ny3, nxy, nxhy
      integer i, j, k, m, ks, km, kmr, nry, j1, j2, ns, ns2, k1, k2
      integer kxps, nryb, jj
      real at1, at2, at3, t2, t3, t4, t5, t6
      complex t1
      double precision sum1, sum2
      indx1 = indx - 1
      indy1 = indy - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      ny = 2**indy
      nyh = ny/2
      nyhh = ny/4
      ny3 = ny + 3
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 1
      kxps = kxpi + kxpp - 1
      if (kstrt.gt.nx) return
      if (isign.eq.0) return
      nryb = nxhy/nyh
      nry = nxy/nyh
!$OMP PARALLEL DO
!$OMP& PRIVATE(i,j,k,m,jj,ns,ns2,km,kmr,k1,k2,j1,j2,at1,at2,at3,t2,t3,t4
!$OMP& ,t5,t6,t1,sum1,sum2)
      do 150 i = kxpi, kxps
! create auxiliary array in y
      kmr = nxy/ny
      sum1 = 0.5*(g(1,1,i) - g(1,ny+1,i))
      do 10 k = 2, nyh
      k1 = 1 + kmr*(k - 1)
      at3 = -aimag(sctd(k1))
      at2 = g(1,ny+2-k,i)
      at1 = g(1,k,i) + at2
      at2 = g(1,k,i) - at2
      sum1 = sum1 + real(sctd(k1))*at2
      at2 = at3*at2
      at1 = 0.5*at1
      g(1,k,i) = at1 - at2
      g(1,ny+2-k,i) = at1 + at2
      at2 = g(2,ny+2-k,i)
      at1 = g(2,k,i) + at2
      at2 = g(2,k,i) - at2
      at1 = at3*at1
      at2 = 0.5*at2
      g(2,k,i) = at1 + at2
      g(2,ny+2-k,i) = at1 - at2
   10 continue
      g(1,1,i) = 0.5*(g(1,1,i) + g(1,ny+1,i))
      g(1,ny+1,i) = sum1
      g(2,1,i) = 0.0
      g(2,nyh+1,i) = 2.0*g(2,nyh+1,i)
! bit-reverse array elements in y
      do 30 k = 1, nyh
      k1 = (mixup(k) - 1)/nryb + 1
      if (k.lt.k1) then
         do 20 jj = 1, 2
         t2 = g(jj,2*k1-1,i)
         t3 = g(jj,2*k1,i)
         g(jj,2*k1-1,i) = g(jj,2*k-1,i)
         g(jj,2*k1,i) = g(jj,2*k,i)
         g(jj,2*k-1,i) = t2
         g(jj,2*k,i) = t3
   20    continue
      endif
   30 continue
! then transform in y
      do 70 m = 1, indy1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nyhh/ns
      kmr = 2*km*nry
      do 60 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 50 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 40 jj = 1, 2
      t2 = real(t1)*g(jj,2*j2-1,i) - aimag(t1)*g(jj,2*j2,i)
      t3 = aimag(t1)*g(jj,2*j2-1,i) + real(t1)*g(jj,2*j2,i)
      g(jj,2*j2-1,i) = g(jj,2*j1-1,i) - t2
      g(jj,2*j2,i) = g(jj,2*j1,i) - t3
      g(jj,2*j1-1,i) = g(jj,2*j1-1,i) + t2
      g(jj,2*j1,i) = g(jj,2*j1,i) + t3
   40 continue
   50 continue
   60 continue
   70 continue
! unscramble coefficients and normalize
! inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nyh
         do 90 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 80 jj = 1, 2
         t4 = g(jj,ny3-2*k,i)
         t5 = -g(jj,ny3-2*k+1,i)
         t2 = g(jj,2*k-1,i) + t4
         t3 = g(jj,2*k,i) + t5
         t6 = g(jj,2*k-1,i) - t4
         t5 = g(jj,2*k,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,i) = 0.5*(t2 + t4)
         g(jj,2*k,i) = 0.5*(t3 + t5)
         g(jj,ny3-2*k,i) = 0.5*(t2 - t4)
         g(jj,ny3-2*k+1,i) = 0.5*(t5 - t3)
   80    continue
   90    continue
         do 100 jj = 1, 2
         g(jj,nyh+1,i) = g(jj,nyh+1,i)
         g(jj,nyh+2,i) = -g(jj,nyh+2,i)
         t2 = g(jj,1,i) + g(jj,2,i)
         g(jj,2,i) = g(jj,1,i) - g(jj,2,i)
         g(jj,1,i) = t2
         g(jj,ny+1,i) = g(jj,ny+1,i)
  100    continue
! forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nyh
         do 120 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 110 jj = 1, 2
         t4 = g(jj,ny3-2*k,i)
         t5 = -g(jj,ny3-2*k+1,i)
         t2 = g(jj,2*k-1,i) + t4
         t3 = g(jj,2*k,i) + t5
         t6 = g(jj,2*k-1,i) - t4
         t5 = g(jj,2*k,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,i) = t2 + t4
         g(jj,2*k,i) = t3 + t5
         g(jj,ny3-2*k,i) = t2 - t4
         g(jj,ny3-2*k+1,i) = t5 - t3
  110    continue
  120    continue
         do 130 jj = 1, 2
         g(jj,nyh+1,i) = 2.0*g(jj,nyh+1,i)
         g(jj,nyh+2,i) = -2.0*g(jj,nyh+2,i)
         t2 = 2.0*(g(jj,1,i) + g(jj,2,i))
         g(jj,2,i) = 2.0*(g(jj,1,i) - g(jj,2,i))
         g(jj,1,i) = t2
         g(jj,ny+1,i) = 2.0*g(jj,ny+1,i)
  130    continue
      endif
! perform recursion for sine-cosine transform
      sum1 = g(1,ny+1,i)
      g(1,ny+1,i) = g(1,2,i)
      g(1,2,i) = sum1
      sum2 = 0.5*g(2,1,i)
      g(2,1,i) = 0.0
      g(2,2,i) = sum2
      do 140 k = 2, nyh
      sum1 = sum1 - g(1,2*k,i)
      g(1,2*k,i) = sum1
      sum2 = sum2 + g(2,2*k-1,i)
      g(2,2*k-1,i) = -g(2,2*k,i)
      g(2,2*k,i) = sum2
  140 continue
      g(2,ny+1,i) = 0.0
  150 continue
!$OMP END PARALLEL DO
      return
      end
!-----------------------------------------------------------------------
      subroutine WPPFCST2RM3(f,g,bs,br,isign,ntpose,mixup,sctd,ttp,indx,&
     &indy,kstrt,nvp,nxvh,nyv,kxp2,kyp,kypd,kxp2d,nxhyd,nxyd)
! wrapper function for 3 parallel real cosine/sine transforms
! for the electric field with dirichlet or magnetic field with neumann
! boundary conditions
! x component has a cosine/sine transform in x and y, respectively
! y/z component has a sine/cosine transform in x and y, respectively
      implicit none
      integer isign, ntpose, mixup, indx, indy, kstrt, nvp, nxvh, nyv
      integer kxp2, kyp, kypd, kxp2d, nxhyd, nxyd
      real f, g, bs, br, ttp
      complex sctd
      dimension f(3,2*nxvh,kypd), g(3,nyv,kxp2d)
      dimension bs(3,kxp2+1,kyp+1), br(3,kxp2+1,kyp+1)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer nx, ny, kxpi, kypi, ks, kxp2p, kypp, kxb2, kyb
      real tf
      double precision dtime
      data kxpi, kypi /1,1/
! calculate range of indices
      nx = 2**indx
      ny = 2**indy
! ks = processor id
      ks = kstrt - 1
! kxp2p = actual size used in x direction
      kxp2p = min(kxp2,max(0,nx-kxp2*ks))
! kypp = actual size used in y direction
      kypp = min(kyp,max(0,ny-kyp*ks))
! kxb2 = minimum number of processors needed in x direction
      kxb2 = (nx - 1)/kxp2 + 1
! kyb = minimum number of processors needed in y direction
      kyb = (ny - 1)/kyp + 1
! add extra word for last processor in x
      if (ks==(kxb2-1)) kxp2p = kxp2p + 1
! add extra word for last processor in y
      if (ks==(kyb-1)) kypp = kypp + 1
! inverse fourier transform
      if (isign.lt.0) then
! perform x cosine-sine-sine transform
         call PPFCSST2RM3X(f,isign,mixup,sctd,indx,indy,kstrt,kypi,kypp,&
     &nxvh,kypd,nxhyd,nxyd)
! transpose f array to g
         call PWTIMERA(-1,ttp,dtime)
         call PPRNTPOSE(f,g,bs,br,nx,ny,kxp2,kyp,kstrt,nvp,3,2*nxvh,nyv,&
     &kxp2d,kypd)
         call PWTIMERA(1,ttp,dtime)
! perform y sine-cosine-sine transform
         call PPFSCST2RM3Y(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,kxp2p&
     &,nyv,kxp2d,nxhyd,nxyd)
! transpose g array to f
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PPRNTPOSE(g,f,br,bs,ny,nx,kyp,kxp2,kstrt,nvp,3,nyv,    &
     &2*nxvh,kypd,kxp2d)
            call PWTIMERA(1,tf,dtime)
         endif
! forward fourier transform
      else if (isign.gt.0) then
! transpose f array to g
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PPRNTPOSE(f,g,bs,br,nx,ny,kxp2,kyp,kstrt,nvp,3,2*nxvh, &
     &nyv,kxp2d,kypd)
            call PWTIMERA(1,tf,dtime)
         endif
! perform y sine-cosine-sine transform
         call PPFSCST2RM3Y(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,kxp2p&
     &,nyv,kxp2d,nxhyd,nxyd)
! transpose g array to f
         call PWTIMERA(-1,ttp,dtime)
         call PPRNTPOSE(g,f,br,bs,ny,nx,kyp,kxp2,kstrt,nvp,3,nyv,2*nxvh,&
     &kypd,kxp2d)
         call PWTIMERA(1,ttp,dtime)
! perform x cosine-sine-sine transform
         call PPFCSST2RM3X(f,isign,mixup,sctd,indx,indy,kstrt,kypi,kypp,&
     &nxvh,kypd,nxhyd,nxyd)
      endif
      if (ntpose.eq.0) ttp = ttp + tf
      return
      end
!-----------------------------------------------------------------------
      subroutine WPPFSCT2RM3(f,g,bs,br,isign,ntpose,mixup,sctd,ttp,indx,&
     &indy,kstrt,nvp,nxvh,nyv,kxp2,kyp,kypd,kxp2d,nxhyd,nxyd)
! wrapper function for 3 parallel real sine/cosine transforms
! for the magnetic field with dirichlet or electric field with neumann
! boundary conditions
! x component has a sine/cosine transform in x and y, respectively
! y component has a cosine/sine transform in x and y, respectively
! z component has a cosine/cosine transform in x and y, respectively
      implicit none
      integer isign, ntpose, mixup, indx, indy, kstrt, nvp, nxvh, nyv
      integer kxp2, kyp, kypd, kxp2d, nxhyd, nxyd
      real f, g, bs, br, ttp
      complex sctd
      dimension f(3,2*nxvh,kypd), g(3,nyv,kxp2d)
      dimension bs(3,kxp2+1,kyp+1), br(3,kxp2+1,kyp+1)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer nx, ny, kxpi, kypi, ks, kxp2p, kypp, kxb2, kyb
      real tf
      double precision dtime
      data kxpi, kypi /1,1/
! calculate range of indices
      nx = 2**indx
      ny = 2**indy
! ks = processor id
      ks = kstrt - 1
! kxp2p = actual size used in x direction
      kxp2p = min(kxp2,max(0,nx-kxp2*ks))
! kypp = actual size used in y direction
      kypp = min(kyp,max(0,ny-kyp*ks))
! kxb2 = minimum number of processors needed in x direction
      kxb2 = (nx - 1)/kxp2 + 1
! kyb = minimum number of processors needed in y direction
      kyb = (ny - 1)/kyp + 1
! add extra word for last processor in x
      if (ks==(kxb2-1)) kxp2p = kxp2p + 1
! add extra word for last processor in y
      if (ks==(kyb-1)) kypp = kypp + 1
! inverse fourier transform
      if (isign.lt.0) then
! perform x sine-cosine-cosine transform
         call PPFSCCT2RM3X(f,isign,mixup,sctd,indx,indy,kstrt,kypi,kypp,&
     &nxvh,kypd,nxhyd,nxyd)
! transpose f array to g
         call PWTIMERA(-1,ttp,dtime)
         call PPRNTPOSE(f,g,bs,br,nx,ny,kxp2,kyp,kstrt,nvp,3,2*nxvh,nyv,&
     &kxp2d,kypd)
         call PWTIMERA(1,ttp,dtime)
! perform y cosine-sine-cosine transform
         call PPFCSCT2RM3Y(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,kxp2p&
     &,nyv,kxp2d,nxhyd,nxyd)
! transpose g array to f
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PPRNTPOSE(g,f,br,bs,ny,nx,kyp,kxp2,kstrt,nvp,3,nyv,    &
     &2*nxvh,kypd,kxp2d)
            call PWTIMERA(1,tf,dtime)
         endif
! forward fourier transform
      else if (isign.gt.0) then
! transpose f array to g
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PPRNTPOSE(f,g,bs,br,nx,ny,kxp2,kyp,kstrt,nvp,3,2*nxvh, &
     &nyv,kxp2d,kypd)
            call PWTIMERA(1,tf,dtime)
         endif
! perform y cosine-sine-cosine transform
         call PPFCSCT2RM3Y(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,kxp2p&
     &,nyv,kxp2d,nxhyd,nxyd)
! transpose g array to f
         call PWTIMERA(-1,ttp,dtime)
         call PPRNTPOSE(g,f,br,bs,ny,nx,kyp,kxp2,kstrt,nvp,3,nyv,2*nxvh,&
     &kypd,kxp2d)
         call PWTIMERA(1,ttp,dtime)
! perform x sine-cosine-cosine transform
         call PPFSCCT2RM3X(f,isign,mixup,sctd,indx,indy,kstrt,kypi,kypp,&
     &nxvh,kypd,nxhyd,nxyd)
      endif
      if (ntpose.eq.0) ttp = ttp + tf
      return
      end
!-----------------------------------------------------------------------
      subroutine PPFCSST2RM3X(f,isign,mixup,sctd,indx,indy,kstrt,kypi,  &
     &kypp,nxvh,kypd,nxhyd,nxyd)
! this subroutine performs the x part of 3 two dimensional fast real
! sine and cosine transforms and their inverses, for a subset of y,
! using real arithmetic, with OpenMP,
! for data which is distributed in blocks
! x component has a cosine transform, y/z component a sine transform
! algorithm is described in Numerical Recipies in Fortran, Second Ed.,
! by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
! [Cambridge Univ. Press, 1992], p. 508.
! for isign = (-1,1), input: all, output: f
! approximate flop count: N*(5*log2(N) + 18)/nvp
! where N = (nx/2)*ny
! indx/indy = exponent which determines length in x/y direction,
! where nx=2**indx, ny=2**indy
! if isign = -1, inverse cosine-sine-sine transforms are performed
! f(1,n,k) = (1/nx*ny)*(.5*f(1,1,k) + ((-1)**n)*f(1,nx+1,k)
!              + sum(f(1,j,k)*cos(pi*n*j/nx)))
! f(2:3,n,k) = (1/nx*ny)*sum(f(2:3,j,k)*sin(pi*n*j/nx))
! if isign = 1, forward cosine-sine-sine transforms are performed
! f(1,j,k) = 2*(.5*f(1,1,k) + ((-1)**j)*f(1,n+1,k)
!              + sum(f(1,n,k)*cos(pi*n*j/nx))
! f(2:3,j,k) = sum(f(2:3,n,k)*sin(pi*n*j/nx))
! mixup = array of bit reversed addresses
! sctd = sine/cosine table
! kstrt = starting data block number
! kypi = initial y index used
! kypp = number of y indices used
! nxvh = first dimension of f >= nx/2 + 1
! kypd = second dimension of f >= kyp + 1
! nxhyd = maximum of (nx/2,ny)
! nxyd = maximum of (nx,ny)
! written by viktor k. decyk, ucla
      implicit none
      integer isign, indx, indy, kstrt, kypi, kypp
      integer nxvh, kypd, nxhyd, nxyd, mixup
      real f
      complex sctd
      dimension f(3,2*nxvh,kypd)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer indx1, indx1y, nx, nxh, nxhh, nx3, ny, nxy, nxhy, ks
      integer i, j, k, m, km, kmr, nrx, j1, j2, ns, ns2, k1, k2, kyps
      integer nrxb, jj
      real at1, at2, at3, t2, t3, t4, t5, t6, ani
      complex t1
      double precision sum1, sum2, sum3
      indx1 = indx - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      nxh = nx/2
      nxhh = nx/4
      nx3 = nx + 3
      ny = 2**indy
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 1
      kyps = kypi + kypp - 1
      if (kstrt.gt.ny) return
      if (isign.eq.0) return
      ani = 0.5/(real(nx)*real(ny))
      nrxb = nxhy/nxh
      nrx = nxy/nxh
!$OMP PARALLEL DO
!$OMP& PRIVATE(i,j,k,m,jj,ns,ns2,km,kmr,k1,k2,j1,j2,at1,at2,at3,t2,t3,t4
!$OMP& ,t5,t6,t1,sum1,sum2,sum3)
      do 150 i = kypi, kyps
! create auxiliary array in x
      kmr = nxy/nx
      sum1 = 0.5*(f(1,1,i) - f(1,nx+1,i))
      do 10 j = 2, nxh
      j1 = 1 + kmr*(j - 1)
      at3 = -aimag(sctd(j1))
      at2 = f(1,nx+2-j,i)
      at1 = f(1,j,i) + at2
      at2 = f(1,j,i) - at2
      sum1 = sum1 + real(sctd(j1))*at2
      at2 = at3*at2
      at1 = 0.5*at1
      f(1,j,i) = at1 - at2
      f(1,nx+2-j,i) = at1 + at2
      at2 = f(2,nx+2-j,i)
      at1 = f(2,j,i) + at2
      at2 = f(2,j,i) - at2
      at1 = at3*at1
      at2 = 0.5*at2
      f(2,j,i) = at1 + at2
      f(2,nx+2-j,i) = at1 - at2
      at2 = f(3,nx+2-j,i)
      at1 = f(3,j,i) + at2
      at2 = f(3,j,i) - at2
      at1 = at3*at1
      at2 = 0.5*at2
      f(3,j,i) = at1 + at2
      f(3,nx+2-j,i) = at1 - at2
   10 continue
      f(1,1,i) = 0.5*(f(1,1,i) + f(1,nx+1,i))
      f(1,nx+1,i) = sum1
      f(2,1,i) = 0.0
      f(2,nxh+1,i) = 2.0*f(2,nxh+1,i)
      f(3,1,i) = 0.0
      f(3,nxh+1,i) = 2.0*f(3,nxh+1,i)
! bit-reverse array elements in x
      do 30 j = 1, nxh
      j1 = (mixup(j) - 1)/nrxb + 1
      if (j.lt.j1) then
         do 20 jj = 1, 3
         t2 = f(jj,2*j1-1,i)
         t3 = f(jj,2*j1,i)
         f(jj,2*j1-1,i) = f(jj,2*j-1,i)
         f(jj,2*j1,i) = f(jj,2*j,i)
         f(jj,2*j-1,i) = t2
         f(jj,2*j,i) = t3
   20    continue
      endif
   30 continue
! then transform in x
      do 70 m = 1, indx1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nxhh/ns
      kmr = 2*km*nrx
      do 60 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 50 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 40 jj = 1, 3
      t2 = real(t1)*f(jj,2*j2-1,i) - aimag(t1)*f(jj,2*j2,i)
      t3 = aimag(t1)*f(jj,2*j2-1,i) + real(t1)*f(jj,2*j2,i)
      f(jj,2*j2-1,i) = f(jj,2*j1-1,i) - t2
      f(jj,2*j2,i) = f(jj,2*j1,i) - t3
      f(jj,2*j1-1,i) = f(jj,2*j1-1,i) + t2
      f(jj,2*j1,i) = f(jj,2*j1,i) + t3
   40 continue
   50 continue
   60 continue
   70 continue
! unscramble coefficients and normalize
! inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nxh
         do 90 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 80 jj = 1, 3
         t4 = f(jj,nx3-2*j,i)
         t5 = -f(jj,nx3-2*j+1,i)
         t2 = f(jj,2*j-1,i) + t4
         t3 = f(jj,2*j,i) + t5
         t6 = f(jj,2*j-1,i) - t4
         t5 = f(jj,2*j,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,i) = ani*(t2 + t4)
         f(jj,2*j,i) = ani*(t3 + t5)
         f(jj,nx3-2*j,i) = ani*(t2 - t4)
         f(jj,nx3-2*j+1,i) = ani*(t5 - t3)
   80    continue
   90    continue
         do 100 jj = 1, 3
         f(jj,nxh+1,i) = 2.0*ani*f(jj,nxh+1,i)
         f(jj,nxh+2,i) = -2.0*ani*f(jj,nxh+2,i)
         t2 = 2.0*ani*(f(jj,1,i) + f(jj,2,i))
         f(jj,2,i) = 2.0*ani*(f(jj,1,i) - f(jj,2,i))
         f(jj,1,i) = t2
         f(jj,nx+1,i) = 2.0*ani*f(jj,nx+1,i)
  100    continue
! forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nxh
         do 120 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 110 jj = 1, 3
         t4 = f(jj,nx3-2*j,i)
         t5 = -f(jj,nx3-2*j+1,i)
         t2 = f(jj,2*j-1,i) + t4
         t3 = f(jj,2*j,i) + t5
         t6 = f(jj,2*j-1,i) - t4
         t5 = f(jj,2*j,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,i) = t2 + t4
         f(jj,2*j,i) = t3 + t5
         f(jj,nx3-2*j,i) = t2 - t4
         f(jj,nx3-2*j+1,i) = t5 - t3
  110    continue
  120    continue
         do 130 jj = 1, 3
         f(jj,nxh+1,i) = 2.0*f(jj,nxh+1,i)
         f(jj,nxh+2,i) = -2.0*f(jj,nxh+2,i)
         t2 = 2.0*(f(jj,1,i) + f(jj,2,i))
         f(jj,2,i) = 2.0*(f(jj,1,i) - f(jj,2,i))
         f(jj,1,i) = t2
         f(jj,nx+1,i) = 2.0*f(jj,nx+1,i)
  130    continue
      endif
! perform recursion for cosine-sine transform
      sum1 = f(1,nx+1,i)
      f(1,nx+1,i) = f(1,2,i)
      f(1,2,i) = sum1
      sum2 = 0.5*f(2,1,i)
      f(2,1,i) = 0.0
      f(2,2,i) = sum2
      sum3 = 0.5*f(3,1,i)
      f(3,1,i) = 0.0
      f(3,2,i) = sum3
      do 140 j = 2, nxh
      sum1 = sum1 - f(1,2*j,i)
      f(1,2*j,i) = sum1
      sum2 = sum2 + f(2,2*j-1,i)
      f(2,2*j-1,i) = -f(2,2*j,i)
      f(2,2*j,i) = sum2
      sum3 = sum3 + f(3,2*j-1,i)
      f(3,2*j-1,i) = -f(3,2*j,i)
      f(3,2*j,i) = sum3
  140 continue
      f(2,nx+1,i) = 0.0
      f(3,nx+1,i) = 0.0
  150 continue
!$OMP END PARALLEL DO
      return
      end
!-----------------------------------------------------------------------
      subroutine PPFSCCT2RM3X(f,isign,mixup,sctd,indx,indy,kstrt,kypi,  &
     &kypp,nxvh,kypd,nxhyd,nxyd)
! this subroutine performs the x part of 3 two dimensional fast real
! sine and cosine transforms and their inverses, for a subset of y,
! using real arithmetic, with OpenMP,
! for data which is distributed in blocks
! x component has a sine transform, y/z component a cosine transform
! algorithm is described in Numerical Recipies in Fortran, Second Ed.,
! by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
! [Cambridge Univ. Press, 1992], p. 508.
! for isign = (-1,1), input: all, output: f
! approximate flop count: N*(5*log2(N) + 18)/nvp
! where N = (nx/2)*ny
! indx/indy = exponent which determines length in x/y direction,
! where nx=2**indx, ny=2**indy
! if isign = -1, inverse sine-cosine-cosine transforms are performed
! f(1,n,k) = (1/nx*ny)*sum(f(1,j,k)*sin(pi*n*j/nx))
! f(2:3,n,k) = (1/nx*ny)*(.5*f(2:3,1,k) + ((-1)**n)*f(2:3,nx+1,k)
!              + sum(f(2:3,j,k)*cos(pi*n*j/nx)))
! if isign = 1, forward sine-cosine-cosine transforms are performed
! f(1,j,k) = sum(f(1,n,k)*sin(pi*n*j/nx))
! f(2:3,j,k) = 2*(.5*f(2:3,1,k) + ((-1)**j)*f(2:3,n+1,k)
!              + sum(f(2:3,n,k)*cos(pi*n*j/nx))
! mixup = array of bit reversed addresses
! sctd = sine/cosine table
! kstrt = starting data block number
! kypi = initial y index used
! kypp = number of y indices used
! nxvh = first dimension of f >= nx/2 + 1
! kypd = second dimension of f >= kyp + 1
! nxhyd = maximum of (nx/2,ny)
! nxyd = maximum of (nx,ny)
! written by viktor k. decyk, ucla
      implicit none
      integer isign, indx, indy, kstrt, kypi, kypp
      integer nxvh, kypd, nxhyd, nxyd, mixup
      real f
      complex sctd
      dimension f(3,2*nxvh,kypd)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer indx1, indx1y, nx, nxh, nxhh, nx3, ny, nxy, nxhy, ks
      integer i, j, k, m, km, kmr, nrx, j1, j2, ns, ns2, k1, k2, kyps
      integer nrxb, jj
      real at1, at2, at3, t2, t3, t4, t5, t6, ani
      complex t1
      double precision sum1, sum2, sum3
      indx1 = indx - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      nxh = nx/2
      nxhh = nx/4
      nx3 = nx + 3
      ny = 2**indy
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 1
      kyps = kypi + kypp - 1
      if (kstrt.gt.ny) return
      if (isign.eq.0) return
      ani = 0.5/(real(nx)*real(ny))
      nrxb = nxhy/nxh
      nrx = nxy/nxh
!$OMP PARALLEL DO
!$OMP& PRIVATE(i,j,k,m,jj,ns,ns2,km,kmr,k1,k2,j1,j2,at1,at2,at3,t2,t3,t4
!$OMP& ,t5,t6,t1,sum1,sum2,sum3)
      do 150 i = kypi, kyps
! create auxiliary array in x
      kmr = nxy/nx
      sum1 = 0.5*(f(2,1,i) - f(2,nx+1,i))
      sum2 = 0.5*(f(3,1,i) - f(3,nx+1,i))
      do 10 j = 2, nxh
      j1 = 1 + kmr*(j - 1)
      at3 = -aimag(sctd(j1))
      at2 = f(1,nx+2-j,i)
      at1 = f(1,j,i) + at2
      at2 = f(1,j,i) - at2
      at1 = at3*at1
      at2 = 0.5*at2
      f(1,j,i) = at1 + at2
      f(1,nx+2-j,i) = at1 - at2
      at2 = f(2,nx+2-j,i)
      at1 = f(2,j,i) + at2
      at2 = f(2,j,i) - at2
      sum1 = sum1 + real(sctd(j1))*at2
      at2 = at3*at2
      at1 = 0.5*at1
      f(2,j,i) = at1 - at2
      f(2,nx+2-j,i) = at1 + at2
      at2 = f(3,nx+2-j,i)
      at1 = f(3,j,i) + at2
      at2 = f(3,j,i) - at2
      sum2 = sum2 + real(sctd(j1))*at2
      at2 = at3*at2
      at1 = 0.5*at1
      f(3,j,i) = at1 - at2
      f(3,nx+2-j,i) = at1 + at2
   10 continue
      f(1,1,i) = 0.0
      f(1,nxh+1,i) = 2.0*f(1,nxh+1,i)
      f(2,1,i) = 0.5*(f(2,1,i) + f(2,nx+1,i))
      f(2,nx+1,i) = sum1
      f(3,1,i) = 0.5*(f(3,1,i) + f(3,nx+1,i))
      f(3,nx+1,i) = sum2
! bit-reverse array elements in x
      do 30 j = 1, nxh
      j1 = (mixup(j) - 1)/nrxb + 1
      if (j.lt.j1) then
         do 20 jj = 1, 3
         t2 = f(jj,2*j1-1,i)
         t3 = f(jj,2*j1,i)
         f(jj,2*j1-1,i) = f(jj,2*j-1,i)
         f(jj,2*j1,i) = f(jj,2*j,i)
         f(jj,2*j-1,i) = t2
         f(jj,2*j,i) = t3
   20    continue
      endif
   30 continue
! then transform in x
      do 70 m = 1, indx1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nxhh/ns
      kmr = 2*km*nrx
      do 60 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 50 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 40 jj = 1, 3
      t2 = real(t1)*f(jj,2*j2-1,i) - aimag(t1)*f(jj,2*j2,i)
      t3 = aimag(t1)*f(jj,2*j2-1,i) + real(t1)*f(jj,2*j2,i)
      f(jj,2*j2-1,i) = f(jj,2*j1-1,i) - t2
      f(jj,2*j2,i) = f(jj,2*j1,i) - t3
      f(jj,2*j1-1,i) = f(jj,2*j1-1,i) + t2
      f(jj,2*j1,i) = f(jj,2*j1,i) + t3
   40 continue
   50 continue
   60 continue
   70 continue
! unscramble coefficients and normalize
! inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nxh
         do 90 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 80 jj = 1, 3
         t4 = f(jj,nx3-2*j,i)
         t5 = -f(jj,nx3-2*j+1,i)
         t2 = f(jj,2*j-1,i) + t4
         t3 = f(jj,2*j,i) + t5
         t6 = f(jj,2*j-1,i) - t4
         t5 = f(jj,2*j,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,i) = ani*(t2 + t4)
         f(jj,2*j,i) = ani*(t3 + t5)
         f(jj,nx3-2*j,i) = ani*(t2 - t4)
         f(jj,nx3-2*j+1,i) = ani*(t5 - t3)
   80    continue
   90    continue
         do 100 jj = 1, 3
         f(jj,nxh+1,i) = 2.0*ani*f(jj,nxh+1,i)
         f(jj,nxh+2,i) = -2.0*ani*f(jj,nxh+2,i)
         t2 = 2.0*ani*(f(jj,1,i) + f(jj,2,i))
         f(jj,2,i) = 2.0*ani*(f(jj,1,i) - f(jj,2,i))
         f(jj,1,i) = t2
         f(jj,nx+1,i) = 2.0*ani*f(jj,nx+1,i)
  100    continue
! forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nxh
         do 120 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 110 jj = 1, 3
         t4 = f(jj,nx3-2*j,i)
         t5 = -f(jj,nx3-2*j+1,i)
         t2 = f(jj,2*j-1,i) + t4
         t3 = f(jj,2*j,i) + t5
         t6 = f(jj,2*j-1,i) - t4
         t5 = f(jj,2*j,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,i) = t2 + t4
         f(jj,2*j,i) = t3 + t5
         f(jj,nx3-2*j,i) = t2 - t4
         f(jj,nx3-2*j+1,i) = t5 - t3
  110    continue
  120    continue
         do 130 jj = 1, 3
         f(jj,nxh+1,i) = 2.0*f(jj,nxh+1,i)
         f(jj,nxh+2,i) = -2.0*f(jj,nxh+2,i)
         t2 = 2.0*(f(jj,1,i) + f(jj,2,i))
         f(jj,2,i) = 2.0*(f(jj,1,i) - f(jj,2,i))
         f(jj,1,i) = t2
         f(jj,nx+1,i) = 2.0*f(jj,nx+1,i)
  130    continue
      endif
! perform recursion for cosine-sine transform
      sum1 = 0.5*f(1,1,i)
      f(1,1,i) = 0.0
      f(1,2,i) = sum1
      sum2 = f(2,nx+1,i)
      f(2,nx+1,i) = f(2,2,i)
      f(2,2,i) = sum2
      sum3 = f(3,nx+1,i)
      f(3,nx+1,i) = f(3,2,i)
      f(3,2,i) = sum3
      do 140 j = 2, nxh
      sum1 = sum1 + f(1,2*j-1,i)
      f(1,2*j-1,i) = -f(1,2*j,i)
      f(1,2*j,i) = sum1
      sum2 = sum2 - f(2,2*j,i)
      f(2,2*j,i) = sum2
      sum3 = sum3 - f(3,2*j,i)
      f(3,2*j,i) = sum3
  140 continue
      f(1,nx+1,i) = 0.0
  150 continue
!$OMP END PARALLEL DO
      return
      end
!-----------------------------------------------------------------------
      subroutine PPFSCST2RM3Y(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,  &
     &kxpp,nyv,kxpd,nxhyd,nxyd)
! this subroutine performs the y part of 3 two dimensional fast real
! sine and cosine transforms and their inverses, for a subset of x,
! using real arithmetic, with OpenMP,
! for data which is distributed in blocks
! x/z component has a sine transform, y component a cosine transform
! algorithm is described in Numerical Recipies in Fortran, Second Ed.,
! by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
! [Cambridge Univ. Press, 1992], p. 508.
! for isign = (-1,1), input: all, output: f
! approximate flop count: N*(5*log2(N) + 18)/nvp
! where N = (nx/2)*ny
! indx/indy = exponent which determines length in x/y direction,
! where nx=2**indx, ny=2**indy
! if isign = -1, inverse sine-cosine-sine transform are performed
! g(1,m,n) = sum(g(1,k,n)*sin(pi*m*k/ny))
! g(2,m,n) = (.5*g(2,1,n) + ((-1)**m)*g(2,ny+1,n)
!              + sum(g(2,k,n)*cos(pi*m*k/ny))
! g(3,m,n) = sum(g(3,k,n)*sin(pi*m*k/ny))
! if isign = 1, a forward sine-cosine-sine transforms are performed
! g(1,k,n) = sum(g(1,m,n)*sin(pi*m*k/ny))
! g(2,k,n) = 2*(.5*g(2,1,n) + ((-1)**m)*g(2,ny+1,n)
!              + sum(g(2,m,n)*cos(pi*m*k/ny))
! g(3,k,n) = sum(g(3,m,n)*sin(pi*m*k/ny))
! mixup = array of bit reversed addresses
! sctd = sine/cosine table
! kstrt = starting data block number
! kxpi = initial x index used
! kxpp = number of x indices used
! nyv = first dimension of g >= ny + 1
! kxpd = second dimension of g >= kxp + 1
! nxhyd = maximum of (nx/2,ny)
! nxyd = maximum of (nx,ny)
! written by viktor k. decyk, ucla
      implicit none
      integer isign, indx, indy, kstrt, kxpi, kxpp
      integer nyv, kxpd, nxhyd, nxyd, mixup
      real g
      complex sctd
      dimension g(3,nyv,kxpd)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer indx1, indy1, indx1y, nx, ny, nyh, nyhh, ny3, nxy, nxhy
      integer i, j, k, m, ks, km, kmr, nry, j1, j2, ns, ns2, k1, k2
      integer kxps, nryb, jj
      real at1, at2, at3, t2, t3, t4, t5, t6
      complex t1
      double precision sum1, sum2, sum3
      indx1 = indx - 1
      indy1 = indy - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      ny = 2**indy
      nyh = ny/2
      nyhh = ny/4
      ny3 = ny + 3
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 1
      kxps = kxpi + kxpp - 1
      if (kstrt.gt.nx) return
      if (isign.eq.0) return
      nryb = nxhy/nyh
      nry = nxy/nyh
!$OMP PARALLEL DO
!$OMP& PRIVATE(i,j,k,m,jj,ns,ns2,km,kmr,k1,k2,j1,j2,at1,at2,at3,t2,t3,t4
!$OMP& ,t5,t6,t1,sum1,sum2,sum3)
      do 150 i = kxpi, kxps
! create auxiliary array in y
      kmr = nxy/ny
      sum1 = 0.5*(g(2,1,i) - g(2,ny+1,i))
      do 10 k = 2, nyh
      k1 = 1 + kmr*(k - 1)
      at3 = -aimag(sctd(k1))
      at2 = g(1,ny+2-k,i)
      at1 = g(1,k,i) + at2
      at2 = g(1,k,i) - at2
      at1 = at3*at1
      at2 = 0.5*at2
      g(1,k,i) = at1 + at2
      g(1,ny+2-k,i) = at1 - at2
      at2 = g(2,ny+2-k,i)
      at1 = g(2,k,i) + at2
      at2 = g(2,k,i) - at2
      sum1 = sum1 + real(sctd(k1))*at2
      at2 = at3*at2
      at1 = 0.5*at1
      g(2,k,i) = at1 - at2
      g(2,ny+2-k,i) = at1 + at2
      at2 = g(3,ny+2-k,i)
      at1 = g(3,k,i) + at2
      at2 = g(3,k,i) - at2
      at1 = at3*at1
      at2 = 0.5*at2
      g(3,k,i) = at1 + at2
      g(3,ny+2-k,i) = at1 - at2
   10 continue
      g(1,1,i) = 0.0
      g(1,nyh+1,i) = 2.0*g(1,nyh+1,i)
      g(2,1,i) = 0.5*(g(2,1,i) + g(2,ny+1,i))
      g(2,ny+1,i) = sum1
      g(3,1,i) = 0.0
      g(3,nyh+1,i) = 2.0*g(3,nyh+1,i)
! bit-reverse array elements in y
      do 30 k = 1, nyh
      k1 = (mixup(k) - 1)/nryb + 1
      if (k.lt.k1) then
         do 20 jj = 1, 3
         t2 = g(jj,2*k1-1,i)
         t3 = g(jj,2*k1,i)
         g(jj,2*k1-1,i) = g(jj,2*k-1,i)
         g(jj,2*k1,i) = g(jj,2*k,i)
         g(jj,2*k-1,i) = t2
         g(jj,2*k,i) = t3
   20    continue
      endif
   30 continue
! then transform in y
      do 70 m = 1, indy1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nyhh/ns
      kmr = 2*km*nry
      do 60 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 50 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 40 jj = 1, 3
      t2 = real(t1)*g(jj,2*j2-1,i) - aimag(t1)*g(jj,2*j2,i)
      t3 = aimag(t1)*g(jj,2*j2-1,i) + real(t1)*g(jj,2*j2,i)
      g(jj,2*j2-1,i) = g(jj,2*j1-1,i) - t2
      g(jj,2*j2,i) = g(jj,2*j1,i) - t3
      g(jj,2*j1-1,i) = g(jj,2*j1-1,i) + t2
      g(jj,2*j1,i) = g(jj,2*j1,i) + t3
   40 continue
   50 continue
   60 continue
   70 continue
! unscramble coefficients and normalize
! inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nyh
         do 90 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 80 jj = 1, 3
         t4 = g(jj,ny3-2*k,i)
         t5 = -g(jj,ny3-2*k+1,i)
         t2 = g(jj,2*k-1,i) + t4
         t3 = g(jj,2*k,i) + t5
         t6 = g(jj,2*k-1,i) - t4
         t5 = g(jj,2*k,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,i) = 0.5*(t2 + t4)
         g(jj,2*k,i) = 0.5*(t3 + t5)
         g(jj,ny3-2*k,i) = 0.5*(t2 - t4)
         g(jj,ny3-2*k+1,i) = 0.5*(t5 - t3)
   80    continue
   90    continue
         do 100 jj = 1, 3
         g(jj,nyh+1,i) = g(jj,nyh+1,i)
         g(jj,nyh+2,i) = -g(jj,nyh+2,i)
         t2 = g(jj,1,i) + g(jj,2,i)
         g(jj,2,i) = g(jj,1,i) - g(jj,2,i)
         g(jj,1,i) = t2
         g(jj,ny+1,i) = g(jj,ny+1,i)
  100    continue
! forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nyh
         do 120 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 110 jj = 1, 3
         t4 = g(jj,ny3-2*k,i)
         t5 = -g(jj,ny3-2*k+1,i)
         t2 = g(jj,2*k-1,i) + t4
         t3 = g(jj,2*k,i) + t5
         t6 = g(jj,2*k-1,i) - t4
         t5 = g(jj,2*k,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,i) = t2 + t4
         g(jj,2*k,i) = t3 + t5
         g(jj,ny3-2*k,i) = t2 - t4
         g(jj,ny3-2*k+1,i) = t5 - t3
  110    continue
  120    continue
         do 130 jj = 1, 3
         g(jj,nyh+1,i) = 2.0*g(jj,nyh+1,i)
         g(jj,nyh+2,i) = -2.0*g(jj,nyh+2,i)
         t2 = 2.0*(g(jj,1,i) + g(jj,2,i))
         g(jj,2,i) = 2.0*(g(jj,1,i) - g(jj,2,i))
         g(jj,1,i) = t2
         g(jj,ny+1,i) = 2.0*g(jj,ny+1,i)
  130    continue
      endif
! perform recursion for sine-cosine transform
      sum1 = 0.5*g(1,1,i)
      g(1,1,i) = 0.0
      g(1,2,i) = sum1
      sum2 = g(2,ny+1,i)
      g(2,ny+1,i) = g(2,2,i)
      g(2,2,i) = sum2
      sum3 = 0.5*g(3,1,i)
      g(3,1,i) = 0.0
      g(3,2,i) = sum3
      do 140 k = 2, nyh
      sum1 = sum1 + g(1,2*k-1,i)
      g(1,2*k-1,i) = -g(1,2*k,i)
      g(1,2*k,i) = sum1
      sum2 = sum2 - g(2,2*k,i)
      g(2,2*k,i) = sum2
      sum3 = sum3 + g(3,2*k-1,i)
      g(3,2*k-1,i) = -g(3,2*k,i)
      g(3,2*k,i) = sum3
  140 continue
      g(1,ny+1,i) = 0.0
      g(3,ny+1,i) = 0.0
  150 continue
!$OMP END PARALLEL DO
      return
      end
!-----------------------------------------------------------------------
      subroutine PPFCSCT2RM3Y(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,  &
     &kxpp,nyv,kxpd,nxhyd,nxyd)
! this subroutine performs the y part of 3 two dimensional fast real
! sine and cosine transforms and their inverses, for a subset of x,
! using real arithmetic, with OpenMP,
! for data which is distributed in blocks
! x/z component has a cosine transform, y component a sine transform
! algorithm is described in Numerical Recipies in Fortran, Second Ed.,
! by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
! [Cambridge Univ. Press, 1992], p. 508.
! for isign = (-1,1), input: all, output: f
! approximate flop count: N*(5*log2(N) + 18)/nvp
! where N = (nx/2)*ny
! indx/indy = exponent which determines length in x/y direction,
! where nx=2**indx, ny=2**indy
! if isign = -1, inverse cosine-sine-cosine transform are performed
! g(1,m,n) = (.5*g(1,1,n) + ((-1)**m)*g(1,ny+1,n)
!              + sum(g(1,k,n)*cos(pi*m*k/ny))
! g(2,m,n) = sum(g(2,k,n)*sin(pi*m*k/ny))
! g(3,m,n) = (.5*g(3,1,n) + ((-1)**m)*g(3,ny+1,n)
!              + sum(g(3,k,n)*cos(pi*m*k/ny))
! if isign = 1, a forward cosine-sine-cosine transforms are performed
! g(1,k,n) = 2*(.5*g(1,1,n) + ((-1)**m)*g(1,ny+1,n)
!              + sum(g(1,m,n)*cos(pi*m*k/ny))
! g(2,k,n) = sum(g(2,m,n)*sin(pi*m*k/ny))
! g(3,k,n) = 2*(.5*g(3,1,n) + ((-1)**m)*g(3,ny+1,n)
!              + sum(g(3,m,n)*cos(pi*m*k/ny))
! mixup = array of bit reversed addresses
! sctd = sine/cosine table
! kstrt = starting data block number
! kxpi = initial x index used
! kxpp = number of x indices used
! nyv = first dimension of g >= ny + 1
! kxpd = second dimension of g >= kxp + 1
! nxhyd = maximum of (nx/2,ny)
! nxyd = maximum of (nx,ny)
! written by viktor k. decyk, ucla
      implicit none
      integer isign, indx, indy, kstrt, kxpi, kxpp
      integer nyv, kxpd, nxhyd, nxyd, mixup
      real g
      complex sctd
      dimension g(3,nyv,kxpd)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer indx1, indy1, indx1y, nx, ny, nyh, nyhh, ny3, nxy, nxhy
      integer i, j, k, m, ks, km, kmr, nry, j1, j2, ns, ns2, k1, k2
      integer kxps, nryb, jj
      real at1, at2, at3, t2, t3, t4, t5, t6
      complex t1
      double precision sum1, sum2, sum3
      indx1 = indx - 1
      indy1 = indy - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      ny = 2**indy
      nyh = ny/2
      nyhh = ny/4
      ny3 = ny + 3
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 1
      kxps = kxpi + kxpp - 1
      if (kstrt.gt.nx) return
      if (isign.eq.0) return
      nryb = nxhy/nyh
      nry = nxy/nyh
!$OMP PARALLEL DO
!$OMP& PRIVATE(i,j,k,m,jj,ns,ns2,km,kmr,k1,k2,j1,j2,at1,at2,at3,t2,t3,t4
!$OMP& ,t5,t6,t1,sum1,sum2,sum3)
      do 150 i = kxpi, kxps
! create auxiliary array in y
      kmr = nxy/ny
      sum1 = 0.5*(g(1,1,i) - g(1,ny+1,i))
      sum2 = 0.5*(g(3,1,i) - g(3,ny+1,i))
      do 10 k = 2, nyh
      k1 = 1 + kmr*(k - 1)
      at3 = -aimag(sctd(k1))
      at2 = g(1,ny+2-k,i)
      at1 = g(1,k,i) + at2
      at2 = g(1,k,i) - at2
      sum1 = sum1 + real(sctd(k1))*at2
      at2 = at3*at2
      at1 = 0.5*at1
      g(1,k,i) = at1 - at2
      g(1,ny+2-k,i) = at1 + at2
      at2 = g(2,ny+2-k,i)
      at1 = g(2,k,i) + at2
      at2 = g(2,k,i) - at2
      at1 = at3*at1
      at2 = 0.5*at2
      g(2,k,i) = at1 + at2
      g(2,ny+2-k,i) = at1 - at2
      at2 = g(3,ny+2-k,i)
      at1 = g(3,k,i) + at2
      at2 = g(3,k,i) - at2
      sum2 = sum2 + real(sctd(k1))*at2
      at2 = at3*at2
      at1 = 0.5*at1
      g(3,k,i) = at1 - at2
      g(3,ny+2-k,i) = at1 + at2
   10 continue
      g(1,1,i) = 0.5*(g(1,1,i) + g(1,ny+1,i))
      g(1,ny+1,i) = sum1
      g(2,1,i) = 0.0
      g(2,nyh+1,i) = 2.0*g(2,nyh+1,i)
      g(3,1,i) = 0.5*(g(3,1,i) + g(3,ny+1,i))
      g(3,ny+1,i) = sum2
! bit-reverse array elements in y
      do 30 k = 1, nyh
      k1 = (mixup(k) - 1)/nryb + 1
      if (k.lt.k1) then
         do 20 jj = 1, 3
         t2 = g(jj,2*k1-1,i)
         t3 = g(jj,2*k1,i)
         g(jj,2*k1-1,i) = g(jj,2*k-1,i)
         g(jj,2*k1,i) = g(jj,2*k,i)
         g(jj,2*k-1,i) = t2
         g(jj,2*k,i) = t3
   20    continue
      endif
   30 continue
! then transform in y
      do 70 m = 1, indy1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nyhh/ns
      kmr = 2*km*nry
      do 60 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 50 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 40 jj = 1, 3
      t2 = real(t1)*g(jj,2*j2-1,i) - aimag(t1)*g(jj,2*j2,i)
      t3 = aimag(t1)*g(jj,2*j2-1,i) + real(t1)*g(jj,2*j2,i)
      g(jj,2*j2-1,i) = g(jj,2*j1-1,i) - t2
      g(jj,2*j2,i) = g(jj,2*j1,i) - t3
      g(jj,2*j1-1,i) = g(jj,2*j1-1,i) + t2
      g(jj,2*j1,i) = g(jj,2*j1,i) + t3
   40 continue
   50 continue
   60 continue
   70 continue
! unscramble coefficients and normalize
! inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nyh
         do 90 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 80 jj = 1, 3
         t4 = g(jj,ny3-2*k,i)
         t5 = -g(jj,ny3-2*k+1,i)
         t2 = g(jj,2*k-1,i) + t4
         t3 = g(jj,2*k,i) + t5
         t6 = g(jj,2*k-1,i) - t4
         t5 = g(jj,2*k,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,i) = 0.5*(t2 + t4)
         g(jj,2*k,i) = 0.5*(t3 + t5)
         g(jj,ny3-2*k,i) = 0.5*(t2 - t4)
         g(jj,ny3-2*k+1,i) = 0.5*(t5 - t3)
   80    continue
   90    continue
         do 100 jj = 1, 3
         g(jj,nyh+1,i) = g(jj,nyh+1,i)
         g(jj,nyh+2,i) = -g(jj,nyh+2,i)
         t2 = g(jj,1,i) + g(jj,2,i)
         g(jj,2,i) = g(jj,1,i) - g(jj,2,i)
         g(jj,1,i) = t2
         g(jj,ny+1,i) = g(jj,ny+1,i)
  100    continue
! forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nyh
         do 120 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 110 jj = 1, 3
         t4 = g(jj,ny3-2*k,i)
         t5 = -g(jj,ny3-2*k+1,i)
         t2 = g(jj,2*k-1,i) + t4
         t3 = g(jj,2*k,i) + t5
         t6 = g(jj,2*k-1,i) - t4
         t5 = g(jj,2*k,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,i) = t2 + t4
         g(jj,2*k,i) = t3 + t5
         g(jj,ny3-2*k,i) = t2 - t4
         g(jj,ny3-2*k+1,i) = t5 - t3
  110    continue
  120    continue
         do 130 jj = 1, 3
         g(jj,nyh+1,i) = 2.0*g(jj,nyh+1,i)
         g(jj,nyh+2,i) = -2.0*g(jj,nyh+2,i)
         t2 = 2.0*(g(jj,1,i) + g(jj,2,i))
         g(jj,2,i) = 2.0*(g(jj,1,i) - g(jj,2,i))
         g(jj,1,i) = t2
         g(jj,ny+1,i) = 2.0*g(jj,ny+1,i)
  130    continue
      endif
! perform recursion for sine-cosine transform
      sum1 = g(1,ny+1,i)
      g(1,ny+1,i) = g(1,2,i)
      g(1,2,i) = sum1
      sum2 = 0.5*g(2,1,i)
      g(2,1,i) = 0.0
      g(2,2,i) = sum2
      sum3 = g(3,ny+1,i)
      g(3,ny+1,i) = g(3,2,i)
      g(3,2,i) = sum3
      do 140 k = 2, nyh
      sum1 = sum1 - g(1,2*k,i)
      g(1,2*k,i) = sum1
      sum2 = sum2 + g(2,2*k-1,i)
      g(2,2*k-1,i) = -g(2,2*k,i)
      g(2,2*k,i) = sum2
      sum3 = sum3 - g(3,2*k,i)
      g(3,2*k,i) = sum3
  140 continue
      g(2,ny+1,i) = 0.0
  150 continue
!$OMP END PARALLEL DO
      return
      end
!-----------------------------------------------------------------------
      subroutine WPPFSCT2RM4(f,g,bs,br,isign,ntpose,mixup,sctd,ttp,indx,&
     &indy,kstrt,nvp,nxvh,nyv,kxp2,kyp,kypd,kxp2d,nxhyd,nxyd)
! wrapper function for 4 parallel real sine/cosine transforms
! for the momentum flux with dirichlet boundary conditions
! x component has a sine/sine transform in x and y, respectively
! y component has a cosine/cosine transform in x and y, respectively
! z component has a cosine/sine transform in x and y, respectively
! w component has a sine/cosine transform in x and y, respectively
      implicit none
      integer isign, ntpose, mixup, indx, indy, kstrt, nvp, nxvh, nyv
      integer kxp2, kyp, kypd, kxp2d, nxhyd, nxyd
      real f, g, bs, br, ttp
      complex sctd
      dimension f(4,2*nxvh,kypd), g(4,nyv,kxp2d)
      dimension bs(4,kxp2+1,kyp+1), br(4,kxp2+1,kyp+1)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer nx, ny, kxpi, kypi, ks, kxp2p, kypp, kxb2, kyb
      real tf
      double precision dtime
      data kxpi, kypi /1,1/
! calculate range of indices
      nx = 2**indx
      ny = 2**indy
! ks = processor id
      ks = kstrt - 1
! kxp2p = actual size used in x direction
      kxp2p = min(kxp2,max(0,nx-kxp2*ks))
! kypp = actual size used in y direction
      kypp = min(kyp,max(0,ny-kyp*ks))
! kxb2 = minimum number of processors needed in x direction
      kxb2 = (nx - 1)/kxp2 + 1
! kyb = minimum number of processors needed in y direction
      kyb = (ny - 1)/kyp + 1
! add extra word for last processor in x
      if (ks==(kxb2-1)) kxp2p = kxp2p + 1
! add extra word for last processor in y
      if (ks==(kyb-1)) kypp = kypp + 1
! inverse fourier transform
      if (isign.lt.0) then
! perform x sine-cosine-cosine-sine transform
         call PPFSCCST2RM4X(f,isign,mixup,sctd,indx,indy,kstrt,kypi,kypp&
     &,nxvh,kypd,nxhyd,nxyd)
! transpose f array to g
         call PWTIMERA(-1,ttp,dtime)
         call PPRNTPOSE(f,g,bs,br,nx,ny,kxp2,kyp,kstrt,nvp,4,2*nxvh,nyv,&
     &kxp2d,kypd)
         call PWTIMERA(1,ttp,dtime)
! perform y sine-cosine-sine-cosine transform
         call PPFSCSCT2RM4Y(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,    &
     &kxp2p,nyv,kxp2d,nxhyd,nxyd)
! transpose g array to f
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PPRNTPOSE(g,f,br,bs,ny,nx,kyp,kxp2,kstrt,nvp,4,nyv,    &
     &2*nxvh,kypd,kxp2d)
            call PWTIMERA(1,tf,dtime)
         endif
! forward fourier transform
      else if (isign.gt.0) then
! transpose f array to g
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PPRNTPOSE(f,g,bs,br,nx,ny,kxp2,kyp,kstrt,nvp,4,2*nxvh, &
     &nyv,kxp2d,kypd)
            call PWTIMERA(1,tf,dtime)
         endif
! perform y sine-cosine-sine-cosine transform
         call PPFSCSCT2RM4Y(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,    &
     &kxp2p,nyv,kxp2d,nxhyd,nxyd)
! transpose g array to f
         call PWTIMERA(-1,ttp,dtime)
         call PPRNTPOSE(g,f,br,bs,ny,nx,kyp,kxp2,kstrt,nvp,4,nyv,2*nxvh,&
     &kypd,kxp2d)
         call PWTIMERA(1,ttp,dtime)
! perform y sine-cosine-sine-cosine transform
         call PPFSCCST2RM4X(f,isign,mixup,sctd,indx,indy,kstrt,kypi,kypp&
     &,nxvh,kypd,nxhyd,nxyd)
      endif
      if (ntpose.eq.0) ttp = ttp + tf
      return
      end
!-----------------------------------------------------------------------
      subroutine PPFSCCST2RM4X(f,isign,mixup,sctd,indx,indy,kstrt,kypi, &
     &kypp,nxvh,kypd,nxhyd,nxyd)
! this subroutine performs the x part of 4 two dimensional fast real
! sine and cosine transforms and their inverses, for a subset of y,
! using real arithmetic, with OpenMP,
! for data which is distributed in blocks
! x/w component has a sine transform, y/z component a cosine transform
! algorithm is described in Numerical Recipies in Fortran, Second Ed.,
! by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
! [Cambridge Univ. Press, 1992], p. 508.
! for isign = (-1,1), input: all, output: f
! approximate flop count: N*(5*log2(N) + 18)/nvp
! where N = (nx/2)*ny
! indx/indy = exponent which determines length in x/y direction,
! where nx=2**indx, ny=2**indy
! if isign = -1, inverse sine-cosine-cosine-sine transforms are
! performed
! f(1,n,k) = (1/nx*ny)*sum(f(1,j,k)*sin(pi*n*j/nx))
! f(2:3,n,k) = (1/nx*ny)*(.5*f(2:3,1,k) + ((-1)**n)*f(2:3,nx+1,k)
!              + sum(f(2:3,j,k)*cos(pi*n*j/nx)))
! f(4,n,k) = (1/nx*ny)*sum(f(4,j,k)*sin(pi*n*j/nx))
! if isign = 1, forward sine-cosine-cosine-sine transforms are performed
! f(1,j,k) = sum(f(1,n,k)*sin(pi*n*j/nx))
! f(2:3,j,k) = 2*(.5*f(2:3,1,k) + ((-1)**j)*f(2:3,n+1,k)
!              + sum(f(2:3,n,k)*cos(pi*n*j/nx))
! f(4,j,k) = sum(f(4,n,k)*sin(pi*n*j/nx))
! mixup = array of bit reversed addresses
! sctd = sine/cosine table
! kstrt = starting data block number
! kypi = initial y index used
! kypp = number of y indices used
! nxvh = first dimension of f >= nx/2 + 1
! kypd = second dimension of f >= kyp + 1
! nxhyd = maximum of (nx/2,ny)
! nxyd = maximum of (nx,ny)
! written by viktor k. decyk, ucla
      implicit none
      integer isign, indx, indy, kstrt, kypi, kypp
      integer nxvh, kypd, nxhyd, nxyd, mixup
      real f
      complex sctd
      dimension f(4,2*nxvh,kypd)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer indx1, indx1y, nx, nxh, nxhh, nx3, ny, nxy, nxhy, ks
      integer i, j, k, m, km, kmr, nrx, j1, j2, ns, ns2, k1, k2, kyps
      integer nrxb, jj
      real at1, at2, at3, t2, t3, t4, t5, t6, ani
      complex t1
      double precision sum1, sum2, sum3, sum4
      indx1 = indx - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      nxh = nx/2
      nxhh = nx/4
      nx3 = nx + 3
      ny = 2**indy
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 1
      kyps = kypi + kypp - 1
      if (kstrt.gt.ny) return
      if (isign.eq.0) return
      ani = 0.5/(real(nx)*real(ny))
      nrxb = nxhy/nxh
      nrx = nxy/nxh
!$OMP PARALLEL DO
!$OMP& PRIVATE(i,j,k,m,jj,ns,ns2,km,kmr,k1,k2,j1,j2,at1,at2,at3,t2,t3,t4
!$OMP& ,t5,t6,t1,sum1,sum2,sum3,sum4)
      do 150 i = kypi, kyps
! create auxiliary array in x
      kmr = nxy/nx
      sum2 = 0.5*(f(2,1,i) - f(2,nx+1,i))
      sum3 = 0.5*(f(3,1,i) - f(3,nx+1,i))
      do 10 j = 2, nxh
      j1 = 1 + kmr*(j - 1)
      at3 = -aimag(sctd(j1))
      at2 = f(1,nx+2-j,i)
      at1 = f(1,j,i) + at2
      at2 = f(1,j,i) - at2
      at1 = at3*at1
      at2 = 0.5*at2
      f(1,j,i) = at1 + at2
      f(1,nx+2-j,i) = at1 - at2
      at2 = f(2,nx+2-j,i)
      at1 = f(2,j,i) + at2
      at2 = f(2,j,i) - at2
      sum2 = sum2 + real(sctd(j1))*at2
      at2 = at3*at2
      at1 = 0.5*at1
      f(2,j,i) = at1 - at2
      f(2,nx+2-j,i) = at1 + at2
      at2 = f(3,nx+2-j,i)
      at1 = f(3,j,i) + at2
      at2 = f(3,j,i) - at2
      sum3 = sum3 + real(sctd(j1))*at2
      at2 = at3*at2
      at1 = 0.5*at1
      f(3,j,i) = at1 - at2
      f(3,nx+2-j,i) = at1 + at2
      at2 = f(4,nx+2-j,i)
      at1 = f(4,j,i) + at2
      at2 = f(4,j,i) - at2
      at1 = at3*at1
      at2 = 0.5*at2
      f(4,j,i) = at1 + at2
      f(4,nx+2-j,i) = at1 - at2
   10 continue
      f(1,1,i) = 0.0
      f(1,nxh+1,i) = 2.0*f(1,nxh+1,i)
      f(2,1,i) = 0.5*(f(2,1,i) + f(2,nx+1,i))
      f(2,nx+1,i) = sum2
      f(3,1,i) = 0.5*(f(3,1,i) + f(3,nx+1,i))
      f(3,nx+1,i) = sum3
      f(4,1,i) = 0.0
      f(4,nxh+1,i) = 2.0*f(4,nxh+1,i)
! bit-reverse array elements in x
      do 30 j = 1, nxh
      j1 = (mixup(j) - 1)/nrxb + 1
      if (j.lt.j1) then
         do 20 jj = 1, 4
         t2 = f(jj,2*j1-1,i)
         t3 = f(jj,2*j1,i)
         f(jj,2*j1-1,i) = f(jj,2*j-1,i)
         f(jj,2*j1,i) = f(jj,2*j,i)
         f(jj,2*j-1,i) = t2
         f(jj,2*j,i) = t3
   20    continue
      endif
   30 continue
! then transform in x
      do 70 m = 1, indx1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nxhh/ns
      kmr = 2*km*nrx
      do 60 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 50 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 40 jj = 1, 4
      t2 = real(t1)*f(jj,2*j2-1,i) - aimag(t1)*f(jj,2*j2,i)
      t3 = aimag(t1)*f(jj,2*j2-1,i) + real(t1)*f(jj,2*j2,i)
      f(jj,2*j2-1,i) = f(jj,2*j1-1,i) - t2
      f(jj,2*j2,i) = f(jj,2*j1,i) - t3
      f(jj,2*j1-1,i) = f(jj,2*j1-1,i) + t2
      f(jj,2*j1,i) = f(jj,2*j1,i) + t3
   40 continue
   50 continue
   60 continue
   70 continue
! unscramble coefficients and normalize
! inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nxh
         do 90 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 80 jj = 1, 4
         t4 = f(jj,nx3-2*j,i)
         t5 = -f(jj,nx3-2*j+1,i)
         t2 = f(jj,2*j-1,i) + t4
         t3 = f(jj,2*j,i) + t5
         t6 = f(jj,2*j-1,i) - t4
         t5 = f(jj,2*j,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,i) = ani*(t2 + t4)
         f(jj,2*j,i) = ani*(t3 + t5)
         f(jj,nx3-2*j,i) = ani*(t2 - t4)
         f(jj,nx3-2*j+1,i) = ani*(t5 - t3)
   80    continue
   90    continue
         do 100 jj = 1, 4
         f(jj,nxh+1,i) = 2.0*ani*f(jj,nxh+1,i)
         f(jj,nxh+2,i) = -2.0*ani*f(jj,nxh+2,i)
         t2 = 2.0*ani*(f(jj,1,i) + f(jj,2,i))
         f(jj,2,i) = 2.0*ani*(f(jj,1,i) - f(jj,2,i))
         f(jj,1,i) = t2
         f(jj,nx+1,i) = 2.0*ani*f(jj,nx+1,i)
  100    continue
! forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nxh
         do 120 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 110 jj = 1, 4
         t4 = f(jj,nx3-2*j,i)
         t5 = -f(jj,nx3-2*j+1,i)
         t2 = f(jj,2*j-1,i) + t4
         t3 = f(jj,2*j,i) + t5
         t6 = f(jj,2*j-1,i) - t4
         t5 = f(jj,2*j,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,i) = t2 + t4
         f(jj,2*j,i) = t3 + t5
         f(jj,nx3-2*j,i) = t2 - t4
         f(jj,nx3-2*j+1,i) = t5 - t3
  110    continue
  120    continue
         do 130 jj = 1, 4
         f(jj,nxh+1,i) = 2.0*f(jj,nxh+1,i)
         f(jj,nxh+2,i) = -2.0*f(jj,nxh+2,i)
         t2 = 2.0*(f(jj,1,i) + f(jj,2,i))
         f(jj,2,i) = 2.0*(f(jj,1,i) - f(jj,2,i))
         f(jj,1,i) = t2
         f(jj,nx+1,i) = 2.0*f(jj,nx+1,i)
  130    continue
      endif
! perform recursion for cosine-sine transform
      sum1 = 0.5*f(1,1,i)
      f(1,1,i) = 0.0
      f(1,2,i) = sum1
      sum2 = f(2,nx+1,i)
      f(2,nx+1,i) = f(2,2,i)
      f(2,2,i) = sum2
      sum3 = f(3,nx+1,i)
      f(3,nx+1,i) = f(3,2,i)
      f(3,2,i) = sum3
      sum4 = 0.5*f(4,1,i)
      f(4,1,i) = 0.0
      f(4,2,i) = sum4
      do 140 j = 2, nxh
      sum1 = sum1 + f(1,2*j-1,i)
      f(1,2*j-1,i) = -f(1,2*j,i)
      f(1,2*j,i) = sum1
      sum2 = sum2 - f(2,2*j,i)
      f(2,2*j,i) = sum2
      sum3 = sum3 - f(3,2*j,i)
      f(3,2*j,i) = sum3
      sum4 = sum4 + f(4,2*j-1,i)
      f(4,2*j-1,i) = -f(4,2*j,i)
      f(4,2*j,i) = sum4
  140 continue
      f(1,nx+1,i) = 0.0
      f(4,nx+1,i) = 0.0
  150 continue
!$OMP END PARALLEL DO
      return
      end
!-----------------------------------------------------------------------
      subroutine PPFSCSCT2RM4Y(g,isign,mixup,sctd,indx,indy,kstrt,kxpi, &
     &kxpp,nyv,kxpd,nxhyd,nxyd)
! this subroutine performs the y part of 4 two dimensional fast real
! sine and cosine transforms and their inverses, for a subset of x,
! using real arithmetic, with OpenMP,
! for data which is distributed in blocks
! x/z component has a sine transform, y/w component a cosine transform
! algorithm is described in Numerical Recipies in Fortran, Second Ed.,
! by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
! [Cambridge Univ. Press, 1992], p. 508.
! for isign = (-1,1), input: all, output: f
! approximate flop count: N*(5*log2(N) + 18)/nvp
! where N = (nx/2)*ny
! indx/indy = exponent which determines length in x/y direction,
! where nx=2**indx, ny=2**indy
! if isign = -1, inverse sine-cosine-sine-cosine transform are performed
! g(1,m,n) = sum(g(1,k,n)*sin(pi*m*k/ny))
! g(2,m,n) = (.5*g(2,1,n) + ((-1)**m)*g(2,ny+1,n)
!              + sum(g(2,k,n)*cos(pi*m*k/ny))
! g(3,m,n) = sum(g(3,k,n)*sin(pi*m*k/ny))
! g(4,m,n) = (.5*g(4,1,n) + ((-1)**m)*g(4,ny+1,n)
!              + sum(g(4,k,n)*cos(pi*m*k/ny))
! if isign = 1, forward sine-cosine-sine-cosine transform are performed
! g(1,k,n) = sum(g(1,m,n)*sin(pi*m*k/ny))
! g(2,k,n) = 2*(.5*g(2,1,n) + ((-1)**m)*g(2,ny+1,n)
!              + sum(g(2,m,n)*cos(pi*m*k/ny))
! g(3,k,n) = sum(g(3,m,n)*sin(pi*m*k/ny))
! g(4,k,n) = 2*(.5*g(4,1,n) + ((-1)**m)*g(4,ny+1,n)
!              + sum(g(4,m,n)*cos(pi*m*k/ny))
! mixup = array of bit reversed addresses
! sctd = sine/cosine table
! kstrt = starting data block number
! kxpi = initial x index used
! kxpp = number of x indices used
! nyv = first dimension of g >= ny + 1
! kxpd = second dimension of g >= kxp + 1
! nxhyd = maximum of (nx/2,ny)
! nxyd = maximum of (nx,ny)
! written by viktor k. decyk, ucla
      implicit none
      integer isign, indx, indy, kstrt, kxpi, kxpp
      integer nyv, kxpd, nxhyd, nxyd, mixup
      real g
      complex sctd
      dimension g(4,nyv,kxpd)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer indx1, indy1, indx1y, nx, ny, nyh, nyhh, ny3, nxy, nxhy
      integer i, j, k, m, ks, km, kmr, nry, j1, j2, ns, ns2, k1, k2
      integer kxps, nryb, jj
      real at1, at2, at3, t2, t3, t4, t5, t6
      complex t1
      double precision sum1, sum2, sum3, sum4
      indx1 = indx - 1
      indy1 = indy - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      ny = 2**indy
      nyh = ny/2
      nyhh = ny/4
      ny3 = ny + 3
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 1
      kxps = kxpi + kxpp - 1
      if (kstrt.gt.nx) return
      if (isign.eq.0) return
      nryb = nxhy/nyh
      nry = nxy/nyh
!$OMP PARALLEL DO
!$OMP& PRIVATE(i,j,k,m,jj,ns,ns2,km,kmr,k1,k2,j1,j2,at1,at2,at3,t2,t3,t4
!$OMP& ,t5,t6,t1,sum1,sum2,sum3,sum4)
      do 150 i = kxpi, kxps
! create auxiliary array in y
      kmr = nxy/ny
      sum2 = 0.5*(g(2,1,i) - g(2,ny+1,i))
      sum4 = 0.5*(g(4,1,i) - g(4,ny+1,i))
      do 10 k = 2, nyh
      k1 = 1 + kmr*(k - 1)
      at3 = -aimag(sctd(k1))
      at2 = g(1,ny+2-k,i)
      at1 = g(1,k,i) + at2
      at2 = g(1,k,i) - at2
      at1 = at3*at1
      at2 = 0.5*at2
      g(1,k,i) = at1 + at2
      g(1,ny+2-k,i) = at1 - at2
      at2 = g(2,ny+2-k,i)
      at1 = g(2,k,i) + at2
      at2 = g(2,k,i) - at2
      sum2 = sum2 + real(sctd(k1))*at2
      at2 = at3*at2
      at1 = 0.5*at1
      g(2,k,i) = at1 - at2
      g(2,ny+2-k,i) = at1 + at2
      at2 = g(3,ny+2-k,i)
      at1 = g(3,k,i) + at2
      at2 = g(3,k,i) - at2
      at1 = at3*at1
      at2 = 0.5*at2
      g(3,k,i) = at1 + at2
      g(3,ny+2-k,i) = at1 - at2
      at2 = g(4,ny+2-k,i)
      at1 = g(4,k,i) + at2
      at2 = g(4,k,i) - at2
      sum4 = sum4 + real(sctd(k1))*at2
      at2 = at3*at2
      at1 = 0.5*at1
      g(4,k,i) = at1 - at2
      g(4,ny+2-k,i) = at1 + at2
   10 continue
      g(1,1,i) = 0.0
      g(1,nyh+1,i) = 2.0*g(1,nyh+1,i)
      g(2,1,i) = 0.5*(g(2,1,i) + g(2,ny+1,i))
      g(2,ny+1,i) = sum2
      g(3,1,i) = 0.0
      g(3,nyh+1,i) = 2.0*g(3,nyh+1,i)
      g(4,1,i) = 0.5*(g(4,1,i) + g(4,ny+1,i))
      g(4,ny+1,i) = sum4
! bit-reverse array elements in y
      do 30 k = 1, nyh
      k1 = (mixup(k) - 1)/nryb + 1
      if (k.lt.k1) then
         do 20 jj = 1, 4
         t2 = g(jj,2*k1-1,i)
         t3 = g(jj,2*k1,i)
         g(jj,2*k1-1,i) = g(jj,2*k-1,i)
         g(jj,2*k1,i) = g(jj,2*k,i)
         g(jj,2*k-1,i) = t2
         g(jj,2*k,i) = t3
   20    continue
      endif
   30 continue
! then transform in y
      do 70 m = 1, indy1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nyhh/ns
      kmr = 2*km*nry
      do 60 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 50 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 40 jj = 1, 4
      t2 = real(t1)*g(jj,2*j2-1,i) - aimag(t1)*g(jj,2*j2,i)
      t3 = aimag(t1)*g(jj,2*j2-1,i) + real(t1)*g(jj,2*j2,i)
      g(jj,2*j2-1,i) = g(jj,2*j1-1,i) - t2
      g(jj,2*j2,i) = g(jj,2*j1,i) - t3
      g(jj,2*j1-1,i) = g(jj,2*j1-1,i) + t2
      g(jj,2*j1,i) = g(jj,2*j1,i) + t3
   40 continue
   50 continue
   60 continue
   70 continue
! unscramble coefficients and normalize
! inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nyh
         do 90 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 80 jj = 1, 4
         t4 = g(jj,ny3-2*k,i)
         t5 = -g(jj,ny3-2*k+1,i)
         t2 = g(jj,2*k-1,i) + t4
         t3 = g(jj,2*k,i) + t5
         t6 = g(jj,2*k-1,i) - t4
         t5 = g(jj,2*k,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,i) = 0.5*(t2 + t4)
         g(jj,2*k,i) = 0.5*(t3 + t5)
         g(jj,ny3-2*k,i) = 0.5*(t2 - t4)
         g(jj,ny3-2*k+1,i) = 0.5*(t5 - t3)
   80    continue
   90    continue
         do 100 jj = 1, 4
         g(jj,nyh+1,i) = g(jj,nyh+1,i)
         g(jj,nyh+2,i) = -g(jj,nyh+2,i)
         t2 = g(jj,1,i) + g(jj,2,i)
         g(jj,2,i) = g(jj,1,i) - g(jj,2,i)
         g(jj,1,i) = t2
         g(jj,ny+1,i) = g(jj,ny+1,i)
  100    continue
! forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nyh
         do 120 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 110 jj = 1, 4
         t4 = g(jj,ny3-2*k,i)
         t5 = -g(jj,ny3-2*k+1,i)
         t2 = g(jj,2*k-1,i) + t4
         t3 = g(jj,2*k,i) + t5
         t6 = g(jj,2*k-1,i) - t4
         t5 = g(jj,2*k,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,i) = t2 + t4
         g(jj,2*k,i) = t3 + t5
         g(jj,ny3-2*k,i) = t2 - t4
         g(jj,ny3-2*k+1,i) = t5 - t3
  110    continue
  120    continue
         do 130 jj = 1, 4
         g(jj,nyh+1,i) = 2.0*g(jj,nyh+1,i)
         g(jj,nyh+2,i) = -2.0*g(jj,nyh+2,i)
         t2 = 2.0*(g(jj,1,i) + g(jj,2,i))
         g(jj,2,i) = 2.0*(g(jj,1,i) - g(jj,2,i))
         g(jj,1,i) = t2
         g(jj,ny+1,i) = 2.0*g(jj,ny+1,i)
  130    continue
      endif
! perform recursion for sine-cosine transform
      sum1 = 0.5*g(1,1,i)
      g(1,1,i) = 0.0
      g(1,2,i) = sum1
      sum2 = g(2,ny+1,i)
      g(2,ny+1,i) = g(2,2,i)
      g(2,2,i) = sum2
      sum3 = 0.5*g(3,1,i)
      g(3,1,i) = 0.0
      g(3,2,i) = sum3
      sum4 = g(4,ny+1,i)
      g(4,ny+1,i) = g(4,2,i)
      g(4,2,i) = sum4
      do 140 k = 2, nyh
      sum1 = sum1 + g(1,2*k-1,i)
      g(1,2*k-1,i) = -g(1,2*k,i)
      g(1,2*k,i) = sum1
      sum2 = sum2 - g(2,2*k,i)
      g(2,2*k,i) = sum2
      sum3 = sum3 + g(3,2*k-1,i)
      g(3,2*k-1,i) = -g(3,2*k,i)
      g(3,2*k,i) = sum3
      sum4 = sum4 - g(4,2*k,i)
      g(4,2*k,i) = sum4
  140 continue
      g(1,ny+1,i) = 0.0
      g(3,ny+1,i) = 0.0
  150 continue
!$OMP END PARALLEL DO
      return
      end
!-----------------------------------------------------------------------
      subroutine WPPFSCT2RM22(f,g,bs,br,isign,ntpose,mixup,sctd,ttp,indx&
     &,indy,kstrt,nvp,nxvh,nyv,kxp2,kyp,kypd,kxp2d,nxhyd,nxyd)
! wrapper function for 2 parallel real sine/cosine transforms
! for the momentum flux with dirichlet boundary conditions
! x component has a sine/sine transform in x and y, respectively
! y component has a cosine/cosine transform in x and y, respectively
      implicit none
      integer isign, ntpose, mixup, indx, indy, kstrt, nvp, nxvh, nyv
      integer kxp2, kyp, kypd, kxp2d, nxhyd, nxyd
      real f, g, bs, br, ttp
      complex sctd
      dimension f(2,2*nxvh,kypd), g(2,nyv,kxp2d)
      dimension bs(2,kxp2+1,kyp+1), br(2,kxp2+1,kyp+1)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer nx, ny, kxpi, kypi, ks, kxp2p, kypp, kxb2, kyb
      real tf
      double precision dtime
      data kxpi, kypi /1,1/
! calculate range of indices
      nx = 2**indx
      ny = 2**indy
! ks = processor id
      ks = kstrt - 1
! kxp2p = actual size used in x direction
      kxp2p = min(kxp2,max(0,nx-kxp2*ks))
! kypp = actual size used in y direction
      kypp = min(kyp,max(0,ny-kyp*ks))
! kxb2 = minimum number of processors needed in x direction
      kxb2 = (nx - 1)/kxp2 + 1
! kyb = minimum number of processors needed in y direction
      kyb = (ny - 1)/kyp + 1
! add extra word for last processor in x
      if (ks==(kxb2-1)) kxp2p = kxp2p + 1
! add extra word for last processor in y
      if (ks==(kyb-1)) kypp = kypp + 1
! inverse fourier transform
      if (isign.lt.0) then
! perform x sine-cosine transform
         call PPFSCCST2RM22X(f,isign,mixup,sctd,indx,indy,kstrt,kypi,   &
     &kypp,nxvh,kypd,nxhyd,nxyd)
! transpose f array to g
         call PWTIMERA(-1,ttp,dtime)
         call PPRNTPOSE(f,g,bs,br,nx,ny,kxp2,kyp,kstrt,nvp,2,2*nxvh,nyv,&
     &kxp2d,kypd)
         call PWTIMERA(1,ttp,dtime)
! perform y sine-cosine transform
         call PPFSCSCT2RM22Y(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,   &
     &kxp2p,nyv,kxp2d,nxhyd,nxyd)
! transpose g array to f
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PPRNTPOSE(g,f,br,bs,ny,nx,kyp,kxp2,kstrt,nvp,2,nyv,    &
     &2*nxvh,kypd,kxp2d)
            call PWTIMERA(1,tf,dtime)
         endif
! forward fourier transform
      else if (isign.gt.0) then
! transpose f array to g
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PPRNTPOSE(f,g,bs,br,nx,ny,kxp2,kyp,kstrt,nvp,2,2*nxvh, &
     &nyv,kxp2d,kypd)
            call PWTIMERA(1,tf,dtime)
         endif
! perform y cosine-sine transform
         call PPFSCSCT2RM22Y(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,   &
     &kxp2p,nyv,kxp2d,nxhyd,nxyd)
! transpose g array to f
         call PWTIMERA(-1,ttp,dtime)
         call PPRNTPOSE(g,f,br,bs,ny,nx,kyp,kxp2,kstrt,nvp,2,nyv,2*nxvh,&
     &kypd,kxp2d)
         call PWTIMERA(1,ttp,dtime)
! perform y sine-cosine transform
         call PPFSCCST2RM22X(f,isign,mixup,sctd,indx,indy,kstrt,kypi,   &
     &kypp,nxvh,kypd,nxhyd,nxyd)
      endif
      if (ntpose.eq.0) ttp = ttp + tf
      return
      end
!-----------------------------------------------------------------------
      subroutine PPFSCCST2RM22X(f,isign,mixup,sctd,indx,indy,kstrt,kypi,&
     &kypp,nxvh,kypd,nxhyd,nxyd)
! this subroutine performs the x part of 2 two dimensional fast real
! sine and cosine transforms and their inverses, for a subset of y,
! using real arithmetic, with OpenMP,
! for data which is distributed in blocks
! x component has a sine transform, y component a cosine transform
! algorithm is described in Numerical Recipies in Fortran, Second Ed.,
! by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
! [Cambridge Univ. Press, 1992], p. 508.
! for isign = (-1,1), input: all, output: f
! approximate flop count: N*(5*log2(N) + 18)/nvp
! where N = (nx/2)*ny
! indx/indy = exponent which determines length in x/y direction,
! where nx=2**indx, ny=2**indy
! if isign = -1, inverse sine-cosine transforms are
! performed
! f(1,n,k) = (1/nx*ny)*sum(f(1,j,k)*sin(pi*n*j/nx))
! f(2,n,k) = (1/nx*ny)*(.5*f(2,1,k) + ((-1)**n)*f(2,nx+1,k)
!              + sum(f(2,j,k)*cos(pi*n*j/nx)))
! if isign = 1, forward sine-cosine transforms are performed
! f(1,j,k) = sum(f(1,n,k)*sin(pi*n*j/nx))
! f(2,j,k) = 2*(.5*f(2,1,k) + ((-1)**j)*f(2,n+1,k)
!              + sum(f(2:3,n,k)*cos(pi*n*j/nx))
! mixup = array of bit reversed addresses
! sctd = sine/cosine table
! kstrt = starting data block number
! kypi = initial y index used
! kypp = number of y indices used
! nxvh = first dimension of f >= nx/2 + 1
! kypd = second dimension of f >= kyp + 1
! nxhyd = maximum of (nx/2,ny)
! nxyd = maximum of (nx,ny)
! written by viktor k. decyk, ucla
      implicit none
      integer isign, indx, indy, kstrt, kypi, kypp
      integer nxvh, kypd, nxhyd, nxyd, mixup
      real f
      complex sctd
      dimension f(2,2*nxvh,kypd)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer indx1, indx1y, nx, nxh, nxhh, nx3, ny, nxy, nxhy, ks
      integer i, j, k, m, km, kmr, nrx, j1, j2, ns, ns2, k1, k2, kyps
      integer nrxb, jj
      real at1, at2, at3, t2, t3, t4, t5, t6, ani
      complex t1
      double precision sum1, sum2
      indx1 = indx - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      nxh = nx/2
      nxhh = nx/4
      nx3 = nx + 3
      ny = 2**indy
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 1
      kyps = kypi + kypp - 1
      if (kstrt.gt.ny) return
      if (isign.eq.0) return
      ani = 0.5/(real(nx)*real(ny))
      nrxb = nxhy/nxh
      nrx = nxy/nxh
!$OMP PARALLEL DO
!$OMP& PRIVATE(i,j,k,m,jj,ns,ns2,km,kmr,k1,k2,j1,j2,at1,at2,at3,t2,t3,t4
!$OMP& ,t5,t6,t1,sum1,sum2)
      do 150 i = kypi, kyps
! create auxiliary array in x
      kmr = nxy/nx
      sum2 = 0.5*(f(2,1,i) - f(2,nx+1,i))
      do 10 j = 2, nxh
      j1 = 1 + kmr*(j - 1)
      at3 = -aimag(sctd(j1))
      at2 = f(1,nx+2-j,i)
      at1 = f(1,j,i) + at2
      at2 = f(1,j,i) - at2
      at1 = at3*at1
      at2 = 0.5*at2
      f(1,j,i) = at1 + at2
      f(1,nx+2-j,i) = at1 - at2
      at2 = f(2,nx+2-j,i)
      at1 = f(2,j,i) + at2
      at2 = f(2,j,i) - at2
      sum2 = sum2 + real(sctd(j1))*at2
      at2 = at3*at2
      at1 = 0.5*at1
      f(2,j,i) = at1 - at2
      f(2,nx+2-j,i) = at1 + at2
   10 continue
      f(1,1,i) = 0.0
      f(1,nxh+1,i) = 2.0*f(1,nxh+1,i)
      f(2,1,i) = 0.5*(f(2,1,i) + f(2,nx+1,i))
      f(2,nx+1,i) = sum2
! bit-reverse array elements in x
      do 30 j = 1, nxh
      j1 = (mixup(j) - 1)/nrxb + 1
      if (j.lt.j1) then
         do 20 jj = 1, 2
         t2 = f(jj,2*j1-1,i)
         t3 = f(jj,2*j1,i)
         f(jj,2*j1-1,i) = f(jj,2*j-1,i)
         f(jj,2*j1,i) = f(jj,2*j,i)
         f(jj,2*j-1,i) = t2
         f(jj,2*j,i) = t3
   20    continue
      endif
   30 continue
! then transform in x
      do 70 m = 1, indx1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nxhh/ns
      kmr = 2*km*nrx
      do 60 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 50 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 40 jj = 1, 2
      t2 = real(t1)*f(jj,2*j2-1,i) - aimag(t1)*f(jj,2*j2,i)
      t3 = aimag(t1)*f(jj,2*j2-1,i) + real(t1)*f(jj,2*j2,i)
      f(jj,2*j2-1,i) = f(jj,2*j1-1,i) - t2
      f(jj,2*j2,i) = f(jj,2*j1,i) - t3
      f(jj,2*j1-1,i) = f(jj,2*j1-1,i) + t2
      f(jj,2*j1,i) = f(jj,2*j1,i) + t3
   40 continue
   50 continue
   60 continue
   70 continue
! unscramble coefficients and normalize
! inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nxh
         do 90 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 80 jj = 1, 2
         t4 = f(jj,nx3-2*j,i)
         t5 = -f(jj,nx3-2*j+1,i)
         t2 = f(jj,2*j-1,i) + t4
         t3 = f(jj,2*j,i) + t5
         t6 = f(jj,2*j-1,i) - t4
         t5 = f(jj,2*j,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,i) = ani*(t2 + t4)
         f(jj,2*j,i) = ani*(t3 + t5)
         f(jj,nx3-2*j,i) = ani*(t2 - t4)
         f(jj,nx3-2*j+1,i) = ani*(t5 - t3)
   80    continue
   90    continue
         do 100 jj = 1, 2
         f(jj,nxh+1,i) = 2.0*ani*f(jj,nxh+1,i)
         f(jj,nxh+2,i) = -2.0*ani*f(jj,nxh+2,i)
         t2 = 2.0*ani*(f(jj,1,i) + f(jj,2,i))
         f(jj,2,i) = 2.0*ani*(f(jj,1,i) - f(jj,2,i))
         f(jj,1,i) = t2
         f(jj,nx+1,i) = 2.0*ani*f(jj,nx+1,i)
  100    continue
! forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nxh
         do 120 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 110 jj = 1, 2
         t4 = f(jj,nx3-2*j,i)
         t5 = -f(jj,nx3-2*j+1,i)
         t2 = f(jj,2*j-1,i) + t4
         t3 = f(jj,2*j,i) + t5
         t6 = f(jj,2*j-1,i) - t4
         t5 = f(jj,2*j,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,i) = t2 + t4
         f(jj,2*j,i) = t3 + t5
         f(jj,nx3-2*j,i) = t2 - t4
         f(jj,nx3-2*j+1,i) = t5 - t3
  110    continue
  120    continue
         do 130 jj = 1, 2
         f(jj,nxh+1,i) = 2.0*f(jj,nxh+1,i)
         f(jj,nxh+2,i) = -2.0*f(jj,nxh+2,i)
         t2 = 2.0*(f(jj,1,i) + f(jj,2,i))
         f(jj,2,i) = 2.0*(f(jj,1,i) - f(jj,2,i))
         f(jj,1,i) = t2
         f(jj,nx+1,i) = 2.0*f(jj,nx+1,i)
  130    continue
      endif
! perform recursion for cosine-sine transform
      sum1 = 0.5*f(1,1,i)
      f(1,1,i) = 0.0
      f(1,2,i) = sum1
      sum2 = f(2,nx+1,i)
      f(2,nx+1,i) = f(2,2,i)
      f(2,2,i) = sum2
      do 140 j = 2, nxh
      sum1 = sum1 + f(1,2*j-1,i)
      f(1,2*j-1,i) = -f(1,2*j,i)
      f(1,2*j,i) = sum1
      sum2 = sum2 - f(2,2*j,i)
      f(2,2*j,i) = sum2
  140 continue
      f(1,nx+1,i) = 0.0
  150 continue
!$OMP END PARALLEL DO
      return
      end
!-----------------------------------------------------------------------
      subroutine PPFSCSCT2RM22Y(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,&
     &kxpp,nyv,kxpd,nxhyd,nxyd)
! this subroutine performs the y part of 2 two dimensional fast real
! sine and cosine transforms and their inverses, for a subset of x,
! using real arithmetic, with OpenMP,
! for data which is distributed in blocks
! x component has a sine transform, y component a cosine transform
! algorithm is described in Numerical Recipies in Fortran, Second Ed.,
! by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
! [Cambridge Univ. Press, 1992], p. 508.
! for isign = (-1,1), input: all, output: f
! approximate flop count: N*(5*log2(N) + 18)/nvp
! where N = (nx/2)*ny
! indx/indy = exponent which determines length in x/y direction,
! where nx=2**indx, ny=2**indy
! if isign = -1, inverse sine-cosine transform are performed
! g(1,m,n) = sum(g(1,k,n)*sin(pi*m*k/ny))
! g(2,m,n) = (.5*g(2,1,n) + ((-1)**m)*g(2,ny+1,n)
!              + sum(g(2,k,n)*cos(pi*m*k/ny))
! if isign = 1, forward sine-cosine transform are performed
! g(1,k,n) = sum(g(1,m,n)*sin(pi*m*k/ny))
! g(2,k,n) = 2*(.5*g(2,1,n) + ((-1)**m)*g(2,ny+1,n)
!              + sum(g(2,m,n)*cos(pi*m*k/ny))
! mixup = array of bit reversed addresses
! sctd = sine/cosine table
! kstrt = starting data block number
! kxpi = initial x index used
! kxpp = number of x indices used
! nyv = first dimension of g >= ny + 1
! kxpd = second dimension of g >= kxp + 1
! nxhyd = maximum of (nx/2,ny)
! nxyd = maximum of (nx,ny)
! written by viktor k. decyk, ucla
      implicit none
      integer isign, indx, indy, kstrt, kxpi, kxpp
      integer nyv, kxpd, nxhyd, nxyd, mixup
      real g
      complex sctd
      dimension g(2,nyv,kxpd)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer indx1, indy1, indx1y, nx, ny, nyh, nyhh, ny3, nxy, nxhy
      integer i, j, k, m, ks, km, kmr, nry, j1, j2, ns, ns2, k1, k2
      integer kxps, nryb, jj
      real at1, at2, at3, t2, t3, t4, t5, t6
      complex t1
      double precision sum1, sum2
      indx1 = indx - 1
      indy1 = indy - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      ny = 2**indy
      nyh = ny/2
      nyhh = ny/4
      ny3 = ny + 3
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 1
      kxps = kxpi + kxpp - 1
      if (kstrt.gt.nx) return
      if (isign.eq.0) return
      nryb = nxhy/nyh
      nry = nxy/nyh
!$OMP PARALLEL DO
!$OMP& PRIVATE(i,j,k,m,jj,ns,ns2,km,kmr,k1,k2,j1,j2,at1,at2,at3,t2,t3,t4
!$OMP& ,t5,t6,t1,sum1,sum2)
      do 150 i = kxpi, kxps
! create auxiliary array in y
      kmr = nxy/ny
      sum2 = 0.5*(g(2,1,i) - g(2,ny+1,i))
      do 10 k = 2, nyh
      k1 = 1 + kmr*(k - 1)
      at3 = -aimag(sctd(k1))
      at2 = g(1,ny+2-k,i)
      at1 = g(1,k,i) + at2
      at2 = g(1,k,i) - at2
      at1 = at3*at1
      at2 = 0.5*at2
      g(1,k,i) = at1 + at2
      g(1,ny+2-k,i) = at1 - at2
      at2 = g(2,ny+2-k,i)
      at1 = g(2,k,i) + at2
      at2 = g(2,k,i) - at2
      sum2 = sum2 + real(sctd(k1))*at2
      at2 = at3*at2
      at1 = 0.5*at1
      g(2,k,i) = at1 - at2
      g(2,ny+2-k,i) = at1 + at2
   10 continue
      g(1,1,i) = 0.0
      g(1,nyh+1,i) = 2.0*g(1,nyh+1,i)
      g(2,1,i) = 0.5*(g(2,1,i) + g(2,ny+1,i))
      g(2,ny+1,i) = sum2
! bit-reverse array elements in y
      do 30 k = 1, nyh
      k1 = (mixup(k) - 1)/nryb + 1
      if (k.lt.k1) then
         do 20 jj = 1, 2
         t2 = g(jj,2*k1-1,i)
         t3 = g(jj,2*k1,i)
         g(jj,2*k1-1,i) = g(jj,2*k-1,i)
         g(jj,2*k1,i) = g(jj,2*k,i)
         g(jj,2*k-1,i) = t2
         g(jj,2*k,i) = t3
   20    continue
      endif
   30 continue
! then transform in y
      do 70 m = 1, indy1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nyhh/ns
      kmr = 2*km*nry
      do 60 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 50 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 40 jj = 1, 2
      t2 = real(t1)*g(jj,2*j2-1,i) - aimag(t1)*g(jj,2*j2,i)
      t3 = aimag(t1)*g(jj,2*j2-1,i) + real(t1)*g(jj,2*j2,i)
      g(jj,2*j2-1,i) = g(jj,2*j1-1,i) - t2
      g(jj,2*j2,i) = g(jj,2*j1,i) - t3
      g(jj,2*j1-1,i) = g(jj,2*j1-1,i) + t2
      g(jj,2*j1,i) = g(jj,2*j1,i) + t3
   40 continue
   50 continue
   60 continue
   70 continue
! unscramble coefficients and normalize
! inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nyh
         do 90 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 80 jj = 1, 2
         t4 = g(jj,ny3-2*k,i)
         t5 = -g(jj,ny3-2*k+1,i)
         t2 = g(jj,2*k-1,i) + t4
         t3 = g(jj,2*k,i) + t5
         t6 = g(jj,2*k-1,i) - t4
         t5 = g(jj,2*k,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,i) = 0.5*(t2 + t4)
         g(jj,2*k,i) = 0.5*(t3 + t5)
         g(jj,ny3-2*k,i) = 0.5*(t2 - t4)
         g(jj,ny3-2*k+1,i) = 0.5*(t5 - t3)
   80    continue
   90    continue
         do 100 jj = 1, 2
         g(jj,nyh+1,i) = g(jj,nyh+1,i)
         g(jj,nyh+2,i) = -g(jj,nyh+2,i)
         t2 = g(jj,1,i) + g(jj,2,i)
         g(jj,2,i) = g(jj,1,i) - g(jj,2,i)
         g(jj,1,i) = t2
         g(jj,ny+1,i) = g(jj,ny+1,i)
  100    continue
! forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nyh
         do 120 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 110 jj = 1, 2
         t4 = g(jj,ny3-2*k,i)
         t5 = -g(jj,ny3-2*k+1,i)
         t2 = g(jj,2*k-1,i) + t4
         t3 = g(jj,2*k,i) + t5
         t6 = g(jj,2*k-1,i) - t4
         t5 = g(jj,2*k,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,i) = t2 + t4
         g(jj,2*k,i) = t3 + t5
         g(jj,ny3-2*k,i) = t2 - t4
         g(jj,ny3-2*k+1,i) = t5 - t3
  110    continue
  120    continue
         do 130 jj = 1, 2
         g(jj,nyh+1,i) = 2.0*g(jj,nyh+1,i)
         g(jj,nyh+2,i) = -2.0*g(jj,nyh+2,i)
         t2 = 2.0*(g(jj,1,i) + g(jj,2,i))
         g(jj,2,i) = 2.0*(g(jj,1,i) - g(jj,2,i))
         g(jj,1,i) = t2
         g(jj,ny+1,i) = 2.0*g(jj,ny+1,i)
  130    continue
      endif
! perform recursion for sine-cosine transform
      sum1 = 0.5*g(1,1,i)
      g(1,1,i) = 0.0
      g(1,2,i) = sum1
      sum2 = g(2,ny+1,i)
      g(2,ny+1,i) = g(2,2,i)
      g(2,2,i) = sum2
      do 140 k = 2, nyh
      sum1 = sum1 + g(1,2*k-1,i)
      g(1,2*k-1,i) = -g(1,2*k,i)
      g(1,2*k,i) = sum1
      sum2 = sum2 - g(2,2*k,i)
      g(2,2*k,i) = sum2
  140 continue
      g(1,ny+1,i) = 0.0
  150 continue
!$OMP END PARALLEL DO
      return
      end
!-----------------------------------------------------------------------
      subroutine WPPFSST2RM23(f,g,bs,br,isign,ntpose,mixup,sctd,ttp,indx&
     &,indy,kstrt,nvp,nxvh,nyv,kxp2,kyp,kypd,kxp2d,nxhyd,nxyd)
! wrapper function for 3 parallel real sine transforms
! x/y component has a sine/sine transform in x and y, respectively
! z component has a cosine/cosine transform in x and y, respectively
      implicit none
      integer isign, ntpose, mixup, indx, indy, kstrt, nvp, nxvh, nyv
      integer kxp2, kyp, kypd, kxp2d, nxhyd, nxyd
      real f, g, bs, br, ttp
      complex sctd
      dimension f(3,2*nxvh,kypd), g(3,nyv,kxp2d)
      dimension bs(3,kxp2+1,kyp+1), br(3,kxp2+1,kyp+1)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer nx, ny, kxpi, kypi, ks, kxp2p, kypp, kxb2, kyb
      real tf
      double precision dtime
      data kxpi, kypi /1,1/
! calculate range of indices
      nx = 2**indx
      ny = 2**indy
! ks = processor id
      ks = kstrt - 1
! kxp2p = actual size used in x direction
      kxp2p = min(kxp2,max(0,nx-kxp2*ks))
! kypp = actual size used in y direction
      kypp = min(kyp,max(0,ny-kyp*ks))
! kxb2 = minimum number of processors needed in x direction
      kxb2 = (nx - 1)/kxp2 + 1
! kyb = minimum number of processors needed in y direction
      kyb = (ny - 1)/kyp + 1
! add extra word for last processor in x
      if (ks==(kxb2-1)) kxp2p = kxp2p + 1
! add extra word for last processor in y
      if (ks==(kyb-1)) kypp = kypp + 1
! inverse fourier transform
      if (isign.lt.0) then
! perform x sine transforms
         call PPFSSCT2RM23X(f,isign,mixup,sctd,indx,indy,kstrt,kypi,kypp&
     &,nxvh,kypd,nxhyd,nxyd)
! transpose f array to g
         call PWTIMERA(-1,ttp,dtime)
         call PPRNTPOSE(f,g,bs,br,nx,ny,kxp2,kyp,kstrt,nvp,3,2*nxvh,nyv,&
     &kxp2d,kypd)
         call PWTIMERA(1,ttp,dtime)
! perform y sine transforms
         call PPFSSCT2RM23Y(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,    &
     &kxp2p,nyv,kxp2d,nxhyd,nxyd)
! transpose g array to f
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PPRNTPOSE(g,f,br,bs,ny,nx,kyp,kxp2,kstrt,nvp,3,nyv,    &
     &2*nxvh,kypd,kxp2d)
            call PWTIMERA(1,tf,dtime)
         endif
! forward fourier transform
      else if (isign.gt.0) then
! transpose f array to g
         if (ntpose.eq.0) then
            call PWTIMERA(-1,tf,dtime)
            call PPRNTPOSE(f,g,bs,br,nx,ny,kxp2,kyp,kstrt,nvp,3,2*nxvh, &
     &nyv,kxp2d,kypd)
            call PWTIMERA(1,tf,dtime)
         endif
! perform y sine transforms
         call PPFSSCT2RM23Y(g,isign,mixup,sctd,indx,indy,kstrt,kxpi,    &
     &kxp2p,nyv,kxp2d,nxhyd,nxyd)
! transpose g array to f
         call PWTIMERA(-1,ttp,dtime)
         call PPRNTPOSE(g,f,br,bs,ny,nx,kyp,kxp2,kstrt,nvp,3,nyv,2*nxvh,&
     &kypd,kxp2d)
         call PWTIMERA(1,ttp,dtime)
! perform x sine transforms
         call PPFSSCT2RM23X(f,isign,mixup,sctd,indx,indy,kstrt,kypi,kypp&
     &,nxvh,kypd,nxhyd,nxyd)
      endif
      if (ntpose.eq.0) ttp = ttp + tf
      return
      end
!-----------------------------------------------------------------------
      subroutine PPFSSCT2RM23X(f,isign,mixup,sctd,indx,indy,kstrt,kypi, &
     &kypp,nxvh,kypd,nxhyd,nxyd)
! this subroutine performs the x part of 3 two dimensional fast real
! sine transforms and their inverses, for a subset of y,
! using real arithmetic, with OpenMP,
! for data which is distributed in blocks
! x/y component has a sine transform, z component a cosine transform
! algorithm is described in Numerical Recipies in Fortran, Second Ed.,
! by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
! [Cambridge Univ. Press, 1992], p. 508.
! for isign = (-1,1), input: all, output: f
! approximate flop count: N*(5*log2(N) + 18)/nvp
! where N = (nx/2)*ny
! indx/indy = exponent which determines length in x/y direction,
! where nx=2**indx, ny=2**indy
! if isign = -1, inverse sine-sine-cosine transforms are performed
! f(1:2,n,k) = (1/nx*ny)*sum(f(2:3,j,k)*sin(pi*n*j/nx))
! f(3,n,k) = (1/nx*ny)*(.5*f(1,1,k) + ((-1)**n)*f(1,nx+1,k)
!              + sum(f(1,j,k)*cos(pi*n*j/nx)))
! if isign = 1, forward sine-sine-cosine transforms are performed
! f(1:2,j,k) = sum(f(2:3,n,k)*sin(pi*n*j/nx))
! f(3,j,k) = 2*(.5*f(1,1,k) + ((-1)**j)*f(1,n+1,k)
!              + sum(f(1,n,k)*cos(pi*n*j/nx))
! mixup = array of bit reversed addresses
! sctd = sine/cosine table
! kstrt = starting data block number
! kypi = initial y index used
! kypp = number of y indices used
! nxvh = first dimension of f >= nx/2 + 1
! kypd = second dimension of f >= kyp + 1
! nxhyd = maximum of (nx/2,ny)
! nxyd = maximum of (nx,ny)
! written by viktor k. decyk, ucla
      implicit none
      integer isign, indx, indy, kstrt, kypi, kypp
      integer nxvh, kypd, nxhyd, nxyd, mixup
      real f
      complex sctd
      dimension f(3,2*nxvh,kypd)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer indx1, indx1y, nx, nxh, nxhh, nx3, ny, nxy, nxhy, ks
      integer i, j, k, m, km, kmr, nrx, j1, j2, ns, ns2, k1, k2, kyps
      integer nrxb, jj
      real at1, at2, at3, t2, t3, t4, t5, t6, ani
      complex t1
      double precision sum1, sum2, sum3
      indx1 = indx - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      nxh = nx/2
      nxhh = nx/4
      nx3 = nx + 3
      ny = 2**indy
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 1
      kyps = kypi + kypp - 1
      if (kstrt.gt.ny) return
      if (isign.eq.0) return
      ani = 0.5/(real(nx)*real(ny))
      nrxb = nxhy/nxh
      nrx = nxy/nxh
!$OMP PARALLEL DO
!$OMP& PRIVATE(i,j,k,m,jj,ns,ns2,km,kmr,k1,k2,j1,j2,at1,at2,at3,t2,t3,t4
!$OMP& ,t5,t6,t1,sum1,sum2,sum3)
      do 150 i = kypi, kyps
! create auxiliary array in x
      kmr = nxy/nx
      sum3 = 0.5*(f(3,1,i) - f(3,nx+1,i))
      do 10 j = 2, nxh
      j1 = 1 + kmr*(j - 1)
      at3 = -aimag(sctd(j1))
      at2 = f(1,nx+2-j,i)
      at1 = f(1,j,i) + at2
      at2 = f(1,j,i) - at2
      at1 = at3*at1
      at2 = 0.5*at2
      f(1,j,i) = at1 + at2
      f(1,nx+2-j,i) = at1 - at2
      at2 = f(2,nx+2-j,i)
      at1 = f(2,j,i) + at2
      at2 = f(2,j,i) - at2
      at1 = at3*at1
      at2 = 0.5*at2
      f(2,j,i) = at1 + at2
      f(2,nx+2-j,i) = at1 - at2
      at2 = f(3,nx+2-j,i)
      at1 = f(3,j,i) + at2
      at2 = f(3,j,i) - at2
      sum3 = sum3 + real(sctd(j1))*at2
      at2 = at3*at2
      at1 = 0.5*at1
      f(3,j,i) = at1 - at2
      f(3,nx+2-j,i) = at1 + at2
   10 continue
      f(1,1,i) = 0.0
      f(1,nxh+1,i) = 2.0*f(1,nxh+1,i)
      f(2,1,i) = 0.0
      f(2,nxh+1,i) = 2.0*f(2,nxh+1,i)
      f(3,1,i) = 0.5*(f(3,1,i) + f(3,nx+1,i))
      f(3,nx+1,i) = sum3
! bit-reverse array elements in x
      do 30 j = 1, nxh
      j1 = (mixup(j) - 1)/nrxb + 1
      if (j.lt.j1) then
         do 20 jj = 1, 3
         t2 = f(jj,2*j1-1,i)
         t3 = f(jj,2*j1,i)
         f(jj,2*j1-1,i) = f(jj,2*j-1,i)
         f(jj,2*j1,i) = f(jj,2*j,i)
         f(jj,2*j-1,i) = t2
         f(jj,2*j,i) = t3
   20    continue
      endif
   30 continue
! then transform in x
      do 70 m = 1, indx1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nxhh/ns
      kmr = 2*km*nrx
      do 60 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 50 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 40 jj = 1, 3
      t2 = real(t1)*f(jj,2*j2-1,i) - aimag(t1)*f(jj,2*j2,i)
      t3 = aimag(t1)*f(jj,2*j2-1,i) + real(t1)*f(jj,2*j2,i)
      f(jj,2*j2-1,i) = f(jj,2*j1-1,i) - t2
      f(jj,2*j2,i) = f(jj,2*j1,i) - t3
      f(jj,2*j1-1,i) = f(jj,2*j1-1,i) + t2
      f(jj,2*j1,i) = f(jj,2*j1,i) + t3
   40 continue
   50 continue
   60 continue
   70 continue
! unscramble coefficients and normalize
! inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nxh
         do 90 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 80 jj = 1, 3
         t4 = f(jj,nx3-2*j,i)
         t5 = -f(jj,nx3-2*j+1,i)
         t2 = f(jj,2*j-1,i) + t4
         t3 = f(jj,2*j,i) + t5
         t6 = f(jj,2*j-1,i) - t4
         t5 = f(jj,2*j,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,i) = ani*(t2 + t4)
         f(jj,2*j,i) = ani*(t3 + t5)
         f(jj,nx3-2*j,i) = ani*(t2 - t4)
         f(jj,nx3-2*j+1,i) = ani*(t5 - t3)
   80    continue
   90    continue
         do 100 jj = 1, 3
         f(jj,nxh+1,i) = 2.0*ani*f(jj,nxh+1,i)
         f(jj,nxh+2,i) = -2.0*ani*f(jj,nxh+2,i)
         t2 = 2.0*ani*(f(jj,1,i) + f(jj,2,i))
         f(jj,2,i) = 2.0*ani*(f(jj,1,i) - f(jj,2,i))
         f(jj,1,i) = t2
         f(jj,nx+1,i) = 2.0*ani*f(jj,nx+1,i)
  100    continue
! forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nxh
         do 120 j = 2, nxhh
         t1 = cmplx(aimag(sctd(1+kmr*(j-1))),-real(sctd(1+kmr*(j-1))))
         do 110 jj = 1, 3
         t4 = f(jj,nx3-2*j,i)
         t5 = -f(jj,nx3-2*j+1,i)
         t2 = f(jj,2*j-1,i) + t4
         t3 = f(jj,2*j,i) + t5
         t6 = f(jj,2*j-1,i) - t4
         t5 = f(jj,2*j,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         f(jj,2*j-1,i) = t2 + t4
         f(jj,2*j,i) = t3 + t5
         f(jj,nx3-2*j,i) = t2 - t4
         f(jj,nx3-2*j+1,i) = t5 - t3
  110    continue
  120    continue
         do 130 jj = 1, 3
         f(jj,nxh+1,i) = 2.0*f(jj,nxh+1,i)
         f(jj,nxh+2,i) = -2.0*f(jj,nxh+2,i)
         t2 = 2.0*(f(jj,1,i) + f(jj,2,i))
         f(jj,2,i) = 2.0*(f(jj,1,i) - f(jj,2,i))
         f(jj,1,i) = t2
         f(jj,nx+1,i) = 2.0*f(jj,nx+1,i)
  130    continue
      endif
! perform recursion for sine-sine transform
      sum1 = 0.5*f(1,1,i)
      f(1,1,i) = 0.0
      f(1,2,i) = sum1
      sum2 = 0.5*f(2,1,i)
      f(2,1,i) = 0.0
      f(2,2,i) = sum2
      sum3 = f(3,nx+1,i)
      f(3,nx+1,i) = f(3,2,i)
      f(3,2,i) = sum3
      do 140 j = 2, nxh
      sum1 = sum1 + f(1,2*j-1,i)
      f(1,2*j-1,i) = -f(1,2*j,i)
      f(1,2*j,i) = sum1
      sum2 = sum2 + f(2,2*j-1,i)
      f(2,2*j-1,i) = -f(2,2*j,i)
      f(2,2*j,i) = sum2
      sum3 = sum3 - f(3,2*j,i)
      f(3,2*j,i) = sum3
  140 continue
      f(1,nx+1,i) = 0.0
      f(2,nx+1,i) = 0.0
  150 continue
!$OMP END PARALLEL DO
      return
      end
!-----------------------------------------------------------------------
      subroutine PPFSSCT2RM23Y(g,isign,mixup,sctd,indx,indy,kstrt,kxpi, &
     &kxpp,nyv,kxpd,nxhyd,nxyd)
! this subroutine performs the y part of 3 two dimensional fast real
! sine transforms and their inverses, for a subset of x,
! using real arithmetic, with OpenMP,
! for data which is distributed in blocks
! x/y component has a sine transform, z component a cosine transform
! algorithm is described in Numerical Recipies in Fortran, Second Ed.,
! by W. H. Press, B. P. Flannery, S. A. Teukolsky, and W. T. Vetterling, 
! [Cambridge Univ. Press, 1992], p. 508.
! for isign = (-1,1), input: all, output: f
! approximate flop count: N*(5*log2(N) + 18)/nvp
! where N = (nx/2)*ny
! indx/indy = exponent which determines length in x/y direction,
! where nx=2**indx, ny=2**indy
! if isign = -1, inverse sine-sine-cosine transform are performed
! g(1:2,m,n) = sum(g(1:2,k,n)*sin(pi*m*k/ny))
! g(3,m,n) = (.5*g(3,1,n) + ((-1)**m)*g(3,ny+1,n)
!              + sum(g(3,k,n)*cos(pi*m*k/ny))
! if isign = 1, a forward sine-sine-cosine transforms are performed
! g(1:2,k,n) = sum(g(1:2,m,n)*sin(pi*m*k/ny))
! g(3,k,n) = 2*(.5*g(3,1,n) + ((-1)**m)*g(3,ny+1,n)
!              + sum(g(2,m,n)*cos(pi*m*k/ny))
! mixup = array of bit reversed addresses
! sctd = sine/cosine table
! kstrt = starting data block number
! kxpi = initial x index used
! kxpp = number of x indices used
! nyv = first dimension of g >= ny + 1
! kxpd = second dimension of g >= kxp + 1
! nxhyd = maximum of (nx/2,ny)
! nxyd = maximum of (nx,ny)
! written by viktor k. decyk, ucla
      implicit none
      integer isign, indx, indy, kstrt, kxpi, kxpp
      integer nyv, kxpd, nxhyd, nxyd, mixup
      real g
      complex sctd
      dimension g(3,nyv,kxpd)
      dimension mixup(nxhyd), sctd(nxyd)
! local data
      integer indx1, indy1, indx1y, nx, ny, nyh, nyhh, ny3, nxy, nxhy
      integer i, j, k, m, ks, km, kmr, nry, j1, j2, ns, ns2, k1, k2
      integer kxps, nryb, jj
      real at1, at2, at3, t2, t3, t4, t5, t6
      complex t1
      double precision sum1, sum2, sum3
      indx1 = indx - 1
      indy1 = indy - 1
      indx1y = max0(indx1,indy)
      nx = 2**indx
      ny = 2**indy
      nyh = ny/2
      nyhh = ny/4
      ny3 = ny + 3
      nxy = max0(nx,ny)
      nxhy = 2**indx1y
      ks = kstrt - 1
      kxps = kxpi + kxpp - 1
      if (kstrt.gt.nx) return
      if (isign.eq.0) return
      nryb = nxhy/nyh
      nry = nxy/nyh
!$OMP PARALLEL DO
!$OMP& PRIVATE(i,j,k,m,jj,ns,ns2,km,kmr,k1,k2,j1,j2,at1,at2,at3,t2,t3,t4
!$OMP& ,t5,t6,t1,sum1,sum2,sum3)
      do 150 i = kxpi, kxps
! create auxiliary array in y
      kmr = nxy/ny
      sum3 = 0.5*(g(3,1,i) - g(3,ny+1,i))
      do 10 k = 2, nyh
      k1 = 1 + kmr*(k - 1)
      at3 = -aimag(sctd(k1))
      at2 = g(1,ny+2-k,i)
      at1 = g(1,k,i) + at2
      at2 = g(1,k,i) - at2
      at1 = at3*at1
      at2 = 0.5*at2
      g(1,k,i) = at1 + at2
      g(1,ny+2-k,i) = at1 - at2
      at2 = g(2,ny+2-k,i)
      at1 = g(2,k,i) + at2
      at2 = g(2,k,i) - at2
      at1 = at3*at1
      at2 = 0.5*at2
      g(2,k,i) = at1 + at2
      g(2,ny+2-k,i) = at1 - at2
      at2 = g(3,ny+2-k,i)
      at1 = g(3,k,i) + at2
      at2 = g(3,k,i) - at2
      sum3 = sum3 + real(sctd(k1))*at2
      at2 = at3*at2
      at1 = 0.5*at1
      g(3,k,i) = at1 - at2
      g(3,ny+2-k,i) = at1 + at2
   10 continue
      g(1,1,i) = 0.0
      g(1,nyh+1,i) = 2.0*g(1,nyh+1,i)
      g(2,1,i) = 0.0
      g(2,nyh+1,i) = 2.0*g(2,nyh+1,i)
      g(3,1,i) = 0.5*(g(3,1,i) + g(3,ny+1,i))
      g(3,ny+1,i) = sum3
! bit-reverse array elements in y
      do 30 k = 1, nyh
      k1 = (mixup(k) - 1)/nryb + 1
      if (k.lt.k1) then
         do 20 jj = 1, 3
         t2 = g(jj,2*k1-1,i)
         t3 = g(jj,2*k1,i)
         g(jj,2*k1-1,i) = g(jj,2*k-1,i)
         g(jj,2*k1,i) = g(jj,2*k,i)
         g(jj,2*k-1,i) = t2
         g(jj,2*k,i) = t3
   20    continue
      endif
   30 continue
! then transform in y
      do 70 m = 1, indy1
      ns = 2**(m - 1)
      ns2 = ns + ns
      km = nyhh/ns
      kmr = 2*km*nry
      do 60 k = 1, km
      k1 = ns2*(k - 1)
      k2 = k1 + ns
      do 50 j = 1, ns
      j1 = j + k1
      j2 = j + k2
      t1 = sctd(1+kmr*(j-1))
      do 40 jj = 1, 3
      t2 = real(t1)*g(jj,2*j2-1,i) - aimag(t1)*g(jj,2*j2,i)
      t3 = aimag(t1)*g(jj,2*j2-1,i) + real(t1)*g(jj,2*j2,i)
      g(jj,2*j2-1,i) = g(jj,2*j1-1,i) - t2
      g(jj,2*j2,i) = g(jj,2*j1,i) - t3
      g(jj,2*j1-1,i) = g(jj,2*j1-1,i) + t2
      g(jj,2*j1,i) = g(jj,2*j1,i) + t3
   40 continue
   50 continue
   60 continue
   70 continue
! unscramble coefficients and normalize
! inverse fourier transform
      if (isign.lt.0) then
         kmr = nxy/nyh
         do 90 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 80 jj = 1, 3
         t4 = g(jj,ny3-2*k,i)
         t5 = -g(jj,ny3-2*k+1,i)
         t2 = g(jj,2*k-1,i) + t4
         t3 = g(jj,2*k,i) + t5
         t6 = g(jj,2*k-1,i) - t4
         t5 = g(jj,2*k,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,i) = 0.5*(t2 + t4)
         g(jj,2*k,i) = 0.5*(t3 + t5)
         g(jj,ny3-2*k,i) = 0.5*(t2 - t4)
         g(jj,ny3-2*k+1,i) = 0.5*(t5 - t3)
   80    continue
   90    continue
         do 100 jj = 1, 3
         g(jj,nyh+1,i) = g(jj,nyh+1,i)
         g(jj,nyh+2,i) = -g(jj,nyh+2,i)
         t2 = g(jj,1,i) + g(jj,2,i)
         g(jj,2,i) = g(jj,1,i) - g(jj,2,i)
         g(jj,1,i) = t2
         g(jj,ny+1,i) = g(jj,ny+1,i)
  100    continue
! forward fourier transform
      else if (isign.gt.0) then
         kmr = nxy/nyh
         do 120 k = 2, nyhh
         t1 = cmplx(aimag(sctd(1+kmr*(k-1))),-real(sctd(1+kmr*(k-1))))
         do 110 jj = 1, 3
         t4 = g(jj,ny3-2*k,i)
         t5 = -g(jj,ny3-2*k+1,i)
         t2 = g(jj,2*k-1,i) + t4
         t3 = g(jj,2*k,i) + t5
         t6 = g(jj,2*k-1,i) - t4
         t5 = g(jj,2*k,i) - t5
         t4 = t6*real(t1) - t5*aimag(t1)
         t5 = t6*aimag(t1) + t5*real(t1)
         g(jj,2*k-1,i) = t2 + t4
         g(jj,2*k,i) = t3 + t5
         g(jj,ny3-2*k,i) = t2 - t4
         g(jj,ny3-2*k+1,i) = t5 - t3
  110    continue
  120    continue
         do 130 jj = 1, 3
         g(jj,nyh+1,i) = 2.0*g(jj,nyh+1,i)
         g(jj,nyh+2,i) = -2.0*g(jj,nyh+2,i)
         t2 = 2.0*(g(jj,1,i) + g(jj,2,i))
         g(jj,2,i) = 2.0*(g(jj,1,i) - g(jj,2,i))
         g(jj,1,i) = t2
         g(jj,ny+1,i) = 2.0*g(jj,ny+1,i)
  130    continue
      endif
! perform recursion for sine-cosine transform
      sum1 = 0.5*g(1,1,i)
      g(1,1,i) = 0.0
      g(1,2,i) = sum1
      sum2 = 0.5*g(2,1,i)
      g(2,1,i) = 0.0
      g(2,2,i) = sum2
      sum3 = g(3,ny+1,i)
      g(3,ny+1,i) = g(3,2,i)
      g(3,2,i) = sum3
      do 140 k = 2, nyh
      sum1 = sum1 + g(1,2*k-1,i)
      g(1,2*k-1,i) = -g(1,2*k,i)
      g(1,2*k,i) = sum1
      sum2 = sum2 + g(2,2*k-1,i)
      g(2,2*k-1,i) = -g(2,2*k,i)
      g(2,2*k,i) = sum2
      sum3 = sum3 - g(3,2*k,i)
      g(3,2*k,i) = sum3
  140 continue
      g(1,ny+1,i) = 0.0
      g(2,ny+1,i) = 0.0
  150 continue
!$OMP END PARALLEL DO
      return
      end
!-----------------------------------------------------------------------
      subroutine MPPDIVFD2(f,df,nx,ny,kstrt,ndim,nyv,kxp2)
! this subroutine calculates the divergence in fourier space
! with dirichlet boundary conditions (zero potential)
! using fast sine/cosine transforms for distributed data.
! intended for calculating the charge density from the electric field
! input: all except df, output: df
! approximate flop count is: 6*nx*ny
! the divergence is calculated using the equation:
! df(kx,ky) = -(kx*fx(kx,ky)+ky*fy(kx,ky))
! where kx = pi*j/nx, ky = pi*k/ny, and j,k = fourier mode numbers,
! all for fourier mode (jj-1,k-1), where jj = j + kxp2*(kstrt - 1)
! modes nx and ny are zeroed out
! nx/ny = system length in x/y direction
! ndim = number of field arrays, must be >= 2
! kstrt = starting data block number
! nyv = first dimension of field arrays, must be >= ny+1
! kxp2 = number of data values per block
      implicit none
      integer nx, ny, kstrt, ndim, nyv ,kxp2
      real f, df
      dimension f(ndim,nyv,kxp2+1), df(nyv,kxp2+1)
! local data
      integer j, k, ks, ny1, joff, kxp2s
      real dnx, dny, dkx, dky
      if (ndim.lt.2) return
      ks = kstrt - 1
      ny1 = ny + 1
      joff = kxp2*ks
      kxp2s = min(kxp2,max(0,nx-joff))
      joff = joff - 1
      dnx = 6.28318530717959/real(nx + nx)
      dny = 6.28318530717959/real(ny + ny)
! calculate the divergence
      if (kstrt.gt.nx) return
! mode numbers 0 < kx < nx and 0 < ky < ny
!$OMP PARALLEL DO PRIVATE(j,k,dkx,dky)
      do 20 j = 1, kxp2s
      dkx = dnx*real(j + joff)
      if ((j+joff).gt.0) then
         do 10 k = 2, ny
         dky = dny*real(k - 1)
         df(k,j) = -(dkx*f(1,k,j) + dky*f(2,k,j))
   10    continue
      endif
! mode numbers ky = 0, ny
      df(1,j) = 0.0
      df(ny+1,j) = 0.0
   20 continue
!$OMP END PARALLEL DO
! mode numbers kx = 0, nx
      if (ks.eq.0) then
         do 30 k = 2, ny
         df(k,1) = 0.0
   30    continue
      endif
      do 40 k = 1, ny1
      df(k,kxp2s+1) = 0.0
   40 continue
      return
      end
!-----------------------------------------------------------------------
      subroutine MPPGRADFD2(df,f,nx,ny,kstrt,ndim,nyv,kxp2)
! this subroutine calculates the gradient in fourier space
! with dirichlet boundary conditions (zero potential)
! using fast sine/cosine transforms for distributed data.
! intended for calculating the electric field from the potential
! input: all except f, output: f
! approximate flop count is: 4*nx*ny
! the gradient is calculated using the equations:
! fx(kx,ky) = kx*df(kx,ky)
! fy(kx,ky) = ky*df(kx,ky)
! where kx = pi*j/nx, ky = pi*k/ny, and j,k = fourier mode numbers,
! all for fourier mode (jj-1,k-1), where jj = j + kxp2*(kstrt - 1)
! modes nx and ny are zeroed out
! nx/ny = system length in x/y direction
! ndim = number of field arrays, must be >= 2
! kstrt = starting data block number
! nyv = first dimension of field arrays, must be >= ny+1
! kxp2 = number of data values per block
      implicit none
      integer nx, ny, kstrt, ndim, nyv, kxp2
      real df, f
      dimension df(nyv,kxp2+1), f(ndim,nyv,kxp2+1)
! local data
      integer j, k, ks, ny1, joff, kxp2s
      real dnx, dny, dkx, dky
      ks = kstrt - 1
      ny1 = ny + 1
      joff = kxp2*ks
      kxp2s = min(kxp2,max(0,nx-joff))
      joff = joff - 1
      dnx = 6.28318530717959/real(nx + nx)
      dny = 6.28318530717959/real(ny + ny)
! calculate the gradient
      if (kstrt.gt.nx) return
! mode numbers 0 < kx < nx and 0 < ky < ny
!$OMP PARALLEL DO PRIVATE(j,k,dkx,dky)
      do 20 j = 1, kxp2s
      dkx = dnx*real(j + joff)
      if ((j+joff).gt.0) then
         do 10 k = 2, ny
         dky = dny*real(k - 1)
         f(1,k,j) = dkx*df(k,j)
         f(2,k,j) = dky*df(k,j)
   10    continue
      endif
! mode numbers ky = 0, ny
      f(1,1,j) = 0.0
      f(2,1,j) = 0.0
      f(1,ny+1,j) = 0.0
      f(2,ny+1,j) = 0.0
   20 continue
!$OMP END PARALLEL DO
! mode numbers kx = 0, nx
      if (ks.eq.0) then
         do 30 k = 2, ny
         f(1,k,1) = 0.0
         f(2,k,1) = 0.0
   30    continue
      endif
      do 40 k = 1, ny1
      f(1,k,kxp2s+1) = 0.0
      f(2,k,kxp2s+1) = 0.0
   40 continue
      return
      end
!-----------------------------------------------------------------------
      subroutine MPPCURLFD2(f,g,nx,ny,kstrt,nyv,kxp2)
! this subroutine calculates the curl in fourier space
! with dirichlet boundary conditions (zero potential)
! using fast sine/cosine transforms for distributed data.
! intended for calculating the magnetic field from the vector potential
! input: all except g, output: g
! approximate flop count is: 8*nx*ny
! the curl is calculated using the equations:
! gx(kx,ky) = ky*fz(kx,ky)
! gy(kx,ky) = -kx*fz(kx,ky)
! gz(kx,ky) = (kx*fy(kx,ky)-ky*fx(kx,ky))
! where kx = pi*j/nx, ky = pi*k/ny, and j,k = fourier mode numbers,
! all for fourier mode (jj-1,k-1), where jj = j + kxp2*(kstrt - 1)
! nx/ny = system length in x/y direction
! kstrt = starting data block number
! nyv = first dimension of field arrays, must be >= ny+1
! kxp2 = number of data values per block
      implicit none
      integer nx, ny, kstrt, nyv, kxp2
      real f, g
      dimension f(3,nyv,kxp2+1), g(3,nyv,kxp2+1)
! local data
      integer j, k, ks, ny1, joff, kxp2s
      real dnx, dny, dkx, dky
      ks = kstrt - 1
      ny1 = ny + 1
      joff = kxp2*ks
      kxp2s = min(kxp2,max(0,nx-joff))
      joff = joff - 1
      dnx = 6.28318530717959/real(nx + nx)
      dny = 6.28318530717959/real(ny + ny)
! calculate the curl
      if (kstrt.gt.nx) return
! mode numbers 0 < kx < nx and 0 < ky < ny
!$OMP PARALLEL DO PRIVATE(j,k,dkx,dky)
      do 20 j = 1, kxp2s
      dkx = dnx*real(j + joff)
      if ((j+joff).gt.0) then
         do 10 k = 2, ny
         dky = dny*real(k - 1)
         g(1,k,j) = dky*f(3,k,j)
         g(2,k,j) = -dkx*f(3,k,j)
         g(3,k,j) = dkx*f(2,k,j) - dky*f(1,k,j)
   10    continue
! mode numbers ky = 0, ny
         g(1,1,j) = 0.0
         g(2,1,j) = 0.0
         g(3,1,j) = dkx*f(2,1,j)
      endif
      g(1,ny+1,j) = 0.0
      g(2,ny+1,j) = 0.0
      g(3,ny+1,j) = 0.0
   20 continue
!$OMP END PARALLEL DO
! mode numbers kx = 0, nx
      if (ks.eq.0) then
         do 30 k = 2, ny
         dky = dny*real(k - 1)
         g(1,k,1) = 0.0
         g(2,k,1) = 0.0
         g(3,k,1) = -dky*f(1,k,1)
   30    continue
         g(1,1,1) = 0.0
         g(2,1,1) = 0.0
         g(3,1,1) = 0.0
      endif
      do 40 k = 1, ny1
      g(1,k,kxp2s+1) = 0.0
      g(2,k,kxp2s+1) = 0.0
      g(3,k,kxp2s+1) = 0.0
   40 continue
      return
      end