SVD computation subroutine

```      subroutine svdcomp(A,S,V,svord,m,n,isw,error)
implicit none
c
c     mxweep : maximum number of sweeps allowed
c     mxnupd : maximum number of implicit norm updates
c     epsilon : machine precision
c     tol : tolerance used for termination of algorithm
c
integer mxweep,mxnupd
real epsilon,tol
parameter(mxweep=30)
parameter(mxnupd=10)
parameter(epsilon=0.5e-6,tol=5.0e-2)
c
c     Error code returned
c
integer error
c
c     Dimensions of A
c
integer m,n
c
c     SVD arrays
c     as input A contains the original matrix
c     as output A contains matrix U of SVD
c
real A(*),S(*),V(*)
cmf\$  layout A(:serial),S(:serial),V(:serial)
c
c     Ordering of singular values
c
integer svord(*)
cmf\$  layout svord(:serial)
c
c     CM arrays
c
c     U1,U2 : each one contains half the columns of A and an extra column
c             as a buffer for shifting
c     V1,V2 : each one contains half the columns of V and an extra column
c             as a buffer for shifting
c     alpha,beta : square of the norms of the columns of U1,U2
c		   will also hold sine theta, cosine theta for Jacobi
c		   transformations
c		   at the end of the run they will hold the singular values
c     gamma : inner product of columns in the same processor
c     inprod : normalized inner products used to determine if transformations
c              are applied based on thresholds. Also used to determine
c              termination of algorithm
c     buffer : holds temporary results
c     U1norm : squares of column 2-norms of U1
c     U2norm : squares of column 2-norms of U2
c     sing : stores singular values for transmission to front end and
c            for sorting in descending order
c     order : ordering of singular values
c
real U1(m,(n+1)/2)
cmf\$  layout U1(:serial,:news)
real U2(m,(n+1)/2)
cmf\$  layout U2(:serial,:news)
real V1(m,(n+1)/2)
cmf\$  layout V1(:serial,:news)
real V2(m,(n+1)/2)
cmf\$  layout V2(:serial,:news)
real alpha((n+1)/2),beta((n+1)/2)
cmf\$  layout alpha(:news),beta(:news)
real gamma((n+1)/2),inprod((n+1)/2)
cmf\$  layout gamma(:news),inprod(:news)
real buffer(m,(n+1)/2)
cmf\$  layout buffer(:serial,:news)
real U1norm((n+1)/2),U2norm((n+1)/2)
cmf\$  layout U1norm(:news),U2norm(:news)
real sing(n)
cmf\$  layout sing(:news)
integer order(n)
cmf\$  layout order(:news)
c
c     lc : last column in U1
c
integer lc
c
c     sflag : true if stopping criterion has been met
c
logical sflag
c
c     Temporary variables used in conjunction with the computation
c     of smax and smin
c
real max1,max2,min1,min2
c
c     Loop counters
c     isw : sweep counter
c     irot : rotation counter
c     nupd : number of times implicit norm update has been used
c
integer i,j,k,isw,irot,nupd
c
c     Timing variables
c
integer itimer1,itimer2
real flops
real nsum,nsub,nmul,ndiv,nsqrt,nabs,nsign
real wsum,wsub,wmul,wdiv,wsqrt,wabs,wsign
data wsum/1.0/
data wsub/1.0/
data wmul/1.0/
data wdiv/1.0/
data wsqrt/1.0/
data wabs/1.0/
data wsign/1.0/
c
data itimer1/1/
data itimer2/2/
c
c     Initialize U1,U2 <- A
c
lc = (n+1)/2
do i=1,m-1,2
call cmf_to_cm(U1(i,:),A(n*(i-1)+1),lc)
call cmf_to_cm(U2(i,:),A(n*(i-1)+lc+1),n-lc)
call cmf_to_cm(U1(i+1,:),A(n*i+1),lc)
call cmf_to_cm(U2(i+1,:),A(n*i+lc+1),n-lc)
enddo
if (2*(m/2) .ne. m) then
call cmf_to_cm(U1(m,:),A(n*(m-1)+1),lc)
call cmf_to_cm(U2(m,:),A(n*(m-1)+lc+1),n-lc)
endif
if (lc .ne. n-lc) U2(:,lc) = 0.0
c
c     Initialize V1,V2 <- I
c
V1 = 0.0
V2 = 0.0
forall (i=1:lc) V1(i,i) = 1.0
forall (i=1:n-lc) V2(i+lc,i) = 1.0
c
c     Initialize looping variables
c
isw = 0
sflag = .false.
nupd = mxnupd
c
c     Start timer 1
c
call CM_timer_start(itimer1)
c
c     Loop over all sweeps
c
do while (isw .lt. mxweep .and. .not. sflag)
isw = isw + 1
sflag = .true.
c
c     Loop over all rotations
c
do irot=1,2*lc-1
c
c	 compute inner products
c
nupd = nupd + 1
if (nupd .ge. mxnupd) then
U1norm = 0.0
U2norm = 0.0
do i=1,m-1,2
U1norm = U1norm + U1(i,:) * U1(i,:)
U1norm = U1norm + U1(i+1,:) * U1(i+1,:)
U2norm = U2norm + U2(i,:) * U2(i,:)
U2norm = U2norm + U2(i+1,:) * U2(i+1,:)
enddo
if (2*(m/2) .ne. m) then
U1norm = U1norm + U1(m,:) * U1(m,:)
U2norm = U2norm + U2(m,:) * U2(m,:)
endif
nupd = 0
endif
gamma = 0.0
do i=1,m-1,2
gamma = gamma + U1(i,:) * U2(i,:)
gamma = gamma + U1(i+1,:) * U2(i+1,:)
enddo
if (2*(m/2) .ne. m) gamma = gamma + U1(m,:) * U2(m,:)
c
c	If any of the normalized inner products is greater than
c	the tolerance we must execute the next sweep after this one
c
where (U1norm .eq. 0.0) gamma = 0.0
where (U2norm .eq. 0.0) gamma = 0.0
where (gamma .eq. 0.0)
inprod = 0.0
elsewhere
inprod = abs(gamma / sqrt(U1norm * U2norm))
endwhere
if (any(inprod .gt. tol)) sflag = .false.
c
c	In this set of calculations alpha and beta are used for
c	temporary storage and at the end they will have the
c	contents of the Jacobi rotation matrices
c	gamma is also used for temporary storage
c
where (inprod .gt. epsilon)
c
c	eta = (U2norm - U1norm) / (2.0 * gamma) stored in alpha
c
alpha = (U2norm - U1norm) / (2.0 * gamma)
c
c	t = sign(eta)/(abs(eta) + sqrt(1.0+eta^2)) stored in beta
c
beta = sign(1.0,alpha) / (abs(alpha) +
*			sqrt(1.0+alpha*alpha))
c
c       Update norms
c
U1norm = U1norm - beta * gamma
U2norm = U2norm + beta * gamma
c
c	cos(theta) = 1.0/sqrt(1.0*t^2) stored in alpha
c
alpha = 1.0/sqrt(1.0+beta*beta)
c
c	sin(theta) = t * cos(theta) stored in beta
c
beta = beta * alpha
elsewhere
alpha = 1.0
beta = 0.0
endwhere
c
c	Apply one sided Jacobi transformations, use gamma variable as buffer
c
do i=1,m-1,2
gamma = U1(i,:) * alpha - U2(i,:) * beta
U2(i,:) = U1(i,:) * beta + U2(i,:) * alpha
U1(i,:) = gamma
gamma = U1(i+1,:) * alpha - U2(i+1,:) * beta
U2(i+1,:) = U1(i+1,:) * beta + U2(i+1,:) * alpha
U1(i+1,:) = gamma
gamma = V1(i,:) * alpha - V2(i,:) * beta
V2(i,:) = V1(i,:) * beta + V2(i,:) * alpha
V1(i,:) = gamma
gamma = V1(i+1,:) * alpha - V2(i+1,:) * beta
V2(i+1,:) = V1(i+1,:) * beta + V2(i+1,:) * alpha
V1(i+1,:) = gamma
enddo
if (2*(m/2) .ne. m) then
gamma = U1(m,:) * alpha - U2(m,:) * beta
U2(m,:) = U1(m,:) * beta + U2(m,:) * alpha
U1(m,:) = gamma
gamma = V1(m,:) * alpha - V2(m,:) * beta
V2(m,:) = V1(m,:) * beta + V2(m,:) * alpha
V1(m,:) = gamma
endif
c
c	Shifting when lc > 2
c
c	    call CM_timer_start(itimer2)
if (lc .gt. 2) then
c
c	Shift the columns of U1,U2,V1,V2 to obtain next
c       combination of columns in sweep
c
buffer = cshift(U2,dim=2,shift=-1)
buffer(:,1) = U1(:,1)
buffer(:,lc) = U1(:,lc)
U1 = cshift(U1,dim=2,shift=-1)
U2 = cshift(U2,dim=2,shift=1)
U1(:,1:2) = buffer(:,1:2)
U2(:,lc) = buffer(:,lc)
c
buffer = cshift(V2,dim=2,shift=-1)
buffer(:,1) = V1(:,1)
buffer(:,lc) = V1(:,lc)
V1 = cshift(V1,dim=2,shift=-1)
V2 = cshift(V2,dim=2,shift=1)
V1(:,1:2) = buffer(:,1:2)
V2(:,lc) = buffer(:,lc)

c        Shift squares of column norms U1norm, U2norm

gamma = cshift(U2norm,dim=1,shift=-1)
gamma(1) = U1norm(1)
gamma(lc) = U1norm(lc)
U1norm = cshift(U1norm,dim=1,shift=-1)
U2norm = cshift(U2norm,dim=1,shift=1)
U1norm(1:2) = gamma(1:2)
U2norm(lc) = gamma(lc)
elseif (lc .eq. 2) then
c
c	Shift when lc = 2
c
buffer = cshift(U2,dim=2,shift=-1)
U2(:,2) = U1(:,2)
U1(:,2) = buffer(:,2)
U2(:,1) = buffer(:,1)
c
buffer = cshift(V2,dim=2,shift=-1)
V2(:,2) = V1(:,2)
V1(:,2) = buffer(:,2)
V2(:,1) = buffer(:,1)
c
gamma = cshift(U2norm,dim=1,shift=-1)
U2norm(2) = U1norm(2)
U1norm(2) = gamma(2)
U2norm(1) = gamma(1)
endif
c	    flops = 0.0
c	    Call MY_stop_timer('shift '//char(0),itimer2,0,flops)
enddo
enddo
c
c   Timing information
c
nsum = wsum * float(m*2*lc + (m+3)*lc + 2*(m-1)*lc/mxnupd)
nsub = wsub * float(m*2*lc + 4*lc)
nmul = wmul * float(m*lc*(9+2/mxnupd) + 7*lc)
ndiv = wdiv * float(4*lc)
nsqrt = wsqrt * float(3*lc)
nabs = wabs * float(2*lc)
nsign = wsign * float(lc)
flops = float(isw * (2*lc-1)) *
*			(nsum+nsub+nmul+ndiv+nsqrt+nabs+nsign)
call MY_stop_timer('main loop '//char(0),itimer1,0,flops)
if (sflag) then
error = 0
else
error = 1
endif
c
c	Compute singular values and U
c

buffer = U1 * U1
alpha = sqrt(sum(buffer,dim=1))
where (buffer .ne. 0.0) U1 = U1 / buffer
c
buffer = U2 * U2
beta = sqrt(sum(buffer,dim=1))
where (buffer .ne. 0.0) U2 = U2 / buffer
c
c	Set output arrays A,S,V to U,SIGMA,V
c
do i=1,m
call cmf_from_cm(A(n*(i-1)+1),U1(i,:),lc)
call cmf_from_cm(A(n*(i-1)+lc+1),U2(i,:),n-lc)
enddo
c
sing(1:lc) = alpha
sing(lc+1:n) = beta(1:n-lc)
call cmf_from_cm(S,sing,n)
c
do i=1,n
call cmf_from_cm(V(n*(i-1)+1),V1(i,:),lc)
call cmf_from_cm(V(n*(i-1)+lc+1),V2(i,:),n-lc)
enddo
c
c     Generate singular value ordering vector svord
c
call cmf_order(order,sing,1,.true.)
do i=1,n
svord(n+1-order(i)) = i
enddo
return
end
```