ceo 0.1.0 - Docs.rs

% -*- mode: Noweb; noweb-code-mode: c-mode -*-
@
\index{iterativeSolvers}
\section{The files}

\subsection{Header}

<<iterativeSolvers.h>>=
#ifndef __ITERATIVESOLVERS_H__
#define __ITERATIVESOLVERS_H__

#ifndef __UTILITIES_H__
#include "utilities.h"
#endif

#ifndef __BTBT_H__
#include "BTBT.h"
#endif

//#define MINRES_DEBUG

struct iterativeSolvers {

  <<parameters>>

  void cg_setup(int n_vector);
  void pcg_setup(int n_vector);

  void minres_setup(int n_vector);

  void cleanup(void);

  void cg(float *x, BTBT* A, float *b, int max_it, float* x0);
  void pcg(float *x, BTBT* A, float *b, int max_it, float* x0, float ip);

  void sym_ortho(float *c,float *s,float *r,
		 float *a,float *b);

  void lanczos_step(float *alpha, float *beta, float *nu_kp1,
		    BTBT *A, float *nu_k, float *nu_km1, float sigma);


  void minres_vorst(float *x, BTBT* A, float *b, float* x0);
  void minres_vorst(float *x, BTBT* A, float *b, int max_it, float* x0);
  void minres_vorst(float *x, BTBT* A, float *b, float rtol, float* x0);
  void pminres_vorst(float *x, BTBT* A, float *b, int max_it, float* x0, float ip);
  void minres_vorst(float *x, float *res, BTBT* A, float *b, int max_it, float* x0);

  void minres_choi(float *x, BTBT* A, float *b, int max_it, float* x0);

  //  void info(void);
};
#endif // __ITERATIVESOLVERS_H__
@
\subsection{Source}

<<iterativeSolvers.cu>>=
#include "iterativeSolvers.h"

<<CG setup>>
<<PCG setup>>
<<MINRES setup>>

<<cleanup>>

<<conjugate gradient>>
<<preconditioned conjugate gradient>>

<<sym-ortho algorithm>>

<<Lanczos step>>

<<MINRES (Vorst) I>>
<<MINRES (Vorst) II>>
<<MINRES (Vorst) III>>
<<MINRES (Vorst) with residue>>
<<MINRES (Choi)>>
@

\section{Parameters}

\index{iterativeSolvers!iterativeSolvers}
The structure parameters are
<<parameters>>=
float *d__vectors,
  *q, *x, *r, *p, *z,
  *nu_i, *nu_im1, *nu_ip1,
  *w_i, *w_im1, *w_im2;
float rnorm, rel_rnorm, mean_time_per_iteration, RTOL, ATOL;
int N, N_ITERATION, cvgce_iteration;
stopwatch tid;
cublasHandle_t handle;
cublasStatus_t status;
char VERBOSE;
@

\section{Functions}

\subsection{Setup \& Cleanup}

The iterativeSolvers structure gather several routines implementing iterative methods to solve the linear system: $Ax=b$.
The setup method is allocated the required memory depending of the iterative method type:
\begin{itemize}
\item
\index{iterativeSolvers!iterativeSolvers!cg\_setup}
<<CG setup>>=
void iterativeSolvers::cg_setup(int n_vector)
{
  INFO("@(CEO)>iterativeSolvers: CONJUGATE GRADIENT\n");
  N = n_vector;
  HANDLE_ERROR( cudaMalloc((void**)&d__vectors, sizeof(float)*N*4 ) );
  q = d__vectors;
  x = d__vectors + N;
  r = d__vectors + 2*N;
  p = d__vectors + 3*N;
  cublasCreate(&handle);
}
<<PCG setup>>=
void iterativeSolvers::pcg_setup(int n_vector)
{
  INFO("@(CEO)>iterativeSolvers: CONJUGATE GRADIENT\n");
  N = n_vector;
  HANDLE_ERROR( cudaMalloc((void**)&d__vectors, sizeof(float)*N*5 ) );
  q = d__vectors;
  x = d__vectors + N;
  r = d__vectors + 2*N;
  p = d__vectors + 3*N;
  z = d__vectors + 4*N;
  cublasCreate(&handle);
}
@  \item
\index{iterativeSolvers!iterativeSolvers!minres\_setup}
<<MINRES setup>>=
void iterativeSolvers::minres_setup(int n_vector)
{
  INFO("@(CEO)>iterativeSolvers: MINRES\n");
  N = n_vector;
  HANDLE_ERROR( cudaMalloc((void**)&d__vectors, sizeof(float)*N*6 ) );
  nu_i   = d__vectors;
  nu_im1 = d__vectors + N;
  nu_ip1 = d__vectors + 2*N;
  w_i    = d__vectors + 3*N;
  w_im1  = d__vectors + 4*N;
  w_im2  = d__vectors + 5*N;
  cublasCreate(&handle);
  VERBOSE = 0;
  ATOL = 0;
  RTOL = 5E-2;
  N_ITERATION = 100;
}
@ \end{itemize}

Memory is freed with
\index{iterativeSolvers!iterativeSolvers!cleanup}
<<cleanup>>= 
void iterativeSolvers::cleanup(void)
{
  INFO("@(CEO)>iterativeSolvers: freeing memory!\n");
  HANDLE_ERROR( cudaFree( d__vectors ) );
  cublasDestroy(handle);
}
@ 

\subsection{Conjugate gradient}
\label{sec:conjugate-gradient}

The next routine implements the conjugate gradient iterative method:
\index{iterativeSolvers!iterativeSolvers!cg}
<<conjugate gradient>>=
  void iterativeSolvers::cg(float *x, BTBT* A, float *b, int max_it, float* x0)
{
  float alpha, beta, gamma;
  int k;
  <<conjugate gradient init>>
  tid.tic();
  for (k=0;k<max_it;k++) {
    //printf("CG IT=%d - ",k);
    <<conjugate gradient loop part 1>>
      if (k<(max_it-1)) {
	  <<conjugate gradient loop part 2>>
      }
  }    
  tid.toc(&mean_time_per_iteration);
  mean_time_per_iteration /= max_it;
}
@ 
The conjugate gradient algorithm is written:
\begin{itemize}
\item initialization:
  \begin{enumerate}
  \item $r=Ax_0$
<<conjugate gradient init>>=
    A->MVM(r , x0);
@ \item $r_0 = b - r_0$
<<conjugate gradient init>>=
beta = -1;
cublasSscal(handle, N, &beta, r, 1);
alpha = 1;
cublasSaxpy(handle, N, &alpha, b, 1, r, 1);
@ \item $p_0=r_0$
<<conjugate gradient init>>=
cublasScopy(handle, N, r, 1, p, 1);
@ \end{enumerate}
\item loop:
  \begin{enumerate}
  \item $q=Ap_k$
<<conjugate gradient loop part 1>>=
    A->MVM(q , p);
@ \item $\gamma=r_k^Tr_k$
<<conjugate gradient loop part 1>>=
cublasSdot(handle, N, r, 1, r, 1, &gamma);
//printf("r norm=%6.2E\n",sqrt(gamma));
@ \item $\left\| r \right\|_2=\sqrt\gamma$
<<conjugate gradient loop part 1>>=
rnorm = sqrt(gamma);
if (gamma==0) break;
@ \item $\alpha_k = {\displaystyle \gamma\over p_K^Tq}$
<<conjugate gradient loop part 1>>=
cublasSdot(handle, N, p, 1, q, 1, &alpha);
alpha = gamma/alpha;
@ \item $x_{k+1} = x_k + \alpha_kp_k$
<<conjugate gradient loop part 1>>=
cublasSaxpy(handle, N, &alpha, p, 1, x, 1);
@ \item $r_{k+1} = r_k - \alpha_kq$
<<conjugate gradient loop part 2>>=
alpha = -alpha;
cublasSaxpy(handle, N, &alpha, q, 1, r, 1);
@ \item $\beta_k = {\displaystyle r_{k+1}^Tr_{k+1} \over \gamma} $
<<conjugate gradient loop part 2>>=
cublasSdot(handle, N, r, 1, r, 1, &beta);
beta = beta/gamma;
@ \item $p_{k+1} = r_{k+1} + \beta_k p_k$
<<conjugate gradient loop part 2>>=
cublasSscal(handle, N, &beta, p, 1);
alpha = 1;
cublasSaxpy(handle, N, &alpha, r, 1, p, 1);
@  \end{enumerate}
 \end{itemize}

\subsection{Preconditioned conjugate gradient}
\label{sec:precond-conjugate-gradient}

The next routine implements the conjugate gradient iterative method with Jacobi preconditioner.
The Jacobi preconditioner $M$ is given by
\begin{equation}
  \label{eq:1}
  M = \operatorname{diag}\left( A \right) = \sigma_A^2 I,
\end{equation}
and its inverse is simply
\begin{equation}
  \label{eq:2}
  M^{-1} = \sigma_A^{-2} I = [[ip]] I.
\end{equation}

\index{iterativeSolvers!iterativeSolvers!pcg}
<<preconditioned conjugate gradient>>=
  void iterativeSolvers::pcg(float *x, BTBT* A, float *b, 
			     int max_it, float* x0, float ip)
{
  float alpha, beta, gamma;
  int k;
  <<preconditioned conjugate gradient init>>
  tid.tic();
  for (k=0;k<max_it;k++) {
    //printf("CG IT=%d - ",k);
    <<preconditioned conjugate gradient loop part 1>>
      if (k<(max_it-1)) {
	  <<preconditioned conjugate gradient loop part 2>>
      }
  }    
  tid.toc(&mean_time_per_iteration);
  mean_time_per_iteration /= max_it;
}
@ 
The preconditioned conjugate gradient algorithm is written:
\begin{itemize}
\item initialization:
  \begin{enumerate}
  \item $r=Ax_0$
<<preconditioned conjugate gradient init>>=
    A->MVM(r , x0);
@ \item $r_0 = b - r_0$
<<preconditioned conjugate gradient init>>=
beta = -1;
cublasSscal(handle, N, &beta, r, 1);
alpha = 1;
cublasSaxpy(handle, N, &alpha, b, 1, r, 1);
@ \item $z_0=M^{-1}r_0$
<<preconditioned conjugate gradient init>>=
cublasScopy(handle, N, r, 1, z, 1);
cublasSscal(handle, N, &ip, z, 1);
@ \item $p_0=z_0$
<<preconditioned conjugate gradient init>>=
cublasScopy(handle, N, z, 1, p, 1);
@ \end{enumerate}
\item loop:
  \begin{enumerate}
  \item $q=Ap_k$
<<preconditioned conjugate gradient loop part 1>>=
    A->MVM(q , p);
@ \item $\gamma=r_k^Tz_k$
<<preconditioned conjugate gradient loop part 1>>=
cublasSdot(handle, N, r, 1, z, 1, &gamma);
//printf("r norm=%6.2E\n",sqrt(gamma));
@ \item $\left\| r \right\|_2=\sqrt\gamma$
<<preconditioned conjugate gradient loop part 1>>=
rnorm = sqrt(gamma);
if (gamma==0) break;
@ \item $\alpha_k = {\displaystyle \gamma\over p_K^Tq}$
<<preconditioned conjugate gradient loop part 1>>=
cublasSdot(handle, N, p, 1, q, 1, &alpha);
alpha = gamma/alpha;
@ \item $x_{k+1} = x_k + \alpha_kp_k$
<<preconditioned conjugate gradient loop part 1>>=
cublasSaxpy(handle, N, &alpha, p, 1, x, 1);
@ \item $r_{k+1} = r_k - \alpha_kq$
<<preconditioned conjugate gradient loop part 2>>=
alpha = -alpha;
cublasSaxpy(handle, N, &alpha, q, 1, r, 1);
@ \item $z_{k+1} = M^{-1}r_{k+1}$
<<preconditioned conjugate gradient loop part 2>>=
cublasScopy(handle, N, r, 1, z, 1);
cublasSscal(handle, N, &ip, z, 1);
@ \item $\beta_k = {\displaystyle r_{k+1}^Tz_{k+1} \over \gamma} $
<<preconditioned conjugate gradient loop part 2>>=
cublasSdot(handle, N, r, 1, z, 1, &beta);
beta = beta/gamma;
@ \item $p_{k+1} = z_{k+1} + \beta_k p_k$
<<preconditioned conjugate gradient loop part 2>>=
cublasSscal(handle, N, &beta, p, 1);
alpha = 1;
cublasSaxpy(handle, N, &alpha, z, 1, p, 1);
@  \end{enumerate}
 \end{itemize}

 \subsection{MINRES}
 \label{sec:minres}

\subsubsection{Vorst implementation}
\label{sec:vorst-implementation}

The MINRES iterative solver~\cite{PS75,Vorst03,Choi06} method is implemented in the next routine:
\index{iterativeSolvers!iterativeSolvers!minres\_vorst}
<<MINRES (Vorst) I>>=
  void iterativeSolvers::minres_vorst(float *x, BTBT* A, float *b, float* x0)
{
  float alpha, beta, gamma_i, gamma_im1, sigma_i, sigma_im1,
    eta, delta, rho1, rho2, rho3, rnorm0;
  int k;
  <<MINRES (Vorst) init>>
      //  tid.tic();
  for (k=0;k<N_ITERATION;k++) {
    //printf("MINRES IT=%d - ",k);
    <<MINRES (Vorst) loop>>
  }  
  //  tid.toc(&mean_time_per_iteration);
  //  mean_time_per_iteration /= max_it;
  cvgce_iteration = k;

  INFO("MINRES converged at iteration %d to a solution with relative residual %.2E.\n",
	 k,rel_rnorm);

}
<<MINRES (Vorst) II>>=
  void iterativeSolvers::minres_vorst(float *x, BTBT* A, float *b, int max_it, float* x0)
{
  float alpha, beta, gamma_i, gamma_im1, sigma_i, sigma_im1,
    eta, delta, rho1, rho2, rho3, rnorm0;
  int k;
  <<MINRES (Vorst) init>>
      //  tid.tic();
  for (k=0;k<max_it;k++) {
    //printf("MINRES IT=%d - ",k);
    <<MINRES (Vorst) loop>>
  }  
  //  tid.toc(&mean_time_per_iteration);
  //  mean_time_per_iteration /= max_it;
  INFO("MINRES converged at iteration %d to a solution with relative residual %.2E.\n",
	 k,rel_rnorm);
}
<<MINRES (Vorst) III>>=
  void iterativeSolvers::minres_vorst(float *x, BTBT* A, float *b, float rtol, float* x0)
{
  float alpha, beta, gamma_i, gamma_im1, sigma_i, sigma_im1,
    eta, delta, rho1, rho2, rho3, rnorm0;
  int k;
  <<MINRES (Vorst) init>>
  RTOL = rtol;
      //  tid.tic();
  for (k=0;k<N_ITERATION;k++) {
    //printf("MINRES IT=%d - ",k);
    <<MINRES (Vorst) loop>>
  }  
  //  tid.toc(&mean_time_per_iteration);
  //  mean_time_per_iteration /= max_it;
  INFO("MINRES converged at iteration %d to a solution with relative residual %.2E.\n",
	 k,rel_rnorm);
}
<<MINRES (Vorst) with residue>>=
  void iterativeSolvers::minres_vorst(float *x, float *res, BTBT* A, float *b, int max_it, float* x0)
{
  float alpha, beta, gamma_i, gamma_im1, sigma_i, sigma_im1,
    eta, delta, rho1, rho2, rho3, res_cst, result, rnorm0;
  int k;
  float *residual;
  HANDLE_ERROR( cudaMalloc( (void**)&residual, sizeof(float)*N ) );
  <<MINRES (Vorst) init>>
  tid.tic();
  for (k=0;k<max_it;k++) {
    //printf("MINRES IT=%d - ",k);
    <<MINRES (Vorst) loop>>
    A->MVM(residual , x);
    res_cst = -1;
    cublasSscal(handle, N, &res_cst, residual, 1);
    res_cst = 1;
    cublasSaxpy(handle, N, &res_cst, b, 1, residual, 1);
    cublasSnrm2(handle, N, residual, 1, &result);
    res[k] = result;
    INFO("(%3d): Residue norm: %.2E\n",k,result);
  }   
  tid.toc(&mean_time_per_iteration);
  mean_time_per_iteration /= max_it;
  HANDLE_ERROR( cudaFree( residual) );
}
@ 
The MINRES algorithm is written:
\begin{itemize}
\item initialization:
  \begin{enumerate}
  \item $r=Ax_0$
<<MINRES (Vorst) init>>=
A->MVM(nu_i , x0);
@ \item $r_0 = b - r_0$
<<MINRES (Vorst) init>>=
beta = -1;
cublasSscal(handle, N, &beta, nu_i, 1);
alpha = 1;
cublasSaxpy(handle, N, &alpha, b, 1, nu_i, 1);
#ifdef MINRES_DEBUG
cublasSnrm2(handle, N, b, 1, &beta);
INFO("beta=%.2E\n",beta);
#endif
@ \item $\beta_1=\left\| \nu_1 \right\|_2=\left\| r \right\|_2$
<<MINRES (Vorst) init>>=
  //cublasSnrm2(handle, N, nu_i, 1, &beta);
cublasSdot(handle, N, nu_i, 1, nu_i, 1, &beta);
beta = sqrtf(beta);
rnorm = beta;
//rnorm0 = rnorm;
cublasSnrm2(handle, N, b, 1, &rnorm0);
rel_rnorm = rnorm/rnorm0;
if (VERBOSE)
  INFO("#%3d: r norm=%6.2E - rel. tol.=%6.2E\n",0,rnorm,rnorm/rnorm0);
if ( (rnorm<ATOL) || (rel_rnorm<=RTOL) )
  return;
#ifdef MINRES_DEBUG
INFO("beta=%.2E\n",beta);
#endif
@ \item $\eta=\beta_1$, $\gamma_1=\gamma_0=1$, $\sigma_1=\sigma_0=0$, $\nu_0=0$
<<MINRES (Vorst) init>>=
eta = beta;
gamma_i = gamma_im1 = 1;
sigma_i = sigma_im1 = 0;
@ \item $\nu_0=0$, $w_0=w_{-1}=0$
<<MINRES (Vorst) init>>=
HANDLE_ERROR( cudaMemset(nu_im1, 0, sizeof(float)*N ) );
HANDLE_ERROR( cudaMemset(w_im1 , 0, sizeof(float)*N ) );
HANDLE_ERROR( cudaMemset(w_im2 , 0, sizeof(float)*N ) );
@ \end{enumerate}
\item loop:
  \begin{enumerate}
  \item $\nu_i = \nu_i / \beta_i$
<<MINRES (Vorst) loop>>=
rho1 = 1/beta;
cublasSscal(handle, N, &rho1, nu_i, 1);
@ \item $\nu_{i+1}= A\nu_i$
<<MINRES (Vorst) loop>>=
A->MVM(nu_ip1 , nu_i);
@ \item $\alpha_i = \nu_i^T\nu_{i+1}$
<<MINRES (Vorst) loop>>=
cublasSdot(handle, N, nu_i, 1, nu_ip1, 1, &alpha);
#ifdef MINRES_DEBUG
INFO("alpha=%.2E\n",alpha);
#endif
@ \item $\nu_{i+1} = \nu_{i+1} - \alpha_i\nu_i - \beta_i\nu_{i-1}$
<<MINRES (Vorst) loop>>=
alpha = -alpha;
beta  = -beta;
cublasSaxpy(handle, N, &alpha, nu_i, 1, nu_ip1, 1);
cublasSaxpy(handle, N, &beta, nu_im1, 1, nu_ip1, 1);
alpha = -alpha;
beta  = -beta;
@ \item $\delta = \gamma_i\alpha_i - \gamma_{i-1}\sigma_i\beta_i$
<<MINRES (Vorst) loop>>=
delta = gamma_i*alpha - gamma_im1*sigma_i*beta;
#ifdef MINRES_DEBUG
INFO("delta=%.2E\n",delta);
#endif
@ \item $\rho_2 = \sigma_i\alpha_i + \gamma_{i-1}\gamma_i\beta_i$
<<MINRES (Vorst) loop>>=
rho2 = sigma_i*alpha + gamma_im1*gamma_i*beta;
#ifdef MINRES_DEBUG
INFO("rho2=%.2E\n",rho2);
#endif
@ \item $\rho_3 = \sigma_{i-1}\beta_i$
<<MINRES (Vorst) loop>>=
rho3 = sigma_im1*beta;
#ifdef MINRES_DEBUG
INFO("rho3=%.2E\n",rho3);
#endif
@ \item $\beta_{i+1}=\left\| \nu_{i+1} \right\|_2$
<<MINRES (Vorst) loop>>=
  //cublasSnrm2(handle, N, nu_ip1, 1, &beta);
cublasSdot(handle, N, nu_ip1, 1, nu_ip1, 1, &beta);
beta = sqrtf(beta);
#ifdef MINRES_DEBUG
INFO("beta=%.2E\n",beta);
#endif
@ \item $\rho_1 = \sqrt{\delta^2+\beta_{i+1}^2}$
<<MINRES (Vorst) loop>>=
rho1 = hypotf(delta,beta);
#ifdef MINRES_DEBUG
INFO("rho1=%.2E\n",rho1);
#endif
@ \item $\gamma_{i+1} = \delta / \rho_1$
<<MINRES (Vorst) loop>>=
gamma_im1 = gamma_i;
gamma_i = delta/rho1;
#ifdef MINRES_DEBUG
INFO("gamma_im1=%.2E\n",gamma_im1);
INFO("gamma_i=%.2E\n",gamma_i);
#endif
@ \item $\sigma_{i+1} = \beta_{i+1} / \rho_1$
<<MINRES (Vorst) loop>>=
sigma_im1 = sigma_i;
sigma_i = beta/rho1;
#ifdef MINRES_DEBUG
INFO("sigma_im1=%.2E\n",sigma_im1);
INFO("sigma_i=%.2E\n",sigma_i);
#endif
@ \item $w_i = (\nu_i-\rho_3w_{i-2}-\rho_2w_{i-1})/\rho_1$
<<MINRES (Vorst) loop>>=
cublasScopy(handle, N, nu_i, 1, w_i, 1);
rho1 = 1.0/rho1;
cublasSscal(handle, N, &rho1, w_i, 1);
rho3 = -rho3*rho1;
cublasSaxpy(handle, N, &rho3, w_im2, 1, w_i, 1);
rho2 = -rho2*rho1;
cublasSaxpy(handle, N, &rho2, w_im1, 1, w_i, 1);
@ \item $x_i = x_{i-1} + \gamma_{i+1}\eta w_i$
<<MINRES (Vorst) loop>>=
rho1 = gamma_i*eta;
cublasSaxpy(handle, N, &rho1, w_i, 1, x, 1);
@ \item $\left\| r_i \right\|_2 = \left| \sigma_{i+1} \right|\left\| r_{i-1} \right\|_2 $
<<MINRES (Vorst) loop>>=
rnorm = rnorm*fabs(sigma_i);
rel_rnorm = rnorm/rnorm0;
if (VERBOSE)
  INFO("#%3d: r norm=%6.2E - rel. tol.=%6.2E\n",k,rnorm,rnorm/rnorm0);
if ( (rnorm<ATOL) || (rel_rnorm<=RTOL) )
  break;
@ \item $\eta = \sigma_{i+1}\eta$
<<MINRES (Vorst) loop>>=
eta = -sigma_i*eta;
#ifdef MINRES_DEBUG
INFO("eta=%.2E\n",eta);
#endif
@ \item
\begin{eqnarray}
  \label{eq:3}
  \nu_{i-1} &\leftarrow& \nu_i \nonumber\\
  \nu_{i} &\leftarrow& \nu_{i+1} \nonumber\\
  w_{i-2} &\leftarrow& w_{i-1} \nonumber\\
  w_{i-1} &\leftarrow& \nu_{i} \nonumber
\end{eqnarray}
<<MINRES (Vorst) loop>>=
cublasScopy(handle, N, nu_i  , 1, nu_im1, 1);
cublasScopy(handle, N, nu_ip1, 1, nu_i  , 1);
cublasScopy(handle, N, w_im1 , 1, w_im2 , 1);
cublasScopy(handle, N, w_i   , 1, w_im1 , 1);
if (beta==0) break;
@  \end{enumerate}
 \end{itemize}

 \subsubsection{Choi implementation}
 \label{sec:choi-implementation}

\index{iterativeSolvers!iterativeSolvers!minres\_choi}
<<MINRES (Choi)>>=
  void iterativeSolvers::minres_choi(float *x, BTBT* A, float *b, int max_it, float* x0)
{
  float a, alpha, beta, gamma, delta1, delta2,  phi, tau, c, s, epsilon;
  int k;
  <<MINRES (Choi) init>>
  tid.tic();
  for (k=0;k<max_it;k++) {
    //INFO("MINRES IT=%d - ",k);
    <<MINRES (Choi) loop>>
  }    
  tid.toc(&mean_time_per_iteration);
  mean_time_per_iteration /= max_it;
}
@ 
The MINRES algorithm is written:
\begin{itemize}
\item initialization:
  \begin{enumerate}
  \item $\beta_1=\left\| b \right\|_2$
<<MINRES (Choi) init>>=
cublasSnrm2(handle, N, b, 1, &beta);
@ \item $\nu_0 = 0$
<<MINRES (Choi) init>>=
HANDLE_ERROR( cudaMemset(nu_im1, 0, sizeof(float)*N ) );
@ \item $\nu_1 = b/\beta_1$
<<MINRES (Choi) init>>=
HANDLE_ERROR( cudaMemset(nu_i, 0, sizeof(float)*N ) );
a = 1/beta;
cublasSaxpy(handle, N, &a, b, 1, nu_i, 1);
@ \item $\phi=\tau=\beta_1$, $\delta^{(1)}_1=0$, $c_0=-1$, $s_0=0$
<<MINRES (Choi) init>>=
phi= tau = beta;
delta1 = 0;
c = -1;
s = 0;
@ \item $d_0=d_{-1}=0$
<<MINRES (Choi) init>>=
HANDLE_ERROR( cudaMemset(w_im1 , 0, sizeof(float)*N ) );
HANDLE_ERROR( cudaMemset(w_im2 , 0, sizeof(float)*N ) );
@ \end{enumerate}
\item loop:
  \begin{enumerate}
  \item Lanczos step
<<MINRES (Choi) loop>>=
lanczos_step(&alpha, &beta, nu_ip1, A, nu_i, nu_im1, 0);
@ \item $\delta^{(2)}_k=c_{k-1}\delta^{(1)}_k + s_{k-1}\alpha_k$
<<MINRES (Choi) loop>>=
delta2 = c*delta1 + s*alpha;
@ \item $\gamma^{(1)}_k=s_{k-1}\delta^{(1)}_k - c_{k-1}\alpha_k$
<<MINRES (Choi) loop>>=
gamma = s*delta1 - c*alpha;
@ \item $\epsilon_k = s_{k-1}\beta_{k+1}$
<<MINRES (Choi) loop>>=
epsilon = s*beta;
@ \item $\delta^{(1)}_k = -c_{k-1}\beta_{k+1}$
<<MINRES (Choi) loop>>=
delta1 = -c*beta;
@ \item sym--ortho
<<MINRES (Choi) loop>>=
a = gamma;
sym_ortho(&c, &s, &gamma, &a, &beta);
@ \item $\tau_k = c_k\phi_k$
<<MINRES (Choi) loop>>=
tau = c*phi;
@ \item $\phi_k = s_k\phi_{k-1}$
<<MINRES (Choi) loop>>=
phi = s*phi;
//INFO("r norm=%6.2E\n",phi);
@ \item $\left\| r \right\|_2 = \phi_k$
<<MINRES (Choi) loop>>=
rnorm = phi;
@ \item $d_k = (\nu_k-\delta^{(2)}_k d_{k-1}-\epsilon_k d_{k-2})/\gamma_k$
<<MINRES (Choi) loop>>=
if (gamma==0) break;
cublasScopy(handle, N, nu_i, 1, w_i, 1);
a = 1/gamma;
cublasSscal(handle, N, &a, w_i, 1);
a = -delta2/gamma;
cublasSaxpy(handle, N, &a, w_im1, 1, w_i, 1);
a = -epsilon/gamma;
cublasSaxpy(handle, N, &a, w_im2, 1, w_i, 1);
@ \item $x_k = x_{k-1} + \tau d_k$
<<MINRES (Choi) loop>>=
cublasSaxpy(handle, N, &tau, w_i, 1, x, 1);
@ \item 
\begin{eqnarray}
  \label{eq:1a}
  \nu_{i-1} &\leftarrow& \nu_i \nonumber\\
  \nu_{i} &\leftarrow& \nu_{i+1} \nonumber\\
  d_{i-2} &\leftarrow& d_{i-1} \nonumber\\
  d_{i-1} &\leftarrow& d_{i} \nonumber
\end{eqnarray}
<<MINRES (Choi) loop>>=
cublasScopy(handle, N, nu_i  , 1, nu_im1, 1);
cublasScopy(handle, N, nu_ip1, 1, nu_i  , 1);
cublasScopy(handle, N, w_im1 , 1, w_im2 , 1);
cublasScopy(handle, N, w_i   , 1, w_im1 , 1);
@  \end{enumerate}
 \end{itemize}
@ 
The sym--ortho function is:
\index{iterativeSolvers!iterativeSolvers!sym\_ortho}
<<sym-ortho algorithm>>=
void iterativeSolvers::sym_ortho(float *c,float *s,float *r,float *a,float *b) {
  float abs_a, abs_b, tau;
  abs_a = fabs(*a);
  abs_b = fabs(*b);
  if (*b==0) {
    *s = 0;
    *r = abs_a;
    *c = (*a==0) ? 1 : sign(*a);
  } else if (*a==0) {
    *c = 0;
    *s = sign(*b);
    *r = abs_b;
  } else if (abs_b>abs_a) {
    tau = *a/(*b);
    *s = sign(*b)/sqrtf(1+tau*tau);
    *c = *s*tau;
    *r = *b/(*s);
  } else if (abs_a>abs_b) {
    tau = *b/(*a);
    *c = sign(*a)/sqrtf(1+tau*tau);
    *s = *c*tau;
    *r = *a/(*c);
  }
}
@ 
The Lanzcos step is:
\index{iterativeSolvers!iterativeSolvers!lanczos\_step}
<<Lanczos step>>=
void iterativeSolvers::lanczos_step(float *alpha, float *beta, float *nu_kp1,
		  BTBT *A, float *nu_k, float *nu_km1, float sigma)
{
  float a;
  A->MVM(nu_kp1, nu_k);
  if (sigma!=0) {
    a = -sigma;
    cublasSaxpy(handle, N, &a, nu_k, 1, nu_kp1, 1);
  }
  cublasSdot(handle, N, nu_k, 1, nu_kp1, 1, alpha);
  a = -(*alpha);
  cublasSaxpy(handle, N, &a, nu_k, 1, nu_kp1, 1);
  a = -(*beta);
  cublasSaxpy(handle, N, &a, nu_km1, 1, nu_kp1, 1);
  cublasSnrm2(handle, N, nu_kp1, 1, beta);
  if (beta!=0) {
    a = 1/(*beta);
    cublasSscal(handle, N, &a, nu_kp1, 1);
  }
}

@
\section{Tests}
\label{sec:tests}

The test routine is:
<<iterativeSolvers.bin>>=

#include "cuda_profiler_api.h"

#ifndef __CEO_H__
#include "ceo.h"
#endif

   //#define N 10

   <<PA input>>

__global__ void fill(float *data, int n_data, float value)
{
  int k = blockIdx.x * blockDim.x + threadIdx.x;
  if (k<n_data)
    data[k] = value;
}

// Solving Ax=b
int main(int argc,char *argv[]) {
<<complete test>>
}
@
In the following a simple test is performed:
<<simple test I>>=
/*
  The test requires to replace the MVM with MVM (Test 1) 
  in the code chunk BTBT.cu in the file BTBT.nw
*/
float *d__x, *d__b;
HANDLE_ERROR( cudaMalloc((void**)&d__x, sizeof(float)*N ) );
HANDLE_ERROR( cudaMemset(d__x, 0, sizeof(float)*N ) );
HANDLE_ERROR( cudaMalloc((void**)&d__b, sizeof(float)*N ) );
float b[N] = {1,2,3,4,5,6,7,8,9,10};
HANDLE_ERROR( cudaMemcpy( d__b, b, N*sizeof(float), cudaMemcpyHostToDevice) );

BTBT A;
A.setup(N);

iterativeSolvers iSolve;
/* iSolve.cg_setup(N); */
/* iSolve.cg(d__x, &A, d__b, 5, d__x); */
iSolve.minres_setup(N);
iSolve.minres(d__x, &A, d__b, 5, d__x);
iSolve.cleanup();

float *x;
x = (float *)malloc(sizeof(float)*N);
HANDLE_ERROR( cudaMemcpy( x, d__x, N*sizeof(float), cudaMemcpyDeviceToHost) );

for (int k=0;k<N;k++)
  INFO("(%d) %5.1f\n",k,x[k]);

HANDLE_ERROR( cudaFree( d__x ) );
HANDLE_ERROR( cudaFree( d__b ) );
free(x);
@ 
<<simple test II>>=
/*
  The test requires to replace the MVM with MVM (Test 2) 
  in the code chunk BTBT.cu in the file BTBT.nw
*/
float *x;
x = (float *)malloc(sizeof(float)*N);
float *d__x, *d__b;
HANDLE_ERROR( cudaMalloc((void**)&d__x, sizeof(float)*N ) );
HANDLE_ERROR( cudaMalloc((void**)&d__b, sizeof(float)*N ) );

for (int k=0;k<N;k++) {
  x[k] = 1.0 + (float)rand();
  INFO("(%d) %.3E\n",k,x[k]);
 }
HANDLE_ERROR( cudaMemcpy( d__b, x, N*sizeof(float), cudaMemcpyHostToDevice ) );

BTBT A;
A.setup(N);
/* A.MVM(d__b,d__x); */
/* HANDLE_ERROR( cudaMemcpy( x, d__b, N*sizeof(float), cudaMemcpyDeviceToHost) ); */

/* for (int k=0;k<N;k++) */
/*   printf("(%d) %5.1f\n",k,x[k]); */

fill LLL (1+N/16),16 RRR (d__x, N, 0);

iterativeSolvers iSolve;
/* iSolve.cg_setup(N); */
/* iSolve.cg(d__x, &A, d__b, 5, d__x); */
iSolve.minres_setup(N);
iSolve.minres(d__x, &A, d__b, 5, d__x);
iSolve.cleanup();

HANDLE_ERROR( cudaMemcpy( x, d__x, N*sizeof(float), cudaMemcpyDeviceToHost) );

for (int k=0;k<N;k++)
  INFO("(%d) %5.1f\n",k,x[k]);

HANDLE_ERROR( cudaFree( d__x ) );
HANDLE_ERROR( cudaFree( d__b ) );
free(x);
@ 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% TEST ZONE
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
This is the comprehensive suite of tests.
There are 3 tests available: [[[main test]]], [[convergence test]] and [[statistics test]].
$r_0$ is set to 15cm per default but can be override to any other value after the variable declaration.
<<complete test>>=
<<declaration>>
  //  r0 = d;
<<initialization>>
<<hot convergence test>>
<<cleanup and free memory>>
@ 
In the following, all the variables are declared and set to their default or user given values.
<<declaration>>=
int N = _N_LENSLET_*2, NP, PS_E_N_PX, k, osf;
float *d__x, *d__phase_est, *phase_screen_est;
float D, d, delta, delta_e,  r0=0.15;
char *d__mask;

if (argc != 3) {
  INFO("Usage: %s D CG or MINRES\n",argv[0]);
  return 1;
 }
char solver[10];
sprintf(solver,"%s",argv[2]);
printf("\n___ SOLVER: %s ___\n",solver);

D = atof(argv[1]);
d = D/N_SIDE_LENSLET;
delta = d/_N_PX_PUPIL_;
osf = 1;
delta_e = d/osf;

NP = osf*N_SIDE_LENSLET+1;
PS_E_N_PX = NP;
PS_E_N_PX *= PS_E_N_PX;

printf("\nd    =%.4f\n",d);
printf("\ndelta=%.4f\n",delta);
printf("\ndelta_e=%.4f\n",delta_e);

source src, gs;
atmosphere atm;
mask pupil_mask;
imaging lenslet_array;
centroiding cog;
aaStats aa;
BTBT aaCov;
paStats pa;
BTBT paCov;
iterativeSolvers iSolve;
stats S;
S.setup();
stopwatch tid;
@ 
Then all the components are initialized and the dynamic arrays are allocated.
<<initialization>>=
src.setup(ARCSEC(0) , 0, INFINITY, PS_E_N_PX);
gs.setup(ARCSEC(0) , 0, 90e3, PS_E_N_PX);        

// Atmosphere
// Single layer turbulence profile
float altitude[] = {0},
  xi0[] = {1},
  wind_speed[] = {10},
  wind_direction[] = {0};
//atm.setup(r0,30,altitude,xi0,wind_speed,wind_direction);
// GMT 7 layers turbulence profile
atm.gmt_setup(r0,30);
//atm.reset();

pupil_mask.setup( (N_SIDE_LENSLET+1)*(N_SIDE_LENSLET+1) );

// SH WFS
lenslet_array.setup(_N_PX_PUPIL_,N_SIDE_LENSLET,2,1,1);

// Centroid
cog.setup();

// covariances
aa.setup(N_SIDE_LENSLET,&atm,d, &gs, 1);
aa.toFile("aaCovariance.bin");
printf("\n AA variance [rd^2]: %.3E\n",aa.variance());
aaCov.setup(2,2,N_SIDE_LENSLET,aa.d__cov);

pa.setup(NP,N_SIDE_LENSLET,osf,&atm,d,&src,1,&gs,1);
pa.toFile("paCovariance.bin");
paCov.setup(1,2,NP,N_SIDE_LENSLET,pa.d__cov);

HANDLE_ERROR( cudaMalloc((void**)&d__mask, sizeof(char)*_N_LENSLET_ ) );
dim3 blockDim_vl(16,16);
dim3 gridDim_vl(N_SIDE_LENSLET/16+1,N_SIDE_LENSLET/16+1);
valid_lenslet LLL gridDim_vl,blockDim_vl RRR (d__mask,N_SIDE_LENSLET*_N_PX_PUPIL_,0.5);

/* char *L_mask, *A_mask; */
/* HANDLE_ERROR( cudaMalloc((void**)&L_mask, sizeof(char)*_N_LENSLET_ ) ); */
/* HANDLE_ERROR( cudaMemset(L_mask, 0, sizeof(char)*_N_LENSLET_  ) ); */
/* HANDLE_ERROR( cudaMalloc((void**)&A_mask, sizeof(char)*PS_E_N_PX ) ); */
/* HANDLE_ERROR( cudaMemset(A_mask, 0, sizeof(char)*PS_E_N_PX ) ); */

/* fried_geometry LLL gridDim_vl,blockDim_vl RRR (L_mask, A_mask, N_SIDE_LENSLET, 16, 0.5); */

/* dev2file("L_mask.bin",L_mask,_N_LENSLET_); */
/* dev2file("A_mask.bin",A_mask,PS_E_N_PX); */

/* HANDLE_ERROR( cudaFree( L_mask) ); */
/* HANDLE_ERROR( cudaFree( A_mask) ); */

char MASK_SET;
MASK_SET = fried_geometry_setup(cog.lenslet_mask, pupil_mask.m, N_SIDE_LENSLET, 16, 0.5);
//printf("MASK_SET = %d\n",MASK_SET);
pupil_mask.set_filter();
src.wavefront.M = &pupil_mask;
src.wavefront.masked();
char A_mask_filename[32];
sprintf(A_mask_filename,"A_mask_%03d.bin",N_SIDE_LENSLET);
dev2file(A_mask_filename,pupil_mask.m,PS_E_N_PX);

HANDLE_ERROR( cudaMalloc( (void**)&d__phase_est           , sizeof(float)*PS_E_N_PX ) );

float *d__phase_screen_low_res, *d__phase_screen_est, *phase_screen_low_res;
d__phase_screen_low_res = 0;
phase_screen_low_res    = 0;
phase_screen_est        = 0;
@ 
\subsection{Main test}
\label{sec:main-test}

The [[main test]] is simply a wavefront reconstruction routine.
<<main test>>=
atm.get_phase_screen(&src,delta_e,NP,delta_e,NP,0);
src.phase2file("phaseScreenLowRes.bin");

<<wavefront sensing>>

// Iterative solver
  //iSolve.cg_setup(N);
iSolve.minres_setup(N);

//aaCov.mask = d__mask;
dev2file("mask.bin",d__mask,_N_LENSLET_);

char filename[100];
/* for (k=1;k<6;k++) { */
k = 100;

sprintf(filename,"MINRES_phaseEst_%d.bin",k);
//sprintf(filename,"CG_phaseEst_%d.bin",k);
HANDLE_ERROR( cudaMalloc((void**)&d__x, sizeof(float)*N ) );
HANDLE_ERROR( cudaMemset(d__x, 0, sizeof(float)*N ) );

tid.tic();

//cudaProfilerStart();
iSolve.minres_vorst(d__x, &aaCov, cog.d__c, d__x);
//iSolve.cg(d__x, &aaCov, cog.d__c, k, d__x);
//cudaProfilerStop();

<<MVM>>

tid.toc("WAVEFRONT ESTIMATION");

<<MVM (wrap.)>>

  //printf("\nSolver residue norm           : %.2E\n",iSolve.rnorm);
  //printf("\nSolver mean time per iteration: %.2E\n",iSolve.mean_time_per_iteration);

/* } */
@ 
\subsection{Convergence test}
\label{sec:convergence-test}

The [[convergence test]] reconstructs the wavefront with both iterative solver MINRES and CG at each iteration step.
<<convergence test>>=
atm.get_phase_screen(&src,delta_e,NP,delta_e,NP,0);
char filename[100];
sprintf(filename,"CVGCE_phaseScreenLowRes_%03d.bin",N_SIDE_LENSLET);
src.phase2file(filename);

<<wavefront sensing>>

if (cog.MASK_SET) aaCov.mask = cog.lenslet_mask;

sprintf(filename,"bins/CVGCE_%s_phaseEst_200_%03d.bin",solver,N_SIDE_LENSLET);
FILE *fid;
fid = fopen(filename,"wb");
int n_page = 1, n_data = PS_E_N_PX*200;
fwrite(&n_page,sizeof(int),1,fid);
fwrite(&n_data,sizeof(int),1,fid);
phase_screen_est     = (float*)malloc(sizeof(float)*PS_E_N_PX);
HANDLE_ERROR( cudaMalloc((void**)&d__x, sizeof(float)*N ) );

// CG
if (strcmp(solver,"CG")==0) {
  iSolve.pcg_setup(N);
  for (k=1;k<=200;k++) {

    printf("\n______ ITERATION #: %03d ______\n",k);

    HANDLE_ERROR( cudaMemset(d__x, 0, sizeof(float)*N ) );

    tid.tic();

    iSolve.pcg(d__x, &aaCov, cog.d__c, k, d__x, 1./aa.variance());
    
    <<MVM>>

    tid.toc("WAVEFRONT ESTIMATION");

    HANDLE_ERROR( cudaMemcpy( phase_screen_est, d__phase_est,
			      sizeof(float)*PS_E_N_PX,
			      cudaMemcpyDeviceToHost ) );
    fwrite(phase_screen_est,sizeof(float),PS_E_N_PX,fid);

    printf("\nSolver residue norm           : %.2E\n",iSolve.rnorm);
    printf("\nSolver mean time per iteration: %.2E\n",iSolve.mean_time_per_iteration);
    printf("\n-------------------------------\n");

  }
	 }
// MINRES
if (strcmp(solver,"MINRES")==0) {
  iSolve.minres_setup(N);
  iSolve.RTOL = 1e-6;
  for (k=1;k<=200;k++) {

    printf("\n______ ITERATION #: %03d ______\n",k);

    HANDLE_ERROR( cudaMemset(d__x, 0, sizeof(float)*N ) );

    tid.tic();

    iSolve.minres_vorst(d__x, &aaCov, cog.d__c, k, d__x);

    <<MVM>>

    tid.toc("WAVEFRONT ESTIMATION");

    HANDLE_ERROR( cudaMemcpy( phase_screen_est, d__phase_est,
			      sizeof(float)*PS_E_N_PX,
			      cudaMemcpyDeviceToHost ) );
    fwrite(phase_screen_est,sizeof(float),PS_E_N_PX,fid);

    printf("\nSolver residue norm           : %.2E\n",iSolve.rnorm);
    printf("\nSolver mean time per iteration: %.2E\n",iSolve.mean_time_per_iteration);
    printf("\n-------------------------------\n");

  }
	 }

fclose(fid);
@ 
\subsection{Hot convergence test}
\label{sec:hot-convergence-test}


The [[hot convergence test]] reconstructs the wavefront with MINRES re-using the previous estimate as starting point for the next one.
<<hot convergence test>>=

LMMSE lmmse;
lmmse.setup(&atm,&gs,1,&src,1,d,N_SIDE_LENSLET,&pupil_mask,"MINRES");

int n_step = 200;

HANDLE_ERROR( cudaMalloc( (void**)&d__phase_screen_low_res, sizeof(float)*PS_E_N_PX*n_step ) );
float *d__gs_phase_screen_low_res;
HANDLE_ERROR( cudaMalloc( (void**)&d__gs_phase_screen_low_res, sizeof(float)*PS_E_N_PX*n_step ) );
HANDLE_ERROR( cudaMalloc( (void**)&d__phase_screen_est,     sizeof(float)*PS_E_N_PX*n_step ) );

//if (cog.MASK_SET) aaCov.mask = cog.lenslet_mask;

HANDLE_ERROR( cudaMalloc((void**)&d__x, sizeof(float)*N ) );
HANDLE_ERROR( cudaMemset(d__x, 0, sizeof(float)*N ) );
float time_step,tau=0, tau0=0, delta_tau;
time_step = 2;

//iSolve.minres_setup(N);
lmmse.iSolve.VERBOSE = 0;
lmmse.iSolve.RTOL = 5E-2;

// First wavefront estimate
atm.get_phase_screen_gradient(&cog,N_SIDE_LENSLET,d,&gs,tau);
tid.tic();
//iSolve.minres_vorst(d__x, &aaCov, cog.d__c, d__x);
lmmse.estimation(&cog);
tid.toc(&tau0,"WAVEFRONT ESTIMATION");
tau = tau0;

//lmmse.iSolve.N_ITERATION = 5;
//lmmse.iSolve.ATOL = lmmse.iSolve.rnorm;

// MINRES
for (k=1;k<=n_step;k++) {

  printf("\n______ ITERATION #: %03d ______\n",k);
    
  tau +=  time_step;
  atm.get_phase_screen_gradient(&cog,N_SIDE_LENSLET,d,&gs,1E-3*tau);

  //  lmmse.d__phase_est_c = d__phase_screen_est + (k-1)*PS_E_N_PX;
  lmmse.set_phase_est_ptr( d__phase_screen_est + (k-1)*PS_E_N_PX );

  tid.tic();

  /* iSolve.minres_vorst(d__x, &aaCov, cog.d__c, d__x); */

  /* paCov.MVM(d__phase_screen_est + (k-1)*PS_E_N_PX, d__x); */
  lmmse.estimation(&cog);

  tid.toc(&delta_tau, "WAVEFRONT ESTIMATION");

  //tau += delta_tau + 0*time_step;

  src.wavefront.phase = d__phase_screen_low_res + (k-1)*PS_E_N_PX;
  atm.get_phase_screen(&src,delta_e,NP,delta_e,NP,1E-3*tau);
  gs.wavefront.phase = d__gs_phase_screen_low_res + (k-1)*PS_E_N_PX;
  atm.get_phase_screen(&gs,delta_e,NP,delta_e,NP,1E-3*tau);
  /* HANDLE_ERROR( cudaMemcpy( d__phase_screen_low_res + (k-1)*PS_E_N_PX, src.wavefront.phase, */
  /* 			    sizeof(float)*PS_E_N_PX, */
  /* 			    cudaMemcpyDeviceToDevice) ); */

  //printf("\nSolver residue norm           : %.2E\n",iSolve.rnorm);
  //printf("\nSolver mean time per iteration: %.2E\n",iSolve.mean_time_per_iteration);
  printf("\n-------------------------------\n");

  /* if (k==1) */
  /*   iSolve.ATOL = iSolve.rnorm; */
  //if (k>1)
  //iSolve.N_ITERATION = 10;

 }

char filename[100];

sprintf(filename,"CVGCE_phaseScreenLowRes_%03d.bin",N_SIDE_LENSLET);
dev2file(filename,d__phase_screen_low_res,PS_E_N_PX*n_step);
sprintf(filename,"CVGCE_GS_phaseScreenLowRes_%03d.bin",N_SIDE_LENSLET);
dev2file(filename,d__gs_phase_screen_low_res,PS_E_N_PX*n_step);

sprintf(filename,"CVGCE_%s_phaseEst_%03d_%03d.bin",solver,n_step,N_SIDE_LENSLET);
dev2file(filename,d__phase_screen_est,PS_E_N_PX*n_step);

lmmse.cleanup();
HANDLE_ERROR( cudaFree( d__gs_phase_screen_low_res ) );

@
\subsection{Statistics test}
\label{sec:statistics-test}

The [[statistics test]] reconstructs the wavefront n times, each time a new statistically independent phase screen is generated.
<<statistics test>>=
float *d__phase_screen_low_res;
HANDLE_ERROR( cudaMalloc( (void**)&d__phase_screen_low_res, sizeof(float)*PS_E_N_PX ) );

float *phase_screen_low_res;
phase_screen_low_res = (float*)malloc(sizeof(float)*PS_E_N_PX);
HANDLE_ERROR( cudaMemcpy( phase_screen_low_res, d__phase_screen_low_res,
			  sizeof(float)*PS_E_N_PX,
			  cudaMemcpyDeviceToHost ) );

<<linear solver (prep.)>>
aaCov.mask = d__mask;
<<<<MVM (prep.)>>

FILE *fid0, *fid;
char filename[100];
sprintf(filename,"STATS_phaseScreenLowRes_%03d.bin",N_SIDE_LENSLET);
fid0 = fopen(filename,"wb");

int nSample = 200, nIt = 200;
sprintf(filename,"STATS_%s_phaseEst_%03d_%03d.bin",solver,nSample,N_SIDE_LENSLET);
fid = fopen(filename,"wb");

phase_screen_est     = (float*)malloc(sizeof(float)*PS_E_N_PX);
HANDLE_ERROR( cudaMalloc( (void**)&d__phase_est           , sizeof(float)*PS_E_N_PX ) );
// CG
if (strcmp(solver,"CG")==0) {
  iSolve.cg_setup(N);
  for (k=1;k<=nSample;k++) {

    printf("\n______ ITERATION #: %03d ______\n",k);

    <<atmosphere propagation and wavefront sensing>>

    HANDLE_ERROR( cudaMemset(d__x, 0, sizeof(float)*N ) );

    tid.tic();

    iSolve.cg(d__x, &aaCov, cog.d__c, nIt, d__x);
    
    <<MVM>>

    tid.toc("WAVEFRONT ESTIMATION");

    HANDLE_ERROR( cudaMemcpy( phase_screen_est, d__phase_est,
    			      sizeof(float)*PS_E_N_PX,
    			      cudaMemcpyDeviceToHost ) );
    fwrite(phase_screen_est,sizeof(float),PS_E_N_PX,fid);

    printf("\nSolver residue norm           : %.2E\n",iSolve.rnorm);
    printf("\nSolver mean time per iteration: %.2E\n",iSolve.mean_time_per_iteration);
    printf("\n-------------------------------\n");

    atm.reset();
  }
	 }
// MINRES
if (strcmp(solver,"MINRES")==0) {
  iSolve.minres_setup(N);
  for (k=1;k<=nSample;k++) {

    printf("\n______ ITERATION #: %03d ______\n",k);

    <<atmosphere propagation and wavefront sensing>>

    HANDLE_ERROR( cudaMemset(d__x, 0, sizeof(float)*N ) );

    tid.tic();

    iSolve.minres_vorst(d__x, &aaCov, cog.d__c, nIt, d__x);

    <<MVM>>

    tid.toc("WAVEFRONT ESTIMATION");

    HANDLE_ERROR( cudaMemcpy( phase_screen_est, d__phase_est,
			      sizeof(float)*PS_E_N_PX,
			      cudaMemcpyDeviceToHost ) );
    HANDLE_ERROR( cudaThreadSynchronize() );
    fwrite(phase_screen_est,sizeof(float),PS_E_N_PX,fid);
    printf("\n Data saved to file!\n");

    printf("\nSolver residue norm           : %.2E\n",iSolve.rnorm);
    printf("\nSolver mean time per iteration: %.2E\n",iSolve.mean_time_per_iteration);
    printf("\n-------------------------------\n");

    atm.reset();
  }
	 }

fclose(fid0);
fclose(fid);
@ 
This is the place where we terminate the test elegantly.
<<cleanup and free memory>>=
atm.cleanup();
lenslet_array.cleanup();
cog.cleanup();
aa.cleanup();
pa.cleanup();
aaCov.cleanup();
paCov.cleanup();
iSolve.cleanup();
S.cleanup();
HANDLE_ERROR( cudaFree( d__x ) );
HANDLE_ERROR( cudaFree( d__phase_est ) );
if (d__phase_screen_low_res) HANDLE_ERROR( cudaFree( d__phase_screen_low_res ) );
if (d__phase_screen_est)     HANDLE_ERROR( cudaFree( d__phase_screen_est ) );
HANDLE_ERROR( cudaFree( d__mask ) );
if (phase_screen_low_res) free(phase_screen_low_res);
if (phase_screen_est) free(phase_screen_est);
@ 
<<PA input>>=
__global__ void set_pa_input(float *pa_c, float *aa_c, int *idx, int N) 
{
  int i, j, k;
  i = blockIdx.x * blockDim.x + threadIdx.x;
  j = blockIdx.y * blockDim.y + threadIdx.y;
  if ( (i<N) && (j<N) ) {
    k = i*N + j;
    pa_c[idx[k]] = aa_c[k];
  }
}
@ 
This is a chunk of code for the [[statistics]] test routine:
<<atmosphere propagation and wavefront sensing>>=

atm.get_phase_screen(d__phase_screen_low_res,delta_e,NP,delta_e,NP,d__src,0);

HANDLE_ERROR( cudaMemcpy( phase_screen_low_res, d__phase_screen_low_res,
			  sizeof(float)*PS_E_N_PX,
			  cudaMemcpyDeviceToHost ) );
fwrite(phase_screen_low_res,sizeof(float),PS_E_N_PX,fid0);

<<wavefront sensing (geometric and masked)>>
@ 
The wavefront sensing  model is using either a Fourier optics or geometric optics model.
<<wavefront sensing>>=
<<wavefront sensing (geometric)>>
<<save centroids to file>>
<<wavefront sensing (masked)>>=
<<wavefront sensing (geometric and masked)>>
<<save centroids to file>>
@ 
<<wavefront sensing (Fourier)>>=
lenslet_array.propagate(d__src);
cxy0 = (_N_PX_PUPIL_ - 1)/2.0;
cog.get_data(lenslet_array.d__frame, cxy0, cxy0, slopes2Angle);
@ 
<<wavefront sensing (geometric)>>=
atm.get_phase_screen_gradient(&cog,N_SIDE_LENSLET,d,&src,1,0);
<<wavefront sensing (geometric and masked)>>=
atm.get_phase_screen_gradient(&cog,N_SIDE_LENSLET,d__mask,d,&src,1,0);
@ 
<<save centroids to file>>=
dev2file("centroids.bin",cog.d__c,N);
@ 
The index of the WFS slopes in the augmented input vector of the MVM operation are computed next:
<<linear solver (prep.)>>=
idx = (int *)malloc(sizeof(int)*_N_LENSLET_);
k = -1;
//printf("\n k   idx\n");
for (i=1;i<2*N_SIDE_LENSLET;i+=2) {
  for (j=1;j<2*N_SIDE_LENSLET;j+=2) {
    idx[++k] = i*(2*N_SIDE_LENSLET + 1) + j;
    //printf("(%2d) %2d\n",k,idx[k]);
  }
 }
HANDLE_ERROR( cudaMalloc( (void**)&d__idx, sizeof(int)*_N_LENSLET_ ) );
HANDLE_ERROR( cudaMemcpy( d__idx, idx,
			  sizeof(int)*_N_LENSLET_,
			  cudaMemcpyHostToDevice ) );

HANDLE_ERROR( cudaMalloc((void**)&d__x, sizeof(float)*N ) );
x = (float*)malloc(sizeof(float)*N);
@ 
CG iterative solver call:
<<linear solver (conjugate gradient)>>=
iSolve.cg(d__x, &aaCov, d__b, 5, d__x);
@ 
MINRES iterative solver call:
<<linear solver (MINRES)>>=
iSolve.minres(d__x, &aaCov, d__b, 5, d__x);
@ 
Allocation of the input vector for the MVM:
<<MVM (prep.)>>=
HANDLE_ERROR( cudaMalloc((void**)&d__ce, sizeof(float)*PS_E_N_PX*2 ) );
HANDLE_ERROR( cudaMemset(d__ce, 0, sizeof(float)*PS_E_N_PX*2 ) );
dim3 blockDim(16,16);
dim3 gridDim(N_SIDE_LENSLET/16+1,N_SIDE_LENSLET/16+1);
@ 
Writing the input of the iterative solver into the output of the MVM:
<<MVM>>=
paCov.MVM(d__phase_est,d__x);
@ 
Saving the MVM wavefront estimate to a file:
<<MVM (wrap.)>>=
dev2file(filename,d__phase_est,PS_E_N_PX);
@