% -*- mode: Noweb; noweb-code-mode: c-mode -*-
@
\index{iterativeSolvers}
\section{The files}
\subsection{Header}
<<iterativeSolvers.h>>=
#ifndef __ITERATIVESOLVERS_H__
#define __ITERATIVESOLVERS_H__
#ifndef __UTILITIES_H__
#include "utilities.h"
#endif
#ifndef __BTBT_H__
#include "BTBT.h"
#endif
//#define MINRES_DEBUG
struct iterativeSolvers {
<<parameters>>
void cg_setup(int n_vector);
void pcg_setup(int n_vector);
void minres_setup(int n_vector);
void cleanup(void);
void cg(float *x, BTBT* A, float *b, int max_it, float* x0);
void pcg(float *x, BTBT* A, float *b, int max_it, float* x0, float ip);
void sym_ortho(float *c,float *s,float *r,
float *a,float *b);
void lanczos_step(float *alpha, float *beta, float *nu_kp1,
BTBT *A, float *nu_k, float *nu_km1, float sigma);
void minres_vorst(float *x, BTBT* A, float *b, float* x0);
void minres_vorst(float *x, BTBT* A, float *b, int max_it, float* x0);
void minres_vorst(float *x, BTBT* A, float *b, float rtol, float* x0);
void pminres_vorst(float *x, BTBT* A, float *b, int max_it, float* x0, float ip);
void minres_vorst(float *x, float *res, BTBT* A, float *b, int max_it, float* x0);
void minres_choi(float *x, BTBT* A, float *b, int max_it, float* x0);
// void info(void);
};
#endif // __ITERATIVESOLVERS_H__
@
\subsection{Source}
<<iterativeSolvers.cu>>=
#include "iterativeSolvers.h"
<<CG setup>>
<<PCG setup>>
<<MINRES setup>>
<<cleanup>>
<<conjugate gradient>>
<<preconditioned conjugate gradient>>
<<sym-ortho algorithm>>
<<Lanczos step>>
<<MINRES (Vorst) I>>
<<MINRES (Vorst) II>>
<<MINRES (Vorst) III>>
<<MINRES (Vorst) with residue>>
<<MINRES (Choi)>>
@
\section{Parameters}
\index{iterativeSolvers!iterativeSolvers}
The structure parameters are
<<parameters>>=
float *d__vectors,
*q, *x, *r, *p, *z,
*nu_i, *nu_im1, *nu_ip1,
*w_i, *w_im1, *w_im2;
float rnorm, rel_rnorm, mean_time_per_iteration, RTOL, ATOL;
int N, N_ITERATION, cvgce_iteration;
stopwatch tid;
cublasHandle_t handle;
cublasStatus_t status;
char VERBOSE;
@
\section{Functions}
\subsection{Setup \& Cleanup}
The iterativeSolvers structure gather several routines implementing iterative methods to solve the linear system: $Ax=b$.
The setup method is allocated the required memory depending of the iterative method type:
\begin{itemize}
\item
\index{iterativeSolvers!iterativeSolvers!cg\_setup}
<<CG setup>>=
void iterativeSolvers::cg_setup(int n_vector)
{
INFO("@(CEO)>iterativeSolvers: CONJUGATE GRADIENT\n");
N = n_vector;
HANDLE_ERROR( cudaMalloc((void**)&d__vectors, sizeof(float)*N*4 ) );
q = d__vectors;
x = d__vectors + N;
r = d__vectors + 2*N;
p = d__vectors + 3*N;
cublasCreate(&handle);
}
<<PCG setup>>=
void iterativeSolvers::pcg_setup(int n_vector)
{
INFO("@(CEO)>iterativeSolvers: CONJUGATE GRADIENT\n");
N = n_vector;
HANDLE_ERROR( cudaMalloc((void**)&d__vectors, sizeof(float)*N*5 ) );
q = d__vectors;
x = d__vectors + N;
r = d__vectors + 2*N;
p = d__vectors + 3*N;
z = d__vectors + 4*N;
cublasCreate(&handle);
}
@ \item
\index{iterativeSolvers!iterativeSolvers!minres\_setup}
<<MINRES setup>>=
void iterativeSolvers::minres_setup(int n_vector)
{
INFO("@(CEO)>iterativeSolvers: MINRES\n");
N = n_vector;
HANDLE_ERROR( cudaMalloc((void**)&d__vectors, sizeof(float)*N*6 ) );
nu_i = d__vectors;
nu_im1 = d__vectors + N;
nu_ip1 = d__vectors + 2*N;
w_i = d__vectors + 3*N;
w_im1 = d__vectors + 4*N;
w_im2 = d__vectors + 5*N;
cublasCreate(&handle);
VERBOSE = 0;
ATOL = 0;
RTOL = 5E-2;
N_ITERATION = 100;
}
@ \end{itemize}
Memory is freed with
\index{iterativeSolvers!iterativeSolvers!cleanup}
<<cleanup>>=
void iterativeSolvers::cleanup(void)
{
INFO("@(CEO)>iterativeSolvers: freeing memory!\n");
HANDLE_ERROR( cudaFree( d__vectors ) );
cublasDestroy(handle);
}
@
\subsection{Conjugate gradient}
\label{sec:conjugate-gradient}
The next routine implements the conjugate gradient iterative method:
\index{iterativeSolvers!iterativeSolvers!cg}
<<conjugate gradient>>=
void iterativeSolvers::cg(float *x, BTBT* A, float *b, int max_it, float* x0)
{
float alpha, beta, gamma;
int k;
<<conjugate gradient init>>
tid.tic();
for (k=0;k<max_it;k++) {
//printf("CG IT=%d - ",k);
<<conjugate gradient loop part 1>>
if (k<(max_it-1)) {
<<conjugate gradient loop part 2>>
}
}
tid.toc(&mean_time_per_iteration);
mean_time_per_iteration /= max_it;
}
@
The conjugate gradient algorithm is written:
\begin{itemize}
\item initialization:
\begin{enumerate}
\item $r=Ax_0$
<<conjugate gradient init>>=
A->MVM(r , x0);
@ \item $r_0 = b - r_0$
<<conjugate gradient init>>=
beta = -1;
cublasSscal(handle, N, &beta, r, 1);
alpha = 1;
cublasSaxpy(handle, N, &alpha, b, 1, r, 1);
@ \item $p_0=r_0$
<<conjugate gradient init>>=
cublasScopy(handle, N, r, 1, p, 1);
@ \end{enumerate}
\item loop:
\begin{enumerate}
\item $q=Ap_k$
<<conjugate gradient loop part 1>>=
A->MVM(q , p);
@ \item $\gamma=r_k^Tr_k$
<<conjugate gradient loop part 1>>=
cublasSdot(handle, N, r, 1, r, 1, &gamma);
//printf("r norm=%6.2E\n",sqrt(gamma));
@ \item $\left\| r \right\|_2=\sqrt\gamma$
<<conjugate gradient loop part 1>>=
rnorm = sqrt(gamma);
if (gamma==0) break;
@ \item $\alpha_k = {\displaystyle \gamma\over p_K^Tq}$
<<conjugate gradient loop part 1>>=
cublasSdot(handle, N, p, 1, q, 1, &alpha);
alpha = gamma/alpha;
@ \item $x_{k+1} = x_k + \alpha_kp_k$
<<conjugate gradient loop part 1>>=
cublasSaxpy(handle, N, &alpha, p, 1, x, 1);
@ \item $r_{k+1} = r_k - \alpha_kq$
<<conjugate gradient loop part 2>>=
alpha = -alpha;
cublasSaxpy(handle, N, &alpha, q, 1, r, 1);
@ \item $\beta_k = {\displaystyle r_{k+1}^Tr_{k+1} \over \gamma} $
<<conjugate gradient loop part 2>>=
cublasSdot(handle, N, r, 1, r, 1, &beta);
beta = beta/gamma;
@ \item $p_{k+1} = r_{k+1} + \beta_k p_k$
<<conjugate gradient loop part 2>>=
cublasSscal(handle, N, &beta, p, 1);
alpha = 1;
cublasSaxpy(handle, N, &alpha, r, 1, p, 1);
@ \end{enumerate}
\end{itemize}
\subsection{Preconditioned conjugate gradient}
\label{sec:precond-conjugate-gradient}
The next routine implements the conjugate gradient iterative method with Jacobi preconditioner.
The Jacobi preconditioner $M$ is given by
\begin{equation}
\label{eq:1}
M = \operatorname{diag}\left( A \right) = \sigma_A^2 I,
\end{equation}
and its inverse is simply
\begin{equation}
\label{eq:2}
M^{-1} = \sigma_A^{-2} I = [[ip]] I.
\end{equation}
\index{iterativeSolvers!iterativeSolvers!pcg}
<<preconditioned conjugate gradient>>=
void iterativeSolvers::pcg(float *x, BTBT* A, float *b,
int max_it, float* x0, float ip)
{
float alpha, beta, gamma;
int k;
<<preconditioned conjugate gradient init>>
tid.tic();
for (k=0;k<max_it;k++) {
//printf("CG IT=%d - ",k);
<<preconditioned conjugate gradient loop part 1>>
if (k<(max_it-1)) {
<<preconditioned conjugate gradient loop part 2>>
}
}
tid.toc(&mean_time_per_iteration);
mean_time_per_iteration /= max_it;
}
@
The preconditioned conjugate gradient algorithm is written:
\begin{itemize}
\item initialization:
\begin{enumerate}
\item $r=Ax_0$
<<preconditioned conjugate gradient init>>=
A->MVM(r , x0);
@ \item $r_0 = b - r_0$
<<preconditioned conjugate gradient init>>=
beta = -1;
cublasSscal(handle, N, &beta, r, 1);
alpha = 1;
cublasSaxpy(handle, N, &alpha, b, 1, r, 1);
@ \item $z_0=M^{-1}r_0$
<<preconditioned conjugate gradient init>>=
cublasScopy(handle, N, r, 1, z, 1);
cublasSscal(handle, N, &ip, z, 1);
@ \item $p_0=z_0$
<<preconditioned conjugate gradient init>>=
cublasScopy(handle, N, z, 1, p, 1);
@ \end{enumerate}
\item loop:
\begin{enumerate}
\item $q=Ap_k$
<<preconditioned conjugate gradient loop part 1>>=
A->MVM(q , p);
@ \item $\gamma=r_k^Tz_k$
<<preconditioned conjugate gradient loop part 1>>=
cublasSdot(handle, N, r, 1, z, 1, &gamma);
//printf("r norm=%6.2E\n",sqrt(gamma));
@ \item $\left\| r \right\|_2=\sqrt\gamma$
<<preconditioned conjugate gradient loop part 1>>=
rnorm = sqrt(gamma);
if (gamma==0) break;
@ \item $\alpha_k = {\displaystyle \gamma\over p_K^Tq}$
<<preconditioned conjugate gradient loop part 1>>=
cublasSdot(handle, N, p, 1, q, 1, &alpha);
alpha = gamma/alpha;
@ \item $x_{k+1} = x_k + \alpha_kp_k$
<<preconditioned conjugate gradient loop part 1>>=
cublasSaxpy(handle, N, &alpha, p, 1, x, 1);
@ \item $r_{k+1} = r_k - \alpha_kq$
<<preconditioned conjugate gradient loop part 2>>=
alpha = -alpha;
cublasSaxpy(handle, N, &alpha, q, 1, r, 1);
@ \item $z_{k+1} = M^{-1}r_{k+1}$
<<preconditioned conjugate gradient loop part 2>>=
cublasScopy(handle, N, r, 1, z, 1);
cublasSscal(handle, N, &ip, z, 1);
@ \item $\beta_k = {\displaystyle r_{k+1}^Tz_{k+1} \over \gamma} $
<<preconditioned conjugate gradient loop part 2>>=
cublasSdot(handle, N, r, 1, z, 1, &beta);
beta = beta/gamma;
@ \item $p_{k+1} = z_{k+1} + \beta_k p_k$
<<preconditioned conjugate gradient loop part 2>>=
cublasSscal(handle, N, &beta, p, 1);
alpha = 1;
cublasSaxpy(handle, N, &alpha, z, 1, p, 1);
@ \end{enumerate}
\end{itemize}
\subsection{MINRES}
\label{sec:minres}
\subsubsection{Vorst implementation}
\label{sec:vorst-implementation}
The MINRES iterative solver~\cite{PS75,Vorst03,Choi06} method is implemented in the next routine:
\index{iterativeSolvers!iterativeSolvers!minres\_vorst}
<<MINRES (Vorst) I>>=
void iterativeSolvers::minres_vorst(float *x, BTBT* A, float *b, float* x0)
{
float alpha, beta, gamma_i, gamma_im1, sigma_i, sigma_im1,
eta, delta, rho1, rho2, rho3, rnorm0;
int k;
<<MINRES (Vorst) init>>
// tid.tic();
for (k=0;k<N_ITERATION;k++) {
//printf("MINRES IT=%d - ",k);
<<MINRES (Vorst) loop>>
}
// tid.toc(&mean_time_per_iteration);
// mean_time_per_iteration /= max_it;
cvgce_iteration = k;
INFO("MINRES converged at iteration %d to a solution with relative residual %.2E.\n",
k,rel_rnorm);
}
<<MINRES (Vorst) II>>=
void iterativeSolvers::minres_vorst(float *x, BTBT* A, float *b, int max_it, float* x0)
{
float alpha, beta, gamma_i, gamma_im1, sigma_i, sigma_im1,
eta, delta, rho1, rho2, rho3, rnorm0;
int k;
<<MINRES (Vorst) init>>
// tid.tic();
for (k=0;k<max_it;k++) {
//printf("MINRES IT=%d - ",k);
<<MINRES (Vorst) loop>>
}
// tid.toc(&mean_time_per_iteration);
// mean_time_per_iteration /= max_it;
INFO("MINRES converged at iteration %d to a solution with relative residual %.2E.\n",
k,rel_rnorm);
}
<<MINRES (Vorst) III>>=
void iterativeSolvers::minres_vorst(float *x, BTBT* A, float *b, float rtol, float* x0)
{
float alpha, beta, gamma_i, gamma_im1, sigma_i, sigma_im1,
eta, delta, rho1, rho2, rho3, rnorm0;
int k;
<<MINRES (Vorst) init>>
RTOL = rtol;
// tid.tic();
for (k=0;k<N_ITERATION;k++) {
//printf("MINRES IT=%d - ",k);
<<MINRES (Vorst) loop>>
}
// tid.toc(&mean_time_per_iteration);
// mean_time_per_iteration /= max_it;
INFO("MINRES converged at iteration %d to a solution with relative residual %.2E.\n",
k,rel_rnorm);
}
<<MINRES (Vorst) with residue>>=
void iterativeSolvers::minres_vorst(float *x, float *res, BTBT* A, float *b, int max_it, float* x0)
{
float alpha, beta, gamma_i, gamma_im1, sigma_i, sigma_im1,
eta, delta, rho1, rho2, rho3, res_cst, result, rnorm0;
int k;
float *residual;
HANDLE_ERROR( cudaMalloc( (void**)&residual, sizeof(float)*N ) );
<<MINRES (Vorst) init>>
tid.tic();
for (k=0;k<max_it;k++) {
//printf("MINRES IT=%d - ",k);
<<MINRES (Vorst) loop>>
A->MVM(residual , x);
res_cst = -1;
cublasSscal(handle, N, &res_cst, residual, 1);
res_cst = 1;
cublasSaxpy(handle, N, &res_cst, b, 1, residual, 1);
cublasSnrm2(handle, N, residual, 1, &result);
res[k] = result;
INFO("(%3d): Residue norm: %.2E\n",k,result);
}
tid.toc(&mean_time_per_iteration);
mean_time_per_iteration /= max_it;
HANDLE_ERROR( cudaFree( residual) );
}
@
The MINRES algorithm is written:
\begin{itemize}
\item initialization:
\begin{enumerate}
\item $r=Ax_0$
<<MINRES (Vorst) init>>=
A->MVM(nu_i , x0);
@ \item $r_0 = b - r_0$
<<MINRES (Vorst) init>>=
beta = -1;
cublasSscal(handle, N, &beta, nu_i, 1);
alpha = 1;
cublasSaxpy(handle, N, &alpha, b, 1, nu_i, 1);
#ifdef MINRES_DEBUG
cublasSnrm2(handle, N, b, 1, &beta);
INFO("beta=%.2E\n",beta);
#endif
@ \item $\beta_1=\left\| \nu_1 \right\|_2=\left\| r \right\|_2$
<<MINRES (Vorst) init>>=
//cublasSnrm2(handle, N, nu_i, 1, &beta);
cublasSdot(handle, N, nu_i, 1, nu_i, 1, &beta);
beta = sqrtf(beta);
rnorm = beta;
//rnorm0 = rnorm;
cublasSnrm2(handle, N, b, 1, &rnorm0);
rel_rnorm = rnorm/rnorm0;
if (VERBOSE)
INFO("#%3d: r norm=%6.2E - rel. tol.=%6.2E\n",0,rnorm,rnorm/rnorm0);
if ( (rnorm<ATOL) || (rel_rnorm<=RTOL) )
return;
#ifdef MINRES_DEBUG
INFO("beta=%.2E\n",beta);
#endif
@ \item $\eta=\beta_1$, $\gamma_1=\gamma_0=1$, $\sigma_1=\sigma_0=0$, $\nu_0=0$
<<MINRES (Vorst) init>>=
eta = beta;
gamma_i = gamma_im1 = 1;
sigma_i = sigma_im1 = 0;
@ \item $\nu_0=0$, $w_0=w_{-1}=0$
<<MINRES (Vorst) init>>=
HANDLE_ERROR( cudaMemset(nu_im1, 0, sizeof(float)*N ) );
HANDLE_ERROR( cudaMemset(w_im1 , 0, sizeof(float)*N ) );
HANDLE_ERROR( cudaMemset(w_im2 , 0, sizeof(float)*N ) );
@ \end{enumerate}
\item loop:
\begin{enumerate}
\item $\nu_i = \nu_i / \beta_i$
<<MINRES (Vorst) loop>>=
rho1 = 1/beta;
cublasSscal(handle, N, &rho1, nu_i, 1);
@ \item $\nu_{i+1}= A\nu_i$
<<MINRES (Vorst) loop>>=
A->MVM(nu_ip1 , nu_i);
@ \item $\alpha_i = \nu_i^T\nu_{i+1}$
<<MINRES (Vorst) loop>>=
cublasSdot(handle, N, nu_i, 1, nu_ip1, 1, &alpha);
#ifdef MINRES_DEBUG
INFO("alpha=%.2E\n",alpha);
#endif
@ \item $\nu_{i+1} = \nu_{i+1} - \alpha_i\nu_i - \beta_i\nu_{i-1}$
<<MINRES (Vorst) loop>>=
alpha = -alpha;
beta = -beta;
cublasSaxpy(handle, N, &alpha, nu_i, 1, nu_ip1, 1);
cublasSaxpy(handle, N, &beta, nu_im1, 1, nu_ip1, 1);
alpha = -alpha;
beta = -beta;
@ \item $\delta = \gamma_i\alpha_i - \gamma_{i-1}\sigma_i\beta_i$
<<MINRES (Vorst) loop>>=
delta = gamma_i*alpha - gamma_im1*sigma_i*beta;
#ifdef MINRES_DEBUG
INFO("delta=%.2E\n",delta);
#endif
@ \item $\rho_2 = \sigma_i\alpha_i + \gamma_{i-1}\gamma_i\beta_i$
<<MINRES (Vorst) loop>>=
rho2 = sigma_i*alpha + gamma_im1*gamma_i*beta;
#ifdef MINRES_DEBUG
INFO("rho2=%.2E\n",rho2);
#endif
@ \item $\rho_3 = \sigma_{i-1}\beta_i$
<<MINRES (Vorst) loop>>=
rho3 = sigma_im1*beta;
#ifdef MINRES_DEBUG
INFO("rho3=%.2E\n",rho3);
#endif
@ \item $\beta_{i+1}=\left\| \nu_{i+1} \right\|_2$
<<MINRES (Vorst) loop>>=
//cublasSnrm2(handle, N, nu_ip1, 1, &beta);
cublasSdot(handle, N, nu_ip1, 1, nu_ip1, 1, &beta);
beta = sqrtf(beta);
#ifdef MINRES_DEBUG
INFO("beta=%.2E\n",beta);
#endif
@ \item $\rho_1 = \sqrt{\delta^2+\beta_{i+1}^2}$
<<MINRES (Vorst) loop>>=
rho1 = hypotf(delta,beta);
#ifdef MINRES_DEBUG
INFO("rho1=%.2E\n",rho1);
#endif
@ \item $\gamma_{i+1} = \delta / \rho_1$
<<MINRES (Vorst) loop>>=
gamma_im1 = gamma_i;
gamma_i = delta/rho1;
#ifdef MINRES_DEBUG
INFO("gamma_im1=%.2E\n",gamma_im1);
INFO("gamma_i=%.2E\n",gamma_i);
#endif
@ \item $\sigma_{i+1} = \beta_{i+1} / \rho_1$
<<MINRES (Vorst) loop>>=
sigma_im1 = sigma_i;
sigma_i = beta/rho1;
#ifdef MINRES_DEBUG
INFO("sigma_im1=%.2E\n",sigma_im1);
INFO("sigma_i=%.2E\n",sigma_i);
#endif
@ \item $w_i = (\nu_i-\rho_3w_{i-2}-\rho_2w_{i-1})/\rho_1$
<<MINRES (Vorst) loop>>=
cublasScopy(handle, N, nu_i, 1, w_i, 1);
rho1 = 1.0/rho1;
cublasSscal(handle, N, &rho1, w_i, 1);
rho3 = -rho3*rho1;
cublasSaxpy(handle, N, &rho3, w_im2, 1, w_i, 1);
rho2 = -rho2*rho1;
cublasSaxpy(handle, N, &rho2, w_im1, 1, w_i, 1);
@ \item $x_i = x_{i-1} + \gamma_{i+1}\eta w_i$
<<MINRES (Vorst) loop>>=
rho1 = gamma_i*eta;
cublasSaxpy(handle, N, &rho1, w_i, 1, x, 1);
@ \item $\left\| r_i \right\|_2 = \left| \sigma_{i+1} \right|\left\| r_{i-1} \right\|_2 $
<<MINRES (Vorst) loop>>=
rnorm = rnorm*fabs(sigma_i);
rel_rnorm = rnorm/rnorm0;
if (VERBOSE)
INFO("#%3d: r norm=%6.2E - rel. tol.=%6.2E\n",k,rnorm,rnorm/rnorm0);
if ( (rnorm<ATOL) || (rel_rnorm<=RTOL) )
break;
@ \item $\eta = \sigma_{i+1}\eta$
<<MINRES (Vorst) loop>>=
eta = -sigma_i*eta;
#ifdef MINRES_DEBUG
INFO("eta=%.2E\n",eta);
#endif
@ \item
\begin{eqnarray}
\label{eq:3}
\nu_{i-1} &\leftarrow& \nu_i \nonumber\\
\nu_{i} &\leftarrow& \nu_{i+1} \nonumber\\
w_{i-2} &\leftarrow& w_{i-1} \nonumber\\
w_{i-1} &\leftarrow& \nu_{i} \nonumber
\end{eqnarray}
<<MINRES (Vorst) loop>>=
cublasScopy(handle, N, nu_i , 1, nu_im1, 1);
cublasScopy(handle, N, nu_ip1, 1, nu_i , 1);
cublasScopy(handle, N, w_im1 , 1, w_im2 , 1);
cublasScopy(handle, N, w_i , 1, w_im1 , 1);
if (beta==0) break;
@ \end{enumerate}
\end{itemize}
\subsubsection{Choi implementation}
\label{sec:choi-implementation}
\index{iterativeSolvers!iterativeSolvers!minres\_choi}
<<MINRES (Choi)>>=
void iterativeSolvers::minres_choi(float *x, BTBT* A, float *b, int max_it, float* x0)
{
float a, alpha, beta, gamma, delta1, delta2, phi, tau, c, s, epsilon;
int k;
<<MINRES (Choi) init>>
tid.tic();
for (k=0;k<max_it;k++) {
//INFO("MINRES IT=%d - ",k);
<<MINRES (Choi) loop>>
}
tid.toc(&mean_time_per_iteration);
mean_time_per_iteration /= max_it;
}
@
The MINRES algorithm is written:
\begin{itemize}
\item initialization:
\begin{enumerate}
\item $\beta_1=\left\| b \right\|_2$
<<MINRES (Choi) init>>=
cublasSnrm2(handle, N, b, 1, &beta);
@ \item $\nu_0 = 0$
<<MINRES (Choi) init>>=
HANDLE_ERROR( cudaMemset(nu_im1, 0, sizeof(float)*N ) );
@ \item $\nu_1 = b/\beta_1$
<<MINRES (Choi) init>>=
HANDLE_ERROR( cudaMemset(nu_i, 0, sizeof(float)*N ) );
a = 1/beta;
cublasSaxpy(handle, N, &a, b, 1, nu_i, 1);
@ \item $\phi=\tau=\beta_1$, $\delta^{(1)}_1=0$, $c_0=-1$, $s_0=0$
<<MINRES (Choi) init>>=
phi= tau = beta;
delta1 = 0;
c = -1;
s = 0;
@ \item $d_0=d_{-1}=0$
<<MINRES (Choi) init>>=
HANDLE_ERROR( cudaMemset(w_im1 , 0, sizeof(float)*N ) );
HANDLE_ERROR( cudaMemset(w_im2 , 0, sizeof(float)*N ) );
@ \end{enumerate}
\item loop:
\begin{enumerate}
\item Lanczos step
<<MINRES (Choi) loop>>=
lanczos_step(&alpha, &beta, nu_ip1, A, nu_i, nu_im1, 0);
@ \item $\delta^{(2)}_k=c_{k-1}\delta^{(1)}_k + s_{k-1}\alpha_k$
<<MINRES (Choi) loop>>=
delta2 = c*delta1 + s*alpha;
@ \item $\gamma^{(1)}_k=s_{k-1}\delta^{(1)}_k - c_{k-1}\alpha_k$
<<MINRES (Choi) loop>>=
gamma = s*delta1 - c*alpha;
@ \item $\epsilon_k = s_{k-1}\beta_{k+1}$
<<MINRES (Choi) loop>>=
epsilon = s*beta;
@ \item $\delta^{(1)}_k = -c_{k-1}\beta_{k+1}$
<<MINRES (Choi) loop>>=
delta1 = -c*beta;
@ \item sym--ortho
<<MINRES (Choi) loop>>=
a = gamma;
sym_ortho(&c, &s, &gamma, &a, &beta);
@ \item $\tau_k = c_k\phi_k$
<<MINRES (Choi) loop>>=
tau = c*phi;
@ \item $\phi_k = s_k\phi_{k-1}$
<<MINRES (Choi) loop>>=
phi = s*phi;
//INFO("r norm=%6.2E\n",phi);
@ \item $\left\| r \right\|_2 = \phi_k$
<<MINRES (Choi) loop>>=
rnorm = phi;
@ \item $d_k = (\nu_k-\delta^{(2)}_k d_{k-1}-\epsilon_k d_{k-2})/\gamma_k$
<<MINRES (Choi) loop>>=
if (gamma==0) break;
cublasScopy(handle, N, nu_i, 1, w_i, 1);
a = 1/gamma;
cublasSscal(handle, N, &a, w_i, 1);
a = -delta2/gamma;
cublasSaxpy(handle, N, &a, w_im1, 1, w_i, 1);
a = -epsilon/gamma;
cublasSaxpy(handle, N, &a, w_im2, 1, w_i, 1);
@ \item $x_k = x_{k-1} + \tau d_k$
<<MINRES (Choi) loop>>=
cublasSaxpy(handle, N, &tau, w_i, 1, x, 1);
@ \item
\begin{eqnarray}
\label{eq:1a}
\nu_{i-1} &\leftarrow& \nu_i \nonumber\\
\nu_{i} &\leftarrow& \nu_{i+1} \nonumber\\
d_{i-2} &\leftarrow& d_{i-1} \nonumber\\
d_{i-1} &\leftarrow& d_{i} \nonumber
\end{eqnarray}
<<MINRES (Choi) loop>>=
cublasScopy(handle, N, nu_i , 1, nu_im1, 1);
cublasScopy(handle, N, nu_ip1, 1, nu_i , 1);
cublasScopy(handle, N, w_im1 , 1, w_im2 , 1);
cublasScopy(handle, N, w_i , 1, w_im1 , 1);
@ \end{enumerate}
\end{itemize}
@
The sym--ortho function is:
\index{iterativeSolvers!iterativeSolvers!sym\_ortho}
<<sym-ortho algorithm>>=
void iterativeSolvers::sym_ortho(float *c,float *s,float *r,float *a,float *b) {
float abs_a, abs_b, tau;
abs_a = fabs(*a);
abs_b = fabs(*b);
if (*b==0) {
*s = 0;
*r = abs_a;
*c = (*a==0) ? 1 : sign(*a);
} else if (*a==0) {
*c = 0;
*s = sign(*b);
*r = abs_b;
} else if (abs_b>abs_a) {
tau = *a/(*b);
*s = sign(*b)/sqrtf(1+tau*tau);
*c = *s*tau;
*r = *b/(*s);
} else if (abs_a>abs_b) {
tau = *b/(*a);
*c = sign(*a)/sqrtf(1+tau*tau);
*s = *c*tau;
*r = *a/(*c);
}
}
@
The Lanzcos step is:
\index{iterativeSolvers!iterativeSolvers!lanczos\_step}
<<Lanczos step>>=
void iterativeSolvers::lanczos_step(float *alpha, float *beta, float *nu_kp1,
BTBT *A, float *nu_k, float *nu_km1, float sigma)
{
float a;
A->MVM(nu_kp1, nu_k);
if (sigma!=0) {
a = -sigma;
cublasSaxpy(handle, N, &a, nu_k, 1, nu_kp1, 1);
}
cublasSdot(handle, N, nu_k, 1, nu_kp1, 1, alpha);
a = -(*alpha);
cublasSaxpy(handle, N, &a, nu_k, 1, nu_kp1, 1);
a = -(*beta);
cublasSaxpy(handle, N, &a, nu_km1, 1, nu_kp1, 1);
cublasSnrm2(handle, N, nu_kp1, 1, beta);
if (beta!=0) {
a = 1/(*beta);
cublasSscal(handle, N, &a, nu_kp1, 1);
}
}
@
\section{Tests}
\label{sec:tests}
The test routine is:
<<iterativeSolvers.bin>>=
#include "cuda_profiler_api.h"
#ifndef __CEO_H__
#include "ceo.h"
#endif
//#define N 10
<<PA input>>
__global__ void fill(float *data, int n_data, float value)
{
int k = blockIdx.x * blockDim.x + threadIdx.x;
if (k<n_data)
data[k] = value;
}
// Solving Ax=b
int main(int argc,char *argv[]) {
<<complete test>>
}
@
In the following a simple test is performed:
<<simple test I>>=
/*
The test requires to replace the MVM with MVM (Test 1)
in the code chunk BTBT.cu in the file BTBT.nw
*/
float *d__x, *d__b;
HANDLE_ERROR( cudaMalloc((void**)&d__x, sizeof(float)*N ) );
HANDLE_ERROR( cudaMemset(d__x, 0, sizeof(float)*N ) );
HANDLE_ERROR( cudaMalloc((void**)&d__b, sizeof(float)*N ) );
float b[N] = {1,2,3,4,5,6,7,8,9,10};
HANDLE_ERROR( cudaMemcpy( d__b, b, N*sizeof(float), cudaMemcpyHostToDevice) );
BTBT A;
A.setup(N);
iterativeSolvers iSolve;
/* iSolve.cg_setup(N); */
/* iSolve.cg(d__x, &A, d__b, 5, d__x); */
iSolve.minres_setup(N);
iSolve.minres(d__x, &A, d__b, 5, d__x);
iSolve.cleanup();
float *x;
x = (float *)malloc(sizeof(float)*N);
HANDLE_ERROR( cudaMemcpy( x, d__x, N*sizeof(float), cudaMemcpyDeviceToHost) );
for (int k=0;k<N;k++)
INFO("(%d) %5.1f\n",k,x[k]);
HANDLE_ERROR( cudaFree( d__x ) );
HANDLE_ERROR( cudaFree( d__b ) );
free(x);
@
<<simple test II>>=
/*
The test requires to replace the MVM with MVM (Test 2)
in the code chunk BTBT.cu in the file BTBT.nw
*/
float *x;
x = (float *)malloc(sizeof(float)*N);
float *d__x, *d__b;
HANDLE_ERROR( cudaMalloc((void**)&d__x, sizeof(float)*N ) );
HANDLE_ERROR( cudaMalloc((void**)&d__b, sizeof(float)*N ) );
for (int k=0;k<N;k++) {
x[k] = 1.0 + (float)rand();
INFO("(%d) %.3E\n",k,x[k]);
}
HANDLE_ERROR( cudaMemcpy( d__b, x, N*sizeof(float), cudaMemcpyHostToDevice ) );
BTBT A;
A.setup(N);
/* A.MVM(d__b,d__x); */
/* HANDLE_ERROR( cudaMemcpy( x, d__b, N*sizeof(float), cudaMemcpyDeviceToHost) ); */
/* for (int k=0;k<N;k++) */
/* printf("(%d) %5.1f\n",k,x[k]); */
fill LLL (1+N/16),16 RRR (d__x, N, 0);
iterativeSolvers iSolve;
/* iSolve.cg_setup(N); */
/* iSolve.cg(d__x, &A, d__b, 5, d__x); */
iSolve.minres_setup(N);
iSolve.minres(d__x, &A, d__b, 5, d__x);
iSolve.cleanup();
HANDLE_ERROR( cudaMemcpy( x, d__x, N*sizeof(float), cudaMemcpyDeviceToHost) );
for (int k=0;k<N;k++)
INFO("(%d) %5.1f\n",k,x[k]);
HANDLE_ERROR( cudaFree( d__x ) );
HANDLE_ERROR( cudaFree( d__b ) );
free(x);
@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% TEST ZONE
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
This is the comprehensive suite of tests.
There are 3 tests available: [[[main test]]], [[convergence test]] and [[statistics test]].
$r_0$ is set to 15cm per default but can be override to any other value after the variable declaration.
<<complete test>>=
<<declaration>>
// r0 = d;
<<initialization>>
<<hot convergence test>>
<<cleanup and free memory>>
@
In the following, all the variables are declared and set to their default or user given values.
<<declaration>>=
int N = _N_LENSLET_*2, NP, PS_E_N_PX, k, osf;
float *d__x, *d__phase_est, *phase_screen_est;
float D, d, delta, delta_e, r0=0.15;
char *d__mask;
if (argc != 3) {
INFO("Usage: %s D CG or MINRES\n",argv[0]);
return 1;
}
char solver[10];
sprintf(solver,"%s",argv[2]);
printf("\n___ SOLVER: %s ___\n",solver);
D = atof(argv[1]);
d = D/N_SIDE_LENSLET;
delta = d/_N_PX_PUPIL_;
osf = 1;
delta_e = d/osf;
NP = osf*N_SIDE_LENSLET+1;
PS_E_N_PX = NP;
PS_E_N_PX *= PS_E_N_PX;
printf("\nd =%.4f\n",d);
printf("\ndelta=%.4f\n",delta);
printf("\ndelta_e=%.4f\n",delta_e);
source src, gs;
atmosphere atm;
mask pupil_mask;
imaging lenslet_array;
centroiding cog;
aaStats aa;
BTBT aaCov;
paStats pa;
BTBT paCov;
iterativeSolvers iSolve;
stats S;
S.setup();
stopwatch tid;
@
Then all the components are initialized and the dynamic arrays are allocated.
<<initialization>>=
src.setup(ARCSEC(0) , 0, INFINITY, PS_E_N_PX);
gs.setup(ARCSEC(0) , 0, 90e3, PS_E_N_PX);
// Atmosphere
// Single layer turbulence profile
float altitude[] = {0},
xi0[] = {1},
wind_speed[] = {10},
wind_direction[] = {0};
//atm.setup(r0,30,altitude,xi0,wind_speed,wind_direction);
// GMT 7 layers turbulence profile
atm.gmt_setup(r0,30);
//atm.reset();
pupil_mask.setup( (N_SIDE_LENSLET+1)*(N_SIDE_LENSLET+1) );
// SH WFS
lenslet_array.setup(_N_PX_PUPIL_,N_SIDE_LENSLET,2,1,1);
// Centroid
cog.setup();
// covariances
aa.setup(N_SIDE_LENSLET,&atm,d, &gs, 1);
aa.toFile("aaCovariance.bin");
printf("\n AA variance [rd^2]: %.3E\n",aa.variance());
aaCov.setup(2,2,N_SIDE_LENSLET,aa.d__cov);
pa.setup(NP,N_SIDE_LENSLET,osf,&atm,d,&src,1,&gs,1);
pa.toFile("paCovariance.bin");
paCov.setup(1,2,NP,N_SIDE_LENSLET,pa.d__cov);
HANDLE_ERROR( cudaMalloc((void**)&d__mask, sizeof(char)*_N_LENSLET_ ) );
dim3 blockDim_vl(16,16);
dim3 gridDim_vl(N_SIDE_LENSLET/16+1,N_SIDE_LENSLET/16+1);
valid_lenslet LLL gridDim_vl,blockDim_vl RRR (d__mask,N_SIDE_LENSLET*_N_PX_PUPIL_,0.5);
/* char *L_mask, *A_mask; */
/* HANDLE_ERROR( cudaMalloc((void**)&L_mask, sizeof(char)*_N_LENSLET_ ) ); */
/* HANDLE_ERROR( cudaMemset(L_mask, 0, sizeof(char)*_N_LENSLET_ ) ); */
/* HANDLE_ERROR( cudaMalloc((void**)&A_mask, sizeof(char)*PS_E_N_PX ) ); */
/* HANDLE_ERROR( cudaMemset(A_mask, 0, sizeof(char)*PS_E_N_PX ) ); */
/* fried_geometry LLL gridDim_vl,blockDim_vl RRR (L_mask, A_mask, N_SIDE_LENSLET, 16, 0.5); */
/* dev2file("L_mask.bin",L_mask,_N_LENSLET_); */
/* dev2file("A_mask.bin",A_mask,PS_E_N_PX); */
/* HANDLE_ERROR( cudaFree( L_mask) ); */
/* HANDLE_ERROR( cudaFree( A_mask) ); */
char MASK_SET;
MASK_SET = fried_geometry_setup(cog.lenslet_mask, pupil_mask.m, N_SIDE_LENSLET, 16, 0.5);
//printf("MASK_SET = %d\n",MASK_SET);
pupil_mask.set_filter();
src.wavefront.M = &pupil_mask;
src.wavefront.masked();
char A_mask_filename[32];
sprintf(A_mask_filename,"A_mask_%03d.bin",N_SIDE_LENSLET);
dev2file(A_mask_filename,pupil_mask.m,PS_E_N_PX);
HANDLE_ERROR( cudaMalloc( (void**)&d__phase_est , sizeof(float)*PS_E_N_PX ) );
float *d__phase_screen_low_res, *d__phase_screen_est, *phase_screen_low_res;
d__phase_screen_low_res = 0;
phase_screen_low_res = 0;
phase_screen_est = 0;
@
\subsection{Main test}
\label{sec:main-test}
The [[main test]] is simply a wavefront reconstruction routine.
<<main test>>=
atm.get_phase_screen(&src,delta_e,NP,delta_e,NP,0);
src.phase2file("phaseScreenLowRes.bin");
<<wavefront sensing>>
// Iterative solver
//iSolve.cg_setup(N);
iSolve.minres_setup(N);
//aaCov.mask = d__mask;
dev2file("mask.bin",d__mask,_N_LENSLET_);
char filename[100];
/* for (k=1;k<6;k++) { */
k = 100;
sprintf(filename,"MINRES_phaseEst_%d.bin",k);
//sprintf(filename,"CG_phaseEst_%d.bin",k);
HANDLE_ERROR( cudaMalloc((void**)&d__x, sizeof(float)*N ) );
HANDLE_ERROR( cudaMemset(d__x, 0, sizeof(float)*N ) );
tid.tic();
//cudaProfilerStart();
iSolve.minres_vorst(d__x, &aaCov, cog.d__c, d__x);
//iSolve.cg(d__x, &aaCov, cog.d__c, k, d__x);
//cudaProfilerStop();
<<MVM>>
tid.toc("WAVEFRONT ESTIMATION");
<<MVM (wrap.)>>
//printf("\nSolver residue norm : %.2E\n",iSolve.rnorm);
//printf("\nSolver mean time per iteration: %.2E\n",iSolve.mean_time_per_iteration);
/* } */
@
\subsection{Convergence test}
\label{sec:convergence-test}
The [[convergence test]] reconstructs the wavefront with both iterative solver MINRES and CG at each iteration step.
<<convergence test>>=
atm.get_phase_screen(&src,delta_e,NP,delta_e,NP,0);
char filename[100];
sprintf(filename,"CVGCE_phaseScreenLowRes_%03d.bin",N_SIDE_LENSLET);
src.phase2file(filename);
<<wavefront sensing>>
if (cog.MASK_SET) aaCov.mask = cog.lenslet_mask;
sprintf(filename,"bins/CVGCE_%s_phaseEst_200_%03d.bin",solver,N_SIDE_LENSLET);
FILE *fid;
fid = fopen(filename,"wb");
int n_page = 1, n_data = PS_E_N_PX*200;
fwrite(&n_page,sizeof(int),1,fid);
fwrite(&n_data,sizeof(int),1,fid);
phase_screen_est = (float*)malloc(sizeof(float)*PS_E_N_PX);
HANDLE_ERROR( cudaMalloc((void**)&d__x, sizeof(float)*N ) );
// CG
if (strcmp(solver,"CG")==0) {
iSolve.pcg_setup(N);
for (k=1;k<=200;k++) {
printf("\n______ ITERATION #: %03d ______\n",k);
HANDLE_ERROR( cudaMemset(d__x, 0, sizeof(float)*N ) );
tid.tic();
iSolve.pcg(d__x, &aaCov, cog.d__c, k, d__x, 1./aa.variance());
<<MVM>>
tid.toc("WAVEFRONT ESTIMATION");
HANDLE_ERROR( cudaMemcpy( phase_screen_est, d__phase_est,
sizeof(float)*PS_E_N_PX,
cudaMemcpyDeviceToHost ) );
fwrite(phase_screen_est,sizeof(float),PS_E_N_PX,fid);
printf("\nSolver residue norm : %.2E\n",iSolve.rnorm);
printf("\nSolver mean time per iteration: %.2E\n",iSolve.mean_time_per_iteration);
printf("\n-------------------------------\n");
}
}
// MINRES
if (strcmp(solver,"MINRES")==0) {
iSolve.minres_setup(N);
iSolve.RTOL = 1e-6;
for (k=1;k<=200;k++) {
printf("\n______ ITERATION #: %03d ______\n",k);
HANDLE_ERROR( cudaMemset(d__x, 0, sizeof(float)*N ) );
tid.tic();
iSolve.minres_vorst(d__x, &aaCov, cog.d__c, k, d__x);
<<MVM>>
tid.toc("WAVEFRONT ESTIMATION");
HANDLE_ERROR( cudaMemcpy( phase_screen_est, d__phase_est,
sizeof(float)*PS_E_N_PX,
cudaMemcpyDeviceToHost ) );
fwrite(phase_screen_est,sizeof(float),PS_E_N_PX,fid);
printf("\nSolver residue norm : %.2E\n",iSolve.rnorm);
printf("\nSolver mean time per iteration: %.2E\n",iSolve.mean_time_per_iteration);
printf("\n-------------------------------\n");
}
}
fclose(fid);
@
\subsection{Hot convergence test}
\label{sec:hot-convergence-test}
The [[hot convergence test]] reconstructs the wavefront with MINRES re-using the previous estimate as starting point for the next one.
<<hot convergence test>>=
LMMSE lmmse;
lmmse.setup(&atm,&gs,1,&src,1,d,N_SIDE_LENSLET,&pupil_mask,"MINRES");
int n_step = 200;
HANDLE_ERROR( cudaMalloc( (void**)&d__phase_screen_low_res, sizeof(float)*PS_E_N_PX*n_step ) );
float *d__gs_phase_screen_low_res;
HANDLE_ERROR( cudaMalloc( (void**)&d__gs_phase_screen_low_res, sizeof(float)*PS_E_N_PX*n_step ) );
HANDLE_ERROR( cudaMalloc( (void**)&d__phase_screen_est, sizeof(float)*PS_E_N_PX*n_step ) );
//if (cog.MASK_SET) aaCov.mask = cog.lenslet_mask;
HANDLE_ERROR( cudaMalloc((void**)&d__x, sizeof(float)*N ) );
HANDLE_ERROR( cudaMemset(d__x, 0, sizeof(float)*N ) );
float time_step,tau=0, tau0=0, delta_tau;
time_step = 2;
//iSolve.minres_setup(N);
lmmse.iSolve.VERBOSE = 0;
lmmse.iSolve.RTOL = 5E-2;
// First wavefront estimate
atm.get_phase_screen_gradient(&cog,N_SIDE_LENSLET,d,&gs,tau);
tid.tic();
//iSolve.minres_vorst(d__x, &aaCov, cog.d__c, d__x);
lmmse.estimation(&cog);
tid.toc(&tau0,"WAVEFRONT ESTIMATION");
tau = tau0;
//lmmse.iSolve.N_ITERATION = 5;
//lmmse.iSolve.ATOL = lmmse.iSolve.rnorm;
// MINRES
for (k=1;k<=n_step;k++) {
printf("\n______ ITERATION #: %03d ______\n",k);
tau += time_step;
atm.get_phase_screen_gradient(&cog,N_SIDE_LENSLET,d,&gs,1E-3*tau);
// lmmse.d__phase_est_c = d__phase_screen_est + (k-1)*PS_E_N_PX;
lmmse.set_phase_est_ptr( d__phase_screen_est + (k-1)*PS_E_N_PX );
tid.tic();
/* iSolve.minres_vorst(d__x, &aaCov, cog.d__c, d__x); */
/* paCov.MVM(d__phase_screen_est + (k-1)*PS_E_N_PX, d__x); */
lmmse.estimation(&cog);
tid.toc(&delta_tau, "WAVEFRONT ESTIMATION");
//tau += delta_tau + 0*time_step;
src.wavefront.phase = d__phase_screen_low_res + (k-1)*PS_E_N_PX;
atm.get_phase_screen(&src,delta_e,NP,delta_e,NP,1E-3*tau);
gs.wavefront.phase = d__gs_phase_screen_low_res + (k-1)*PS_E_N_PX;
atm.get_phase_screen(&gs,delta_e,NP,delta_e,NP,1E-3*tau);
/* HANDLE_ERROR( cudaMemcpy( d__phase_screen_low_res + (k-1)*PS_E_N_PX, src.wavefront.phase, */
/* sizeof(float)*PS_E_N_PX, */
/* cudaMemcpyDeviceToDevice) ); */
//printf("\nSolver residue norm : %.2E\n",iSolve.rnorm);
//printf("\nSolver mean time per iteration: %.2E\n",iSolve.mean_time_per_iteration);
printf("\n-------------------------------\n");
/* if (k==1) */
/* iSolve.ATOL = iSolve.rnorm; */
//if (k>1)
//iSolve.N_ITERATION = 10;
}
char filename[100];
sprintf(filename,"CVGCE_phaseScreenLowRes_%03d.bin",N_SIDE_LENSLET);
dev2file(filename,d__phase_screen_low_res,PS_E_N_PX*n_step);
sprintf(filename,"CVGCE_GS_phaseScreenLowRes_%03d.bin",N_SIDE_LENSLET);
dev2file(filename,d__gs_phase_screen_low_res,PS_E_N_PX*n_step);
sprintf(filename,"CVGCE_%s_phaseEst_%03d_%03d.bin",solver,n_step,N_SIDE_LENSLET);
dev2file(filename,d__phase_screen_est,PS_E_N_PX*n_step);
lmmse.cleanup();
HANDLE_ERROR( cudaFree( d__gs_phase_screen_low_res ) );
@
\subsection{Statistics test}
\label{sec:statistics-test}
The [[statistics test]] reconstructs the wavefront n times, each time a new statistically independent phase screen is generated.
<<statistics test>>=
float *d__phase_screen_low_res;
HANDLE_ERROR( cudaMalloc( (void**)&d__phase_screen_low_res, sizeof(float)*PS_E_N_PX ) );
float *phase_screen_low_res;
phase_screen_low_res = (float*)malloc(sizeof(float)*PS_E_N_PX);
HANDLE_ERROR( cudaMemcpy( phase_screen_low_res, d__phase_screen_low_res,
sizeof(float)*PS_E_N_PX,
cudaMemcpyDeviceToHost ) );
<<linear solver (prep.)>>
aaCov.mask = d__mask;
<<<<MVM (prep.)>>
FILE *fid0, *fid;
char filename[100];
sprintf(filename,"STATS_phaseScreenLowRes_%03d.bin",N_SIDE_LENSLET);
fid0 = fopen(filename,"wb");
int nSample = 200, nIt = 200;
sprintf(filename,"STATS_%s_phaseEst_%03d_%03d.bin",solver,nSample,N_SIDE_LENSLET);
fid = fopen(filename,"wb");
phase_screen_est = (float*)malloc(sizeof(float)*PS_E_N_PX);
HANDLE_ERROR( cudaMalloc( (void**)&d__phase_est , sizeof(float)*PS_E_N_PX ) );
// CG
if (strcmp(solver,"CG")==0) {
iSolve.cg_setup(N);
for (k=1;k<=nSample;k++) {
printf("\n______ ITERATION #: %03d ______\n",k);
<<atmosphere propagation and wavefront sensing>>
HANDLE_ERROR( cudaMemset(d__x, 0, sizeof(float)*N ) );
tid.tic();
iSolve.cg(d__x, &aaCov, cog.d__c, nIt, d__x);
<<MVM>>
tid.toc("WAVEFRONT ESTIMATION");
HANDLE_ERROR( cudaMemcpy( phase_screen_est, d__phase_est,
sizeof(float)*PS_E_N_PX,
cudaMemcpyDeviceToHost ) );
fwrite(phase_screen_est,sizeof(float),PS_E_N_PX,fid);
printf("\nSolver residue norm : %.2E\n",iSolve.rnorm);
printf("\nSolver mean time per iteration: %.2E\n",iSolve.mean_time_per_iteration);
printf("\n-------------------------------\n");
atm.reset();
}
}
// MINRES
if (strcmp(solver,"MINRES")==0) {
iSolve.minres_setup(N);
for (k=1;k<=nSample;k++) {
printf("\n______ ITERATION #: %03d ______\n",k);
<<atmosphere propagation and wavefront sensing>>
HANDLE_ERROR( cudaMemset(d__x, 0, sizeof(float)*N ) );
tid.tic();
iSolve.minres_vorst(d__x, &aaCov, cog.d__c, nIt, d__x);
<<MVM>>
tid.toc("WAVEFRONT ESTIMATION");
HANDLE_ERROR( cudaMemcpy( phase_screen_est, d__phase_est,
sizeof(float)*PS_E_N_PX,
cudaMemcpyDeviceToHost ) );
HANDLE_ERROR( cudaThreadSynchronize() );
fwrite(phase_screen_est,sizeof(float),PS_E_N_PX,fid);
printf("\n Data saved to file!\n");
printf("\nSolver residue norm : %.2E\n",iSolve.rnorm);
printf("\nSolver mean time per iteration: %.2E\n",iSolve.mean_time_per_iteration);
printf("\n-------------------------------\n");
atm.reset();
}
}
fclose(fid0);
fclose(fid);
@
This is the place where we terminate the test elegantly.
<<cleanup and free memory>>=
atm.cleanup();
lenslet_array.cleanup();
cog.cleanup();
aa.cleanup();
pa.cleanup();
aaCov.cleanup();
paCov.cleanup();
iSolve.cleanup();
S.cleanup();
HANDLE_ERROR( cudaFree( d__x ) );
HANDLE_ERROR( cudaFree( d__phase_est ) );
if (d__phase_screen_low_res) HANDLE_ERROR( cudaFree( d__phase_screen_low_res ) );
if (d__phase_screen_est) HANDLE_ERROR( cudaFree( d__phase_screen_est ) );
HANDLE_ERROR( cudaFree( d__mask ) );
if (phase_screen_low_res) free(phase_screen_low_res);
if (phase_screen_est) free(phase_screen_est);
@
<<PA input>>=
__global__ void set_pa_input(float *pa_c, float *aa_c, int *idx, int N)
{
int i, j, k;
i = blockIdx.x * blockDim.x + threadIdx.x;
j = blockIdx.y * blockDim.y + threadIdx.y;
if ( (i<N) && (j<N) ) {
k = i*N + j;
pa_c[idx[k]] = aa_c[k];
}
}
@
This is a chunk of code for the [[statistics]] test routine:
<<atmosphere propagation and wavefront sensing>>=
atm.get_phase_screen(d__phase_screen_low_res,delta_e,NP,delta_e,NP,d__src,0);
HANDLE_ERROR( cudaMemcpy( phase_screen_low_res, d__phase_screen_low_res,
sizeof(float)*PS_E_N_PX,
cudaMemcpyDeviceToHost ) );
fwrite(phase_screen_low_res,sizeof(float),PS_E_N_PX,fid0);
<<wavefront sensing (geometric and masked)>>
@
The wavefront sensing model is using either a Fourier optics or geometric optics model.
<<wavefront sensing>>=
<<wavefront sensing (geometric)>>
<<save centroids to file>>
<<wavefront sensing (masked)>>=
<<wavefront sensing (geometric and masked)>>
<<save centroids to file>>
@
<<wavefront sensing (Fourier)>>=
lenslet_array.propagate(d__src);
cxy0 = (_N_PX_PUPIL_ - 1)/2.0;
cog.get_data(lenslet_array.d__frame, cxy0, cxy0, slopes2Angle);
@
<<wavefront sensing (geometric)>>=
atm.get_phase_screen_gradient(&cog,N_SIDE_LENSLET,d,&src,1,0);
<<wavefront sensing (geometric and masked)>>=
atm.get_phase_screen_gradient(&cog,N_SIDE_LENSLET,d__mask,d,&src,1,0);
@
<<save centroids to file>>=
dev2file("centroids.bin",cog.d__c,N);
@
The index of the WFS slopes in the augmented input vector of the MVM operation are computed next:
<<linear solver (prep.)>>=
idx = (int *)malloc(sizeof(int)*_N_LENSLET_);
k = -1;
//printf("\n k idx\n");
for (i=1;i<2*N_SIDE_LENSLET;i+=2) {
for (j=1;j<2*N_SIDE_LENSLET;j+=2) {
idx[++k] = i*(2*N_SIDE_LENSLET + 1) + j;
//printf("(%2d) %2d\n",k,idx[k]);
}
}
HANDLE_ERROR( cudaMalloc( (void**)&d__idx, sizeof(int)*_N_LENSLET_ ) );
HANDLE_ERROR( cudaMemcpy( d__idx, idx,
sizeof(int)*_N_LENSLET_,
cudaMemcpyHostToDevice ) );
HANDLE_ERROR( cudaMalloc((void**)&d__x, sizeof(float)*N ) );
x = (float*)malloc(sizeof(float)*N);
@
CG iterative solver call:
<<linear solver (conjugate gradient)>>=
iSolve.cg(d__x, &aaCov, d__b, 5, d__x);
@
MINRES iterative solver call:
<<linear solver (MINRES)>>=
iSolve.minres(d__x, &aaCov, d__b, 5, d__x);
@
Allocation of the input vector for the MVM:
<<MVM (prep.)>>=
HANDLE_ERROR( cudaMalloc((void**)&d__ce, sizeof(float)*PS_E_N_PX*2 ) );
HANDLE_ERROR( cudaMemset(d__ce, 0, sizeof(float)*PS_E_N_PX*2 ) );
dim3 blockDim(16,16);
dim3 gridDim(N_SIDE_LENSLET/16+1,N_SIDE_LENSLET/16+1);
@
Writing the input of the iterative solver into the output of the MVM:
<<MVM>>=
paCov.MVM(d__phase_est,d__x);
@
Saving the MVM wavefront estimate to a file:
<<MVM (wrap.)>>=
dev2file(filename,d__phase_est,PS_E_N_PX);
@