ceo 0.1.0 - Docs.rs

% -*- mode: Noweb; noweb-code-mode: c-mode -*-
@
[[aaStats]] contains a structure and the routines to compute the covariance matrix of the angle--of--arrival.
\index{aaStats}

\section{The files}

\subsection{Header}

<<aaStats.h>>=
#ifndef __AASTATS_H__
#define __AASTATS_H__

#ifndef __UTILITIES_H__
#include "utilities.h"
#endif

#ifndef __ATMOSPHERE_H__
#include "atmosphere.h"
#endif

//#define AASTATS_DEBUG
//#define PASTATS_DEBUG

<<aaStats structure>>

<<paStats stucture>>
@ 
The structures are :
<<aaStats structure>>=
struct aaStats {

  <<aaStats parameters>>

  void setup(int N, const atmosphere *atm, float lenslet_pitch, const source *src, int N_SRC);

  void cleanup(void);

  void info(int kappa, float d);

  float variance(void);

  void toFile(char *filename);
};
@  and
<<paStats stucture>>=
struct paStats {

  <<paStats parameters>>

  void setup(int M_, int N_, int osf_, const atmosphere *atm, float lenslet_pitch, 
             source *phase_src, int N_P_SRC, const source *slopes_src, int N_S_SRC);
  void setup(int M_, int N_, int osf_, const atmosphere *atm, float lenslet_pitch, 
             const source *slopes_src, int N_S_SRC,
             float z_radius);

  void cleanup(void);

  void info(int kappa, float d);

  /*
  void MVM(float *in_vector, float *out_vector, 
	   float d1, float g1, int N1, 
	   float d2, float g2, int N2);
  */
  void MVM(float *out_vector, float *in_vector, 
	   float d1, float g1, int N1, 
	   float d2, int N2,
	   atmosphere *atm,
	   source *phase_src, int N_P_SRC,
	   source *slopes_src, int N_S_SRC) ;

  void toFile(char *filename);

  void variance(void);

};
#endif // __AASTATS_H__
@ 
\subsection{Source}

<<aaStats.cu>>=
#include "aaStats.h"

<<sinc>>
<<somb>>

<<subaperture FT>>
<<subaperture PSF>>

<<phase power spectrum>>

<<aaStats power spectrum kernel>>

<<paStats power spectrum kernel>>
<<paStats polar average power spectrum kernel>>
<<paStats power spectrum kernel (single layer)>>

<<covariance extraction>>
<<PA covariance extraction>>

<<aaStats setup>>

<<paStats setup>>

<<save aa covariance to file>>
<<save pa covariance to file>>

<<aaStats cleanup>>
<<paStats cleanup>>

<<aaStats info>>
<<paStats info>>

<<aaStats variance>>
@ 

\section{The theory}

Shack--Hartmann WFS centroids are a measure of the spatial derivatives of the wavefront averaged on each subaperture.
These derivatives are often referred as angle of arrivals $\vec \alpha=(\alpha_x\alpha_y,)$:
\begin{eqnarray}
  \label{eq:1}
  \alpha_x &=& {\lambda\over 2\pi d^2} {\partial\varphi\over\partial x} \ast \Pi\left(x\over d\right)\Pi\left(y\over d\right),\\
  \alpha_y &=& {\lambda\over 2\pi d^2} {\partial\varphi\over\partial y} \ast \Pi\left(x\over d\right)\Pi\left(y\over d\right),
\end{eqnarray}
where $\lambda$ is the sensing wavelength and $d$ in the size of one subaperture.

The covariance is derived from the power spectrum density of the angle of arrivals $W_{\vec \alpha \cdot\ \vec \beta}(\vec f)$:
  \begin{equation}
    \label{eq:2}
    W_{\vec \alpha\cdot\vec \beta}(\vec f) = \lambda^2\left( \vec f \cdot \vec\alpha \right)\left( \vec f \cdot \vec\beta \right) W_\varphi(\vec f) G^2(\vec f)
  \end{equation}
$W_\varphi(f)$ is the wavefront power spectrum density given by
\begin{equation}
  \label{eq:3}
  W_\varphi(f) = 0.0229 r_0^{-5/3} \left( f^2 + {1\over\mathcal L_0^2} \right)^{-11/6}.
\end{equation}
$G(\vec f)$ is the point spread function of a subaperture of the WFS,
\begin{equation}
  \label{eq:4}
  G(\vec f) = {\sin(\pi d f_x) \over \pi d f_x }{ \sin(\pi d f_y) \over \pi d f_y } .
\end{equation}

The covariance $C_{\vec \alpha \cdot\ \vec \beta}(\vec\rho)$ is derived from the Wiener--Khinchine theorem:
\begin{equation}
  \label{eq:5}
  C_{\vec \alpha \cdot\ \vec \beta}(\vec\rho) = \mathcal F^{-1} \left[ W_{\vec \alpha\cdot\vec \beta}(\vec f) \right] (\vec\rho)
\end{equation}
where $\mathcal F^{-1}$ stands for the inverse Fourier transform.

The covariance between $\varphi$ and $\vec \alpha$ is derived from the cross--power spectrum density given by:
  \begin{equation}
    \label{eq:6}
    W_{\varphi\vec \alpha}(\vec f) = -\iota\lambda\left( \vec f \cdot \vec\alpha \right) W_\varphi(\vec f) G(\vec f)
  \end{equation}
$W_\varphi(f)$ is the wavefront power spectrum density given by
\begin{equation}
  \label{eq:7}
  W_\varphi(f) = 0.0229 r_0^{-5/3} \left( f^2 + {1\over\mathcal L_0^2} \right)^{-11/6}.
\end{equation}
$G(\vec f)$ is the point spread function of a subaperture of the WFS,
\begin{equation}
  \label{eq:8}
  G(\vec f) = {\sin(\pi d f_x) \over \pi d f_x }{ \sin(\pi d f_y) \over \pi d f_y } .
\end{equation}

The covariance $C_{\varphi \vec \alpha}(\vec\rho)$ is derived from the Wiener--Khinchine theorem:
\begin{equation}
  \label{eq:9}
  C_{\varphi \vec \alpha}(\vec\rho) = \mathcal F^{-1} \left[ W_{\varphi\vec \alpha}(\vec f) \right] (\vec\rho)
\end{equation}
where $\mathcal F^{-1}$ stands for the inverse Fourier transform.

\section{Parameters}
\label{sec:parameters}

The parameters of the [[aaStats]] structure are:
<<aaStats parameters>>=
int N, N2, NU, NU2, NF, NF2, psd_size, cov_size, ind_size;
float2 *d__psd;
float *d__cov, *d__alpha, *d__beta, n_full, n_comp, b_full, b_comp, cov_eval_et, sampling;
cufftHandle plan;
#if defined(AASTATS_DEBUG) || defined(PASTATS_DEBUG)
float2 *psd, *cov;
#endif
int N_SRC2;
@ The parameters of the [[paStats]] structure are:
<<paStats parameters>>=
int osf, M, shift;
int *M_LAYER;
<<aaStats parameters>>

@
\section{Angle--of--arrival covariance matrix functions}
\label{sec:aa-functions}

\index{aaStats!aaStats}
The covariance matrix is computed for a square lenslet array of size $[[N]]\times[[N]]$ and of pitch [[sampling]].
The covariance matrix depends on the atmosphere parameters and altitude profile.
The covariance matrix is computed for a vector of [[N_SRC]] sources.
The covariance matrix is written:
\begin{eqnarray}
  \label{eq:10}
  \Theta = \left[
    \begin{array}{ccc}
      \Theta_{11}&\cdots&\Theta_{1[[N_SRC]]} \\
      \vdots    &\ddots & \vdots \\
      \Theta_{[[N_SRC]]1} & \vdots & \Theta_{[[N_SRC]][[N_SRC]]}
    \end{array}
\right]
\end{eqnarray}
with
\begin{eqnarray}
  \label{eq:11}
  \Theta_{ij} = \left[
    \begin{array}{cc}
      \Theta_{{x_i}{x_j}} & \Theta_{{x_i}{y_j}} \\
      \Theta_{{y_i}{x_j}} & \Theta_{{y_i}{y_j}} 
    \end{array}
\right].
\end{eqnarray}
Each matrix $\Theta_{{x_i}{x_j}}$ is a two--level recursive block Toeplitz (2RBT) $[[N]]^2\times[[N]]^2$ matrix.
Toeplitz matrices \cite{GolubVanLoan} are fully defined with their first row and column. 
So the matrix $\Theta_{{x_i}{x_j}}$ is reduced to a $[[NU]]=(2[[N]]-1)^2$ elements vector $t_\Theta^{{x_i}{x_j}}$ and the matrix $\Theta_{ij}$ becomes the matrix
\begin{equation}
  \label{eq:12}
  T_{ij} = \left[ t_\Theta^{{x_i}{x_j}} t_\Theta^{{x_i}{y_j}} t_\Theta^{{y_i}{x_j}} t_\Theta^{{y_i}{y_j}} \right].
\end{equation}
Finally the Toeplitz compressed version of the matrix $\Theta$ is written:
\begin{equation}
  \label{eq:13}
  T = \left[ t_\Theta^{{x_1}{x_1}} t_\Theta^{{x_1}{y_1}} \cdots t_\Theta^{{x_1}{x_[[N_SRC]]}} t_\Theta^{{x_1}{y_[[N_SRC]]}} 
  \cdots t_\Theta^{{y_[[N_SRC]]}{x_1}} t_\Theta^{{y_[[N_SRC]]}{y_1}} \cdots t_\Theta^{{y_[[N_SRC]]}{x_[[N_SRC]]}} t_\Theta^{{y_[[N_SRC]]}{y_[[N_SRC]]}} \right].
\end{equation}

@
\subsection{Setup \& Cleanup}
\label{sec:setup--cleanup}

\index{aaStats!aaStats!setup}
<<aaStats setup>>=
void aaStats::setup(int N_, const atmosphere *atm, float lenslet_pitch, 
		    const source *src, int N_SRC)
{
  N_SRC2 = N_SRC*N_SRC;
  int BATCH = 1;
  <<aaStats setup declarations>>
  psd_size = sizeof(float2)*NF2;
  <<aaStats setup allocations>>
  stopwatch tid;
  tid.tic();
  <<aaStats evaluate covariance>>
  tid.toc(&cov_eval_et,"AA covariance evaluation");
  info(kappa, sampling);
}

@ The parameters are initialized in the setup function:
<<aaStats setup declarations>>=
float alpha[2] = {1,0}, beta[2] = {0,1}, kappa; 
N = N_;
N2 = N*N;
NU = 2*N-1;
NU2 = NU*NU;
NF = 4096;
NF2 = NF*NF;
kappa = 4;
sampling = lenslet_pitch;

psd_size = sizeof(float2)*NF2*4;
cov_size = sizeof(float)*NU2*4*N_SRC2;
ind_size = sizeof(int)*N2;
@ where the memory is also allocated:
<<aaStats setup allocations>>=
HANDLE_ERROR( cudaMalloc((void**)&d__psd, psd_size ) );
HANDLE_ERROR( cudaMalloc((void**)&d__alpha, 2*sizeof(float) ) );
HANDLE_ERROR( cudaMalloc((void**)&d__beta,  2*sizeof(float) ) );
HANDLE_ERROR( cudaMemcpy( d__alpha, alpha, 2*sizeof(float), 
			  cudaMemcpyHostToDevice) );
HANDLE_ERROR( cudaMemcpy( d__beta, beta, 2*sizeof(float), 
			  cudaMemcpyHostToDevice) );
HANDLE_ERROR( cudaMalloc((void**)&d__cov, cov_size ) );

INFO("@(CEO)>aaStats: Creating 2D covariance FFT plan\n");
int n_DFT[2] = {NF, NF};
int iodist = NF2;
/* Create a 2D FFT plan. */
HANDLE_ERROR_CUFFT(cufftPlanMany(&plan, 2, n_DFT,
				 NULL, 1, iodist,
				 NULL, 1, iodist,
				 CUFFT_C2C,BATCH), 
		   "Unable to create plan");
/* HANDLE_ERROR_CUFFT(cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE), */
/* 		   "Unable to set compatibility mode to native"); */
@  
The covariance is derived from Eq.~(\ref{eq:5}) for 2 sources in the directions $\vec\theta_1$ and $\vec\theta_2$ propagated through a [[N_LAYER]] atmospheric profile.
It is given by
\begin{equation}
  \label{eq:14}
  C_{\vec \alpha \cdot\ \vec \beta}^{12}(\vec\rho) = \sum_{l=1}^{[[N]]} C_{\vec \alpha \cdot\ \vec \beta}^l(\vec\rho_l)
\end{equation}
where
\begin{equation}
  \label{eq:15}
  \vec \rho_l = \gamma_l\vec\rho + h_l(\vec\theta_2-\vec\theta_1)
\end{equation}
with $\gamma_l=1-h_l/z_\ast$, $h_l$ is the atmosphere layer altitude and $z_\ast$ is the sources height.
Invoking the Wiener--Khinchine theorem, the translation and scaling properties of the Fourier transform and using Eq.~(\ref{eq:2}), the covariance for a single atmospheric layer at altitude $h_l$ is written:
\begin{equation}
  \label{eq:16}
  C_{\vec \alpha \cdot\ \vec \beta}^{12,l}(\vec\rho_l) = \mathcal F^{-1} \left[ \lambda^2 \left( \vec f \cdot \vec\alpha \right)\left( \vec f \cdot \vec\beta \right) W_\varphi(\vec f) G^2(\gamma_l\vec f) \exp\left( 2\iota\pi h_l\vec f \cdot (\vec\theta_2-\vec\theta_1) \right)  \right] (\gamma_l\vec\rho)
\end{equation}
Setting
\begin{equation}
  \label{eq:17}
  W_{\vec \alpha\cdot\vec \beta}^{l,12}(\vec f) = \lambda^2 \left( \vec f \cdot \vec\alpha \right)\left( \vec f \cdot \vec\beta \right) W_\varphi(\vec f) G^2(\gamma_l\vec f) \exp\left( 2\iota\pi h_l\vec f \cdot (\vec\theta_2-\vec\theta_1) \right),
\end{equation}
Eq.~(\ref{eq:14}) is now given by
\begin{equation}
  \label{eq:18}
  C_{\vec \alpha \cdot\ \vec \beta}^{12}(\vec\rho) = \sum_{l=1}^{[[N]]} \mathcal F^{-1} \left[ W_{\vec \alpha\cdot\vec \beta}^{l,12}(\vec f) \right] (\gamma_l\vec\rho).
\end{equation}
Calling $\delta=d/\kappa$ the sampling step of the vector $\vec\rho$ with the integer $\kappa\geq 1$, $\vec\rho_l$ is given by
\begin{equation}
  \label{eq:19}
  \vec \rho_l = \gamma_l\delta(k\vec x + l\vec y),\quad\forall l,k\in \left[0,\cdots,[[NF]]-1\right],
\end{equation}
$\vec x$ and $\vec y$ are the unit vectors of the X and Y axis, respectively.
Eq.~(\ref{eq:18}) becomes
\begin{equation}
  \label{eq:20}
   C_{\vec \alpha \cdot\ \vec \beta}^{12}(k,l) = \sum_{l=1}^{[[N]]} \iint {\mathrm d^2}\vec f W_{\vec \alpha\cdot\vec \beta}^{l,12}(\vec f) \exp\left( 2\iota\pi \gamma_l \delta (k f_x + l f_y) \right)
\end{equation}
Discretizing Eq.~(\ref{eq:20}) with
\begin{equation}
  \label{eq:21}
  \gamma_l\delta f_x = n/[[NF]],\quad \gamma_l\delta f_y = m/[[NF]],\quad \forall n,m\in \left[0,\cdots,[[NF]]-1\right],
\end{equation}
Eq.~(\ref{eq:20}) is transformed into 
\begin{eqnarray}
  \label{eq:22}
   C_{\vec \alpha \cdot\ \vec \beta}^{12}(k,l) &=& \sum_{l=1}^{[[N]]} \sum_{n=0}^{[[NF]]-1} \sum_{m=0}^{[[NF]]-1} \left( 1\over\gamma_l\delta [[NF]]\right)^2 W_{\vec \alpha\cdot\vec \beta}^{l,12}\left({n\over\gamma_l\delta [[NF]]},{m\over\gamma_l\delta [[NF]]}\right) \exp\left( 2\iota\pi {(k n + l m)\over [[NF]]} \right)  \\
&=& \sum_{n=0}^{[[NF]]-1} \sum_{m=0}^{[[NF]]-1} W_{\vec \alpha\cdot\vec \beta}^{12}(n,m) \exp\left( 2\iota\pi {(k n + l m)\over [[NF]]} \right)
\end{eqnarray}
with
\begin{equation}
  \label{eq:23}
  W_{\vec \alpha\cdot\vec \beta}^{12}(n,m) = \sum_{l=1}^{[[N]]} \left( 1\over\gamma_l\delta [[NF]]\right)^2 W_{\vec \alpha\cdot\vec \beta}^{l,12}\left({n\over\gamma_l\delta [[NF]]},{m\over\gamma_l\delta [[NF]]}\right)
\end{equation}

<<aaStats evaluate covariance>>=
dim3 blockDimPsd(16,16);
dim3 gridDimPsd( 1+NF/16 , 1+NF/16 );
dim3 blockDimCov(16,16);
dim3 gridDimCov( 1+NU/16 , 1+NU/16 );

int i_SRC, j_SRC, offset0, offset;
INFO("Covariance %dx%d block:\n",N_SRC,N_SRC);
for (i_SRC=0;i_SRC<N_SRC;i_SRC++) {

  offset0 = 4*N_SRC*i_SRC;

  for (j_SRC=0;j_SRC<N_SRC;j_SRC++) {

    INFO(" [%d,%d]",i_SRC,j_SRC);

    offset = offset0 + 2*j_SRC;
    //INFO("(%2d",offset);
    aaPowerSpectrum LLL gridDimPsd,blockDimPsd RRR (
                    d__psd, NF, lenslet_pitch, 
                    atm->wavelength, atm->r0, atm->turbulence.L0,
                    atm->d__layers, atm->N_LAYER,
		    (src->dev_ptr) + i_SRC, (src->dev_ptr) + j_SRC,
                    d__alpha, d__alpha, kappa);
    <<AA covariance with Wiener-Khinchin>>
        float2 *b;
    b = (float2*)malloc(psd_size);
    HANDLE_ERROR( cudaMemcpy( b, d__psd, psd_size, cudaMemcpyDeviceToHost) );
    /*
    for (int i=0;i<100;i++) {
      INFO("|(%3d) [%+6.6g;%+6.6g]\n",i,b[i].x,b[i].y);
    }
    */
    free(b);
    <<AA covariance reduction>>

    offset = offset0 + (2*j_SRC+1);
    //INFO(",%2d",offset);
    aaPowerSpectrum LLL gridDimPsd,blockDimPsd RRR (
                    d__psd, NF, lenslet_pitch, 
                    atm->wavelength, atm->r0, atm->turbulence.L0,
                    atm->d__layers, atm->N_LAYER,
		    (src->dev_ptr) + i_SRC, (src->dev_ptr) + j_SRC,
                    d__alpha, d__beta, kappa);
    <<AA covariance with Wiener-Khinchin>>
    <<AA covariance reduction>>

    offset = offset0 + 2*(N_SRC+j_SRC);
    //INFO(",%2d",offset);
    aaPowerSpectrum LLL gridDimPsd,blockDimPsd RRR (
                    d__psd, NF, lenslet_pitch, 
                    atm->wavelength, atm->r0, atm->turbulence.L0,
                    atm->d__layers, atm->N_LAYER,
		    (src->dev_ptr) + i_SRC, (src->dev_ptr) + j_SRC,
                    d__beta, d__alpha, kappa);
    <<AA covariance with Wiener-Khinchin>>
    <<AA covariance reduction>>

    offset = offset0 + (2*(N_SRC+j_SRC)+1);
    //INFO(",%2d)",offset);
    aaPowerSpectrum LLL gridDimPsd,blockDimPsd RRR (
                    d__psd, NF, lenslet_pitch, 
                    atm->wavelength, atm->r0, atm->turbulence.L0,
                    atm->d__layers, atm->N_LAYER,
		    (src->dev_ptr) + i_SRC, (src->dev_ptr) + j_SRC,
                    d__beta, d__beta, kappa);
    <<AA covariance with Wiener-Khinchin>>
    <<AA covariance reduction>>
	}
  INFO("\n");
 }

@ 
The Wiener-Khinchin theorem is applied on the power spectrum with the following:
<<AA covariance with Wiener-Khinchin>>=
HANDLE_ERROR_CUFFT(cufftExecC2C(plan, d__psd, d__psd, CUFFT_FORWARD),
		   "Unable to execute plan");
//HANDLE_ERROR(cudaThreadSynchronize());
@ 
The power spectrum, given by Eq.~(\ref{eq:23}) and Eq.~(\ref{eq:17}), must be sampled such as its Fourier transform gives unbiased values of the covariance.
The largest spatial frequency is $f_{max}=\kappa/2\gamma_l\delta$ with $\kappa\ge 1$.
The numyber of sample is given by $[[N_F]]\ge [[N_SIDE_LENSLET]]$.
The spatial frequencies are given by
\begin{eqnarray}
  \label{eq:24}
  f_{x,y} &=& (i,j){2\over [[NF]] }{\kappa\over 2\gamma_l\delta}, (i,j) \in \left[ 0,\dots,[[NF]]/2-1\right] \\
  f_{x,y} &=& (i,j){2\over [[NF]] }{\kappa\over 2\gamma_l\delta}, (i,j) \in \left[ [[NF]]/2,\dots,[[NF]]-1\right] 
\end{eqnarray}
Note that [[NF]] must be even.
<<aaStats power spectrum kernel>>=
__global__ void aaPowerSpectrum(float2 *d__psd, int NF, float d,
				float wavelength, float r0, float L0,
                                layer *turb, int N_LAYER,
				source *src1, source *src2,
				float *alpha, float *beta, float kappa)
{
  int i, j, k;
  float fx, fy, f_square, fs, gl, red, c_red, s_red;
  i = blockIdx.x * blockDim.x + threadIdx.x;
  j = blockIdx.y * blockDim.y + threadIdx.y;
  k = i*NF + j;
  if ( (i<NF) && (j<(NF)) ) {
    k = i*NF + j;
    wavelength *= wavelength;
    d__psd[k].x = 0;
    d__psd[k].y = 0;
    for (int k_LAYER=0;k_LAYER<N_LAYER;k_LAYER++) {
      gl = (1-turb[k_LAYER].altitude/src1->height);
      red = 2*PI*turb[k_LAYER].altitude;
      fs = (2.0/NF)*(kappa/(2*d*gl));
      fx = (float) i;
      fy = (float) j;
      if (i>=(NF/2))
	fx = fx - NF;
      if (j>=(NF/2))
      fy = fy - NF;
      fx *= fs;
      fy *= fs;
      f_square = fx*fx + fy*fy;
      fs *= fs;
      red *= fx*(src2->theta_x - src1->theta_x) + fy*(src2->theta_y - src1->theta_y);
      sincosf( red, &s_red, &c_red);
      red = (k==0) ? 0 : turb[k_LAYER].xi0*
	fs*wavelength*(fx*alpha[0] + fy*alpha[1])*(fx*beta[0] + fy*beta[1])*
	powf(r0,-5.0/3.0)*
	phasePowerSpectrum(f_square, 1.0/L0)*
	gatePSF(gl*fx*d,gl*fy*d);
      d__psd[k].x += red*c_red;
      d__psd[k].y += red*s_red;
    }
  }
}
@ 
The covariance derived from the power spectrum is sampled every $\delta=d/\kappa$.
The covariance derived from the slopes of the lenslet array is sampled every $d$.
Consequently the lenslet--array--slope--covariance is extracted from the power--spectrum--covariance at $(i\kappa,j\kappa)$.
Due to the symmetry of the Fourier transform output, the subscripts in the power--spectrum--covariance are in fact given by
\begin{eqnarray}
  \label{eq:25}
  (i,j)\kappa, (i,j) \in \left[ 0,\dots,[[N]]-1 \right] \\
  \left[(i,j)-[[NU]]\right]\kappa + [[NF]], (i,j) \in \left[ [[N]],\dots,[[NU]]-1 \right]
\end{eqnarray}
The lenslet--array--slope--covariance is shifted such as the baseline coordinate $(0,0$) is a the center of the array i.e.
\begin{equation}
  \label{eq:26}
  (i,j) \leftarrow \left[ (i,j) + (2[[N]]-2)/2 \right] \mod ( 2[[N]]-1 )
\end{equation}
<<AA covariance reduction>>=
covariance_extraction LLL gridDimCov,blockDimCov RRR( d__cov + offset*NU2, NU,
						      d__psd, NF, kappa);
@ The extraction of the covariance is done with the kernel:
<<covariance extraction>>=
__global__ void covariance_extraction(float *cov_out, int NC_out, 
				      float2 *cov_in, int NC_in, float kappa)
{
  int i, j, k_out, k_in, h;
  i = blockIdx.x * blockDim.x + threadIdx.x;
  j = blockIdx.y * blockDim.y + threadIdx.y;
  if ( (i<NC_out) && (j<(NC_out)) ) {
    h = (NC_out-1)/2;
    k_out = (NC_out - 1 - (i+h)%NC_out)*NC_out + (NC_out - 1 - (j+h)%NC_out);
    if (i>=((NC_out+1)/2))
      i = kappa*(i - NC_out) + NC_in;
    else
      i *= kappa;
    if (j>=(NC_out+1)/2)
      j = kappa*(j - NC_out) + NC_in;
    else
      j *= kappa;
    k_in = i*NC_in + j;      
    cov_out[k_out] = cov_in[k_in].x;
  }
}

@ The memory is freed with the following routine: 
\index{aaStats!aaStats!cleanup}
<<aaStats cleanup>>=
void aaStats::cleanup(void)
{
INFO("@(CEO)>aaStats: freeing memory!\n");
<<cleanup contents>>
}
<<cleanup contents>>=
cufftDestroy(plan);
HANDLE_ERROR( cudaFree( d__psd ) );
HANDLE_ERROR( cudaFree( d__alpha ) );
HANDLE_ERROR( cudaFree( d__beta ) );
HANDLE_ERROR( cudaFree( d__cov ) );
#ifdef AASTATS_DEBUG
free(psd);		
free(cov);		
#endif		

@
\subsection{Input/Output}
\label{sec:inputoutput}

The main parameters of [[aaStats]] are displayed with the [[info]] routine:
\index{aaStats!aaStats!info}
<<aaStats info>>=
void aaStats::info(int kappa, float d)
{
  INFO("\n\x1B[1;42m@(CEO)>aaStats:\x1B[;42m\n");
  <<info content>>
  INFO("----------------------------------------------------\x1B[0m\n");
}
<<info content>>=
INFO(" . sampling [m]                   : %.2f\n",d);
INFO(" . size of covariance matrix block: %dX%d\n",NU,NU);
INFO(" . spectrum resolution            : %d\n",NF);
INFO(" . covariance oversampling factor : %d\n",kappa);
INFO(" . spectrum highest frequency     : %4.2E\n",(2.0/NF)*(kappa/(2*d)));

@ The covariance can be written to a file with:
\index{aaStats!aaStats!toFile}
<<save aa covariance to file>>=
void aaStats::toFile(char *filename)
{
  dev2file(filename, d__cov, cov_size/sizeof(float));
}

@ 
\subsection{Miscellaneous}
\label{sec:miscellaneous}

From the power spectrum, once can derive the variance:
\index{aaStats!aaStats!variance}
<<aaStats variance>>=
float aaStats::variance(void) {
  float res;
  HANDLE_ERROR( cudaMemcpy( &res, d__cov + (N-1)*(NU+1),
			    sizeof(float), cudaMemcpyDeviceToHost) );
  return res;
}

@ 
\section{Phase and Angle--of--arrival covariance matrix functions}
\label{sec:pa-functions}

\index{aaStats!paStats}
The covariance matrix is defined for a square lenslet array of size $N_L\times N_L$ and of pitch [[sampling]].
%The covariance sampling is set with the over--sampling factor [[osf]]. 
The covariance matrix depends on the atmosphere parameters and altitude profile.
The covariance matrix is computed for vectors of [[N_P_SRC]] estimation sources and [[N_S_SRC]] guide star sources.
The covariance matrix is written:
\begin{eqnarray}
  \label{eq:27}
  \Xi = \left[
    \begin{array}{ccc}
      \Xi_{11}&\cdots&\Xi_{1[[N__S_SRC]]} \\
      \vdots    &\ddots & \vdots \\
      \Xi_{[[N_P_SRC]]1} & \vdots & \Xi_{[[N_P_SRC]][[N_S_SRC]]}
    \end{array}
\right]
\end{eqnarray}
with
\begin{eqnarray}
  \label{eq:28}
  \Xi_{ij} = \left[
    \begin{array}{cc}
      \Xi_{{\phi_i}{x_j}} & \Xi_{{\phi_i}{y_j}}
    \end{array}
\right].
\end{eqnarray}
Each matrix $\Xi_{{\phi_i}{x_j}}$ is a two--level recursive block Toeplitz (2RBT) $[[M]]^2\times[[N]]^2$ matrix.% with $[[N]]=N_L$ if $[[osf]]=1$ or $[[N]]=[[osf]] \times N_L+1$ if $[[osf]]\ge 1$.
Toeplitz matrices \cite{GolubVanLoan} are fully defined with their first row and column. 
So the matrix $\Xi_{{phi_i}{x_j}}$ is reduced to a $[[NU]]=([[M]][[N]]-1)^2$ elements vector $t_\Xi^{{\phi_i}{x_j}}$ and the matrix $\Xi_{ij}$ becomes the matrix
\begin{equation}
  \label{eq:29}
  T_{ij} = \left[ t_\Xi^{{\phi_i}{x_j}} t_\Xi^{{\phi_i}{y_j}} \right].
\end{equation}
Finally the Toeplitz compressed version of the matrix $\Xi$ is written:
\begin{equation}
  \label{eq:30}
  T = \left[ t_\Xi^{{\phi_1}{x_1}} t_\Xi^{{\phi_1}{y_1}} \cdots t_\Xi^{{\phi_1}{x_[[N_S_SRC]]}} t_\Xi^{{\phi_1}{y_[[N_S_SRC]]}} 
  \cdots t_\Xi^{{\phi_[[N_P_SRC]]}{x_1}} t_\Xi^{{\phi_[[N_P_SRC]]}{y_1}} \cdots t_\Xi^{{\phi_[[N_P_SRC]]}{x_[[N_S_SRC]]}} t_\Xi^{{\phi_[[N_P_SRC]]}{y_[[N_S_SRC]]}} \right].
\end{equation}

@
\subsection{Setup \& Cleanup}
\label{sec:setup--cleanup-1}

\index{aaStats!paStats!setup}
<<paStats setup>>=
void paStats::setup(int M_, int N_, int osf_, 
                    const atmosphere *atm, float lenslet_pitch, 
                    source *phase_src_in, int N_P_SRC, 
                    const source *slopes_src, int N_S_SRC)
{
  N_SRC2 = N_P_SRC*N_S_SRC;
  float alpha[2] = {1,0}, beta[2] = {0,1}, kappa; 
  osf = osf_;
  N = N_;
  M = M_;
  shift = (N==M) ? 0 : 1;  
  N2 = M*N;
  NU = M+N-1;
  NU2 = NU*NU;
  NF = 4096;
  NF2 = NF*NF;
  kappa = 4;
  sampling = lenslet_pitch/osf;
  source *phase_src;

  if (phase_src_in[0].height==slopes_src[0].height) {

    phase_src = phase_src_in;

    M_LAYER = (int *)malloc(sizeof(int));
    M_LAYER[0] = M; 

    psd_size = sizeof(float2)*NF2;
    cov_size = sizeof(float)*NU2*2*N_SRC2;
    ind_size = sizeof(int)*N2;
    int BATCH = 1;

    <<paStats setup allocation>>

    stopwatch tid;
    tid.tic();
    <<paStats power spectrum with sources (same heights,rect.)>>
    tid.toc(&cov_eval_et,"PA covariance evaluation");

  } else {
     <<paStats setup (multilayers)>>
  }
  info(kappa,sampling);
}
<<paStats setup>>=
void paStats::setup(int M_, int N_, int osf_, 
                    const atmosphere *atm, float lenslet_pitch, 
                    const source *slopes_src, int N_S_SRC,
                    float z_radius)
{
 int N_P_SRC = 1;
  N_SRC2 = N_P_SRC*N_S_SRC;
  float alpha[2] = {1,0}, beta[2] = {0,1}, kappa; 
  osf = osf_;
  N = N_;
  M = M_;
  shift = (N==M) ? 0 : 1;  
  N2 = M*N;
  NU = M+N-1;
  NU2 = NU*NU;
  NF = 4096;
  NF2 = NF*NF;
  kappa = 4;
  sampling = lenslet_pitch/osf;

    M_LAYER = (int *)malloc(sizeof(int));
    M_LAYER[0] = M; 

    psd_size = sizeof(float2)*NF2;
    cov_size = sizeof(float)*NU2*2*N_SRC2;
    ind_size = sizeof(int)*N2;
    int BATCH = 1;

    <<paStats setup allocation>>

    stopwatch tid;
    tid.tic();
    <<paStats polar average power spectrum with sources (same heights,rect.)>>
    tid.toc(&cov_eval_et,"PA covariance evaluation");

  info(kappa,sampling);
}
@ The phase source is duplicated and set to the same height than the slope source:
<<phase source duplicate>>=
float *zenith, *azimuth;
zenith  = (float*)malloc(sizeof(float)*N_P_SRC); 
azimuth = (float*)malloc(sizeof(float)*N_P_SRC); 
source __src;
for (int k_P_SRC=0;k_P_SRC<N_P_SRC;k_P_SRC++) {
  HANDLE_ERROR( cudaMemcpy( &__src, phase_src_in->dev_ptr + k_P_SRC,
                             sizeof(source), cudaMemcpyDeviceToHost ) );
  zenith[k_P_SRC]  = __src.zenith;
  azimuth[k_P_SRC] = __src.azimuth;
}
source clone_phase_src;
clone_phase_src.setup(phase_src_in[0].photometric_band,
		      zenith, azimuth, 
		      slopes_src[0].height,N_P_SRC,0);
free(zenith);
free(azimuth);
@ The covariance is computed for each layers independently in the following:    
<<paStats setup (multilayers)>>=
/*
source clone_phase_src;
clone_phase_src.setup(phase_src_in[0].photometric_band,
		      phase_src_in[0].zenith,
		      phase_src_in[0].azimuth,
		      slopes_src[0].height,0);
*/
<<phase source duplicate>>
phase_src = &clone_phase_src;

float g;
int o;
NU2 = 0;
M_LAYER = (int *)malloc(sizeof(int)*atm->N_LAYER);
for (int k_LAYER=0;k_LAYER<atm->N_LAYER;k_LAYER++) {
  g = 1 - atm->turbulence.altitude[k_LAYER]/slopes_src[0].height;
  o = (int) ceilf( osf*0.5*N*(1-g)/g );
  M_LAYER[k_LAYER] = M + o*2;
  NU = M_LAYER[k_LAYER]+N-1;
  NU2 += NU*NU;     
  //      printf("@(CEO)>paStats: layer #%d - NU=%d\n",k_LAYER,NU);
 }

psd_size = sizeof(float2)*NF2;
cov_size = sizeof(float)*NU2*2*N_SRC2;

int BATCH = 1;

<<paStats setup allocation>>
	
stopwatch tid;
tid.tic();
<<paStats power spectrum with sources (same heights,rect.,multilayers)>>
  tid.toc(&cov_eval_et,"PA covariance evaluation");

clone_phase_src.cleanup();

<<covariance sampling>>=
int covariance_sampling(float h, float z, int osf, int M, int N) {
  float g;
  int o, M_LAYER, NU;
  g = 1 - h/z;
  o = (int) ceilf( osf*0.5*N_SIDE_LENSLET*(1-g)/g );
  M_LAYER = M + o*2;
  NU = M_LAYER+N-1;
  return NU*NU;      
}
@ Memory is allocated with:
<<paStats setup allocation>>=
HANDLE_ERROR( cudaMalloc((void**)&d__psd, psd_size ) );
HANDLE_ERROR( cudaMalloc((void**)&d__alpha, 2*sizeof(float) ) );
HANDLE_ERROR( cudaMalloc((void**)&d__beta,  2*sizeof(float) ) );
HANDLE_ERROR( cudaMemcpy( d__alpha, alpha, 2*sizeof(float), cudaMemcpyHostToDevice) );
HANDLE_ERROR( cudaMemcpy( d__beta, beta, 2*sizeof(float), cudaMemcpyHostToDevice) );
HANDLE_ERROR( cudaMalloc((void**)&d__cov, cov_size ) );

INFO("\n@(CEO)>paStats: Creating 2D covariance FFT plan\n");
int n_DFT[2] = {NF, NF};
int iodist = NF2;
/* Create a 2D FFT plan. */
HANDLE_ERROR_CUFFT( cufftPlanMany(&plan, 2, n_DFT,
                                  NULL, 1, iodist,
                                  NULL, 1, iodist,
                                  CUFFT_C2C,BATCH),
                                  "Unable to create plan" );
/* HANDLE_ERROR_CUFFT( cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE), */
/*                     "Unable to set compatibility mode to native" ); */
@  
The covariance is derived from Eq.~(\ref{eq:5}) for 2 sources in the directions $\vec\theta_1$ and $\vec\theta_2$ propagated through a [[N_LAYER]] atmospheric profile.
It is given by
\begin{equation}
  \label{eq:31}
  C_{\varphi \vec \alpha}^{12}(\vec\rho) = \sum_{l=1}^{[[N]]} C_{\varphi \vec \alpha}^l(\vec\rho_l)
\end{equation}
where
\begin{equation}
  \label{eq:32}
  \vec \rho_l = \gamma_l\vec\rho + h_l(\vec\theta_2-\vec\theta_1)
\end{equation}
with $\gamma_l=1-h_l/z_\ast$, $h_l$ is the atmosphere layer altitude and $z_\ast$ is the sources height.
Invoking the Wiener--Khinchine theorem, the translation and scaling properties of the Fourier transform and using Eq.~(\ref{eq:2}), the covariance for a single atmospheric layer at altitude $h_l$ is written:
\begin{equation}
  \label{eq:33}
  C_{\varphi \vec \alpha}^{12,l}(\vec\rho_l) = \mathcal F^{-1} \left[ -\iota\lambda\left( \vec f \cdot \vec\alpha \right) W_\varphi(\vec f) G(\gamma_l \vec f) \exp\left( 2\iota\pi h_l\vec f \cdot (\vec\theta_2-\vec\theta_1) \right)  \right] (\gamma_l\vec\rho)
\end{equation}
Setting
\begin{equation}
  \label{eq:34}
  W_{\varphi \vec \alpha}^{l,12}(\vec f) = -\iota\lambda\left( \vec f \cdot \vec\alpha \right) W_\varphi(\vec f) G(\gamma_l \vec f) \exp\left( 2\iota\pi h_l\vec f \cdot (\vec\theta_2-\vec\theta_1) \right),
\end{equation}
Eq.~(\ref{eq:31}) is now given by
\begin{equation}
  \label{eq:35}
  C_{\varphi \vec \alpha}^{12}(\vec\rho) = \sum_{l=1}^{[[N]]} \mathcal F^{-1} \left[ W_{\varphi \vec \alpha}^{l,12}(\vec f) \right] (\gamma_l\vec\rho).
\end{equation}
Calling $\delta=d/\kappa$ the sampling step of the vector $\vec\rho$ with the integer $\kappa\geq 1$, $\vec\rho_l$ is given by
\begin{equation}
  \label{eq:36}
  \vec \rho_l = \gamma_l\delta(k\vec x + l\vec y),\quad\forall l,k\in \left[0,\cdots,[[NF]]-1\right],
\end{equation}
$\vec x$ and $\vec y$ are the unit vectors of the X and Y axis, respectively.
Eq.~(\ref{eq:35}) becomes
\begin{equation}
  \label{eq:37}
   C_{\varphi \vec \alpha}^{12}(k,l) = \sum_{l=1}^{[[N]]} \iint {\mathrm d^2}\vec f W_{\varphi \vec \alpha}^{l,12}(\vec f) \exp\left( 2\iota\pi \gamma_l \delta (k f_x + l f_y) \right)
\end{equation}
Discretizing Eq.~(\ref{eq:37}) with
\begin{equation}
  \label{eq:38}
  \gamma_l\delta f_x = n/[[NF]],\quad \gamma_l\delta f_y = m/[[NF]],\quad \forall n,m\in \left[0,\cdots,[[NF]]-1\right],
\end{equation}
Eq.~(\ref{eq:37}) is transformed into 
\begin{eqnarray}
  \label{eq:39}
   C_{\varphi \vec \alpha}^{12}(k,l) &=& \sum_{l=1}^{[[N]]} \sum_{n=0}^{[[NF]]-1} \sum_{m=0}^{[[NF]]-1} \left( 1\over\gamma_l\delta [[NF]]\right)^2 W_{\varphi \vec \alpha}^{l,12}\left({n\over\gamma_l\delta [[NF]]},{m\over\gamma_l\delta [[NF]]}\right) \exp\left( 2\iota\pi {(k n + l m)\over [[NF]]} \right)  \\
&=& \sum_{n=0}^{[[NF]]-1} \sum_{m=0}^{[[NF]]-1} W_{\varphi \vec \alpha}^{12}(n,m) \exp\left( 2\iota\pi {(k n + l m)\over [[NF]]} \right)
\end{eqnarray}
with
\begin{equation}
  \label{eq:40}
  W_{\varphi \vec \alpha}^{12}(n,m) = \sum_{l=1}^{[[N]]} \left( 1\over\gamma_l\delta [[NF]]\right)^2 W_{\varphi \vec \alpha}^{l,12}\left({n\over\gamma_l\delta [[NF]]},{m\over\gamma_l\delta [[NF]]}\right)
\end{equation}
 
%% Rectangular 2RBT
<<paStats power spectrum with sources (same heights,rect.)>>=
dim3 blockDimPsd(16,16);
dim3 gridDimPsd( 1+NF/16 , 1+NF/16 );
dim3 blockDimCov(16,16);
dim3 gridDimCov( 1+NU/16 , 1+NU/16 );

int i_P_SRC, j_S_SRC, offset0, offset;
INFO("Covariance %dx%d block:\n",N_P_SRC,N_S_SRC);
for (i_P_SRC=0;i_P_SRC<N_P_SRC;i_P_SRC++) {

  offset0 = 2*N_S_SRC*i_P_SRC;

  for (j_S_SRC=0;j_S_SRC<N_S_SRC;j_S_SRC++) {

    INFO(" [%d,%d]",i_P_SRC,j_S_SRC);

    offset = offset0 + 2*j_S_SRC;
    //INFO("(%2d",offset);
    paPowerSpectrum LLL gridDimPsd,blockDimPsd RRR (d__psd, NF, osf, lenslet_pitch, 
						    atm->wavelength, atm->r0, 
						    atm->turbulence.L0, atm->d__layers, atm->N_LAYER,
						    (phase_src->dev_ptr) + i_P_SRC, 
						    (slopes_src->dev_ptr) + j_S_SRC,
						    d__alpha, kappa, shift);
    <<AA covariance with Wiener-Khinchin>>
    <<PA covariance reduction>>

    offset = offset0 + (2*j_S_SRC+1);
    //INFO(",%2d)",offset);
    paPowerSpectrum LLL gridDimPsd,blockDimPsd RRR (d__psd, NF, osf, lenslet_pitch,
						    atm->wavelength, atm->r0, 
						    atm->turbulence.L0, atm->d__layers, atm->N_LAYER,
						    (phase_src->dev_ptr) + i_P_SRC, 
						    (slopes_src->dev_ptr) + j_S_SRC,
						    d__beta, kappa, shift);
    <<AA covariance with Wiener-Khinchin>>
    <<PA covariance reduction>>
}
INFO("\n");
}
@
The power spectrum of the average of the covariance over a continous sample of sources within a circle of angular radius $\zeta$ is given by:
\begin{equation}
  \label{eq:45}
  W_{\varphi \vec \alpha}^{l,\zeta 2}(\vec f) = {1\over \pi\zeta^2} \int_0^\zeta {\mathrm d}z z \int_0^{2\pi}{\mathrm d} \theta -\iota\lambda\left( \vec f \cdot \vec\alpha \right) W_\varphi(\vec f) G(\gamma_l \vec f) \exp\left( 2\iota\pi h_l\vec f \cdot (\vec\theta_2-\vec\theta) \right),  
\end{equation}
Performing the integration leads to:
\begin{equation}
  \label{eq:45}
  W_{\varphi \vec \alpha}^{l,\zeta 2}(\vec f) = -\iota\lambda\left( \vec f \cdot \vec\alpha \right) W_\varphi(\vec f) G(\gamma_l \vec f) {2J_1(2\pi h_lf\zeta)\over 2\pi h_lf\zeta} \exp\left( 2\iota\pi h_l\vec f \cdot \vec\theta_2 \right),  
\end{equation}
<<paStats polar average power spectrum with sources (same heights,rect.)>>=
dim3 blockDimPsd(16,16);
dim3 gridDimPsd( 1+NF/16 , 1+NF/16 );
dim3 blockDimCov(16,16);
dim3 gridDimCov( 1+NU/16 , 1+NU/16 );

int i_P_SRC, j_S_SRC, offset0, offset;
INFO("Polar average covariance %dx%d block:\n",N_P_SRC,N_S_SRC);
for (i_P_SRC=0;i_P_SRC<N_P_SRC;i_P_SRC++) {

  offset0 = 2*N_S_SRC*i_P_SRC;

  for (j_S_SRC=0;j_S_SRC<N_S_SRC;j_S_SRC++) {

    INFO(" [%d,%d]",i_P_SRC,j_S_SRC);

    offset = offset0 + 2*j_S_SRC;
    //INFO("(%2d",offset);
    paPolarPowerSpectrum LLL gridDimPsd,blockDimPsd RRR (d__psd, NF, osf, lenslet_pitch, 
                                                         atm->wavelength, atm->r0, 
                                                         atm->turbulence.L0, atm->d__layers, atm->N_LAYER,
                                                         (slopes_src->dev_ptr) + j_S_SRC,
                                                         d__alpha, kappa, shift, z_radius);
    <<AA covariance with Wiener-Khinchin>>
    <<PA covariance reduction>>

    offset = offset0 + (2*j_S_SRC+1);
    //INFO(",%2d)",offset);
    paPolarPowerSpectrum LLL gridDimPsd,blockDimPsd RRR (d__psd, NF, osf, lenslet_pitch,
                                                         atm->wavelength, atm->r0, 
                                                         atm->turbulence.L0, atm->d__layers, atm->N_LAYER,
                                                         (slopes_src->dev_ptr) + j_S_SRC,
                                                         d__beta, kappa, shift, z_radius);
    <<AA covariance with Wiener-Khinchin>>
    <<PA covariance reduction>>
}
INFO("\n");
}
@ The power spectrum is computed with the following kernels:
<<paStats power spectrum kernel>>=
__global__ void paPowerSpectrum(float2 *d__psd, int NF, int osf, float d,
                                float wavelength, float r0, float L0,
                                layer *turb, int N_LAYER,
                                source *src1, source *src2,
                                float *alpha, float kappa, int shift)
{
  int i, j, k;
  float fx, fy, f_square, fs, gl, red, c_red, s_red, wavenumber, src_x, src_y;
  i = blockIdx.x * blockDim.x + threadIdx.x;
  j = blockIdx.y * blockDim.y + threadIdx.y;
  k = i*NF + j;
  if ( (i<NF) && (j<(NF)) ) {
    wavenumber = wavelength*0.5/PI;
    k = i*NF + j;
    d__psd[k].x = 0;
    d__psd[k].y = 0;
     for (int k_LAYER=0;k_LAYER<N_LAYER;k_LAYER++) {
      gl = (1-turb[k_LAYER].altitude/src1->height);
      red = 2*PI;
      src_x = src2->theta_x - src1->theta_x;
      src_x *= turb[k_LAYER].altitude;
      src_y = src2->theta_y - src1->theta_y;
      src_y *= turb[k_LAYER].altitude;
      fs = osf*(2.0/NF)*(kappa/(2*d*gl));
      fx = (float) i;
      fy = (float) j;
      if (i>=(NF/2))
        fx = fx - NF;
      if (j>=(NF/2))
        fy = fy - NF;
      fx *= fs;
      fy *= fs;
      f_square = fx*fx + fy*fy;
      fs *= fs;
      red *= fx*(src_x + 0.5*d*shift) + fy*(src_y + 0.5*d*shift);
      sincosf( red, &s_red, &c_red);
      red = -wavenumber*turb[k_LAYER].xi0*fs*wavelength*(fx*alpha[0] + fy*alpha[1])*
      powf(r0,-5.0/3.0)*
      phasePowerSpectrum(f_square, 1.0/L0)*
	gateAmp(gl*fx*d,gl*fy*d);
      d__psd[k].x += -red*s_red;
      d__psd[k].y +=  red*c_red;
    }
  }
}
@
The average of the power spectrum on an angular disc of radius $[z_radius]$ is
<<paStats polar average power spectrum kernel>>=
__global__ void paPolarPowerSpectrum(float2 *d__psd, int NF, int osf, float d,
                                     float wavelength, float r0, float L0,
                                     layer *turb, int N_LAYER,
                                     source *src2,
                                     float *alpha, float kappa, int shift,
                                     float z_radius)
{
  int i, j, k;
  float fx, fy, f_square, f, fs, gl, red, c_red, s_red, wavenumber, src_x, src_y, b;
  i = blockIdx.x * blockDim.x + threadIdx.x;
  j = blockIdx.y * blockDim.y + threadIdx.y;
  k = i*NF + j;
  if ( (i<NF) && (j<(NF)) ) {
    wavenumber = wavelength*0.5/PI;
    k = i*NF + j;
    d__psd[k].x = 0;
    d__psd[k].y = 0;
     for (int k_LAYER=0;k_LAYER<N_LAYER;k_LAYER++) {
       gl = 1.0;
      red = 2*PI;
      src_x = src2->theta_x;
      src_x *= turb[k_LAYER].altitude;
      src_y = src2->theta_y;
      src_y *= turb[k_LAYER].altitude;
      fs = osf*(2.0/NF)*(kappa/(2*d*gl));
      fx = (float) i;
      fy = (float) j;
      if (i>=(NF/2))
        fx = fx - NF;
      if (j>=(NF/2))
        fy = fy - NF;
      fx *= fs;
      fy *= fs;
      f = hypot(fx,fy);
      f_square = f*f;
      fs *= fs;
      b = 2*turb[k_LAYER].altitude*f*z_radius;
      red *= fx*(src_x + 0.5*d*shift) + fy*(src_y + 0.5*d*shift);
      sincosf( red, &s_red, &c_red);
      red = -wavenumber*turb[k_LAYER].xi0*fs*wavelength*(fx*alpha[0] + fy*alpha[1])*
        powf(r0,-5.0/3.0)*
        phasePowerSpectrum(f_square, 1.0/L0)*
        gateAmp(gl*fx*d,gl*fy*d)*
        somb(b);
      d__psd[k].x += -red*s_red;
      d__psd[k].y +=  red*c_red;
    }
  }
}
@
<<PA covariance reduction>>=
pa_covariance_extraction LLL gridDimCov,blockDimCov RRR( d__cov + offset*NU2, NU,
							 d__psd, NF, kappa);
@ The extraction of the covariance is done with the kernel:
<<PA covariance extraction>>=
__global__ void pa_covariance_extraction(float *cov_out, int NC_out, 
				      float2 *cov_in, int NC_in, float kappa)
{
  int i, j, k_out, k_in, h;
  i = blockIdx.x * blockDim.x + threadIdx.x;
  j = blockIdx.y * blockDim.y + threadIdx.y;
  if ( (i<NC_out) && (j<(NC_out)) ) {
    h = (NC_out-1)/2;
    k_out = (NC_out - 1 - (i+h)%NC_out)*NC_out + (NC_out - 1 - (j+h)%NC_out);
    //    k_out = ((i+h)%NC_out)*NC_out + ((j+h)%NC_out);
    if (i>(NC_out/2))
      i = kappa*(i - NC_out) + NC_in;
    else
      i *= kappa;
    if (j>(NC_out/2))
      j = kappa*(j - NC_out) + NC_in;
    else
      j *= kappa;
    k_in = i*NC_in + j;      
    cov_out[k_out] = cov_in[k_in].x;
  }
}

@ 
%% Rectangular 2RBT
<<paStats power spectrum with sources (same heights,rect.,multilayers)>>=
dim3 blockDimPsd(16,16);
dim3 gridDimPsd( 1+NF/16 , 1+NF/16 );
dim3 blockDimCov(16,16);
dim3 gridDimCov( 1+NU/16 , 1+NU/16 );

int i_P_SRC, j_S_SRC, k_LAYER;
INFO("Covariance %dx%d block:\n",N_P_SRC*atm->N_LAYER,N_S_SRC);
int CNU2 = 0;
for (i_P_SRC=0;i_P_SRC<N_P_SRC;i_P_SRC++) {

  for (k_LAYER=0;k_LAYER<atm->N_LAYER;k_LAYER++) {

    NU = M_LAYER[k_LAYER]+N-1;

    for (j_S_SRC=0;j_S_SRC<N_S_SRC;j_S_SRC++) {

      INFO(" [%d,%d]",i_P_SRC+k_LAYER*N_P_SRC,j_S_SRC);

      //INFO("(%2d",CNU2);
      paPowerSpectrum LLL gridDimPsd,blockDimPsd RRR (
			       d__psd, NF, osf, lenslet_pitch, 
			       atm->wavelength, atm->r0, atm->turbulence.L0, 
			       atm->d__layers, atm->N_LAYER, k_LAYER,
			       (phase_src->dev_ptr) + i_P_SRC, 
			       (slopes_src->dev_ptr) + j_S_SRC,
			       d__alpha, kappa,shift);
      <<AA covariance with Wiener-Khinchin>>
      pa_covariance_extraction LLL gridDimCov,blockDimCov RRR( d__cov + CNU2, NU,
							       d__psd, NF, kappa);
      CNU2 += NU*NU;      

      //INFO(",%2d)",CNU2);
      paPowerSpectrum LLL gridDimPsd,blockDimPsd RRR (
                               d__psd, NF, osf, lenslet_pitch,
			       atm->wavelength, atm->r0, atm->turbulence.L0, 
			       atm->d__layers, atm->N_LAYER, k_LAYER,
			       (phase_src->dev_ptr) + i_P_SRC, 
			       (slopes_src->dev_ptr) + j_S_SRC,
			       d__beta, kappa,shift);
      <<AA covariance with Wiener-Khinchin>>
      pa_covariance_extraction LLL gridDimCov,blockDimCov RRR( d__cov +  CNU2, NU,
							       d__psd, NF, kappa);
      CNU2 += NU*NU;      
    }
  INFO("\n");
  }
 }
<<paStats power spectrum kernel (single layer)>>=
__global__ void paPowerSpectrum(float2 *d__psd, int NF, int osf, float d,
				float wavelength, float r0, float L0,
				layer *turb, int N_LAYER, int k_LAYER,
				source *src1, source *src2,
				float *alpha, float kappa, int shift)
{
  int i, j, k;
  float fx, fy, f_square, fs, gl, red, c_red, s_red, wavenumber, src_x, src_y;
  i = blockIdx.x * blockDim.x + threadIdx.x;
  j = blockIdx.y * blockDim.y + threadIdx.y;
  k = i*NF + j;
  if ( (i<NF) && (j<(NF)) ) {
    wavenumber = wavelength*0.5/PI;
    k = i*NF + j;
    d__psd[k].x = 0;
    d__psd[k].y = 0;

      gl = (1-turb[k_LAYER].altitude/src1->height);
      red = 2*PI;
      src_x = src2->theta_x - src1->theta_x;
      src_x *= turb[k_LAYER].altitude;
      src_y = src2->theta_y - src1->theta_y;
      src_y *= turb[k_LAYER].altitude;
      fs = osf*(2.0/NF)*(kappa/(2*d*gl));
      fx = (float) i;
      fy = (float) j;
      if (i>=(NF/2))
	fx = fx - NF;
      if (j>=(NF/2))
	fy = fy - NF;
      fx *= fs;
      fy *= fs;
      f_square = fx*fx + fy*fy;
      fs *= fs;
      red *= fx*(src_x + 0.5*d*shift) + fy*(src_y + 0.5*d*shift);
      sincosf( red, &s_red, &c_red);
      red = -wavenumber*turb[k_LAYER].xi0*fs*wavelength*(fx*alpha[0] + fy*alpha[1])*
      powf(r0,-5.0/3.0)*
      phasePowerSpectrum(f_square, 1.0/L0)*
	gateAmp(gl*fx*d,gl*fy*d);
      d__psd[k].x += -red*s_red;
      d__psd[k].y +=  red*c_red;

  }
}
@ 
The memory is freed with the following routine:
\index{aaStats!paStats!cleanup}
<<paStats cleanup>>=
void paStats::cleanup(void)
{
INFO("@(CEO)>paStats: freeing memory!\n");
free(M_LAYER);
<<cleanup contents>>
}
@
\subsection{Input/Output}
\label{sec:inputoutput-1}

The main parameters of [[paStats]] are displayed with the info routine: 
\index{aaStats!paStats!info}
<<paStats info>>=
void paStats::info(int kappa, float d)
{
  INFO("\n\x1B[1;42m@(CEO)>paStats:\x1B[;42m\n");
  <<info content>>
  INFO(" . phase oversampling factor      : %d\n",osf);
  INFO("----------------------------------------------------\x1B[0m\n");
}
@
The [[paStats]] covariance can be written to a file with:
\index{aaStats!paStats!toFile}
<<save pa covariance to file>>=
void paStats::toFile(char *filename)
{
  dev2file(filename, d__cov, cov_size/sizeof(float));
}

@
\subsection{Miscellaneous}
\label{sec:miscellaneous-1}

The phase power spectrum is given by:
<<phase power spectrum>>=
 __device__ float phasePowerSpectrum(float f_square, float f0) {
  return 0.022895587108555*powf(f_square + f0*f0,-11.0/6.0);
}
@ 
The subaperture Fourier transform is given by
\begin{equation}
  \label{eq:41}
  {\mathrm sinc}(f_x){\mathrm sinc}(f_y)
\end{equation}
and written in
<<subaperture FT>>=
__device__ float gateAmp(float fx, float fy) {
  float out;
  out = sinc(fx)*sinc(fy);
  return out;
}
@ 
The subaperture PSF is given by
\begin{equation}
  \label{eq:42}
  {\mathrm sinc}^2(f_x){\mathrm sinc}^2(f_y)
\end{equation}
and written in
<<subaperture PSF>>=
__device__ float gatePSF(float fx, float fy) {
  float out;
  out = sinc(fx)*sinc(fy);
  return out*out;
}
@ that depends on the $\mathrm sinc$ function:
\begin{equation}
  \label{eq:43}
  {\sin(\pi x)\over \pi x}
\end{equation}
<<sinc>>=
__device__ float sinc(float x) {
  return (x==0) ? 1.0 : sinf( PI*x) / (PI*x) ; 
}
@
The Sombrero function is defined as:
\begin{equation}
  \label{eq:44}
  {\mathrm somb}(x) = {2J_1(x)\over x}
\end{equation}
<<somb>>=
__device__ float somb(float x) {
  return (x==0) ? 1.0 : 2 * j1(PI*x) / (PI*x);
}

@
\section{Tests}
\label{sec:tests}

The test routine is written:
<<aaStats.bin>>=

#include <iostream>
#include <fstream>
#include <iomanip>
using namespace std;

#include "cublas_v2.h"
#include "h"
#include "aaStats.h"
#include "BTBT.h"
using namespace std;
#define N_RUN 5
#define N_SAMPLE 100

__global__ void fill(float *A, int m_a, int n_a) {
  int i, j, k;
  float n;
  i = blockIdx.x * blockDim.x + threadIdx.x;
  j = blockIdx.y * blockDim.y + threadIdx.y;
  k = i*m_a +j;
  n = (float) m_a*n_a;
  if (k<n)
    A[k] = ((float)k)/n;
}

int main( void) {
  atmosphere atm;
  source src;
  aaStats aa;
  paStats pa;
  BTBT C, S;
  int k, data_size, M, N, NX, NY, NA, MA, k_N, k_SAMPLE, osf;
  float altitude[] = {0},
        xi0[] = {1},
        wind_speed[] = {10},
  	wind_direction[] = {0};
  float *x, *y, *d__x, *d__y, *d__A;
  stopwatch tid;
  cublasHandle_t handle;
  cublasCreate(&handle);
  cublasStatus_t status;

  atm.setup(0.15,30,altitude,xi0,wind_speed,wind_direction);
  src.setup(0,0,INFINITY);

  int N_[N_RUN] = {20,40,64,84,150};
  float aa_full_mvm_et[N_RUN], pa_full_mvm_et[N_RUN], aa_comp_mvm_et[N_RUN], pa_comp_mvm_et[N_RUN], 
    aa_comp_eval_et[N_RUN], pa_comp_eval_et[N_RUN];

  dim3 blockDim(16,16);
  dim3 gridDim;
  float alpha, beta;
  alpha = 1;
  beta  = 0;
  osf = 1;

  std::ofstream aa_fid, pa_fid;
  aa_fid.open ("aa_runtime.txt");
  aa_fid.setf(ios::fixed, ios::floatfield);
  aa_fid.precision(3);
  aa_fid << "\nAASTATS:\n";
  aa_fid << " N   FULL MVM   COMP MVM   COMP EVAL\n";

  pa_fid.open ("pa_runtime.txt");
  pa_fid.setf(ios::fixed, ios::floatfield);
  pa_fid.precision(3);
  pa_fid << "\nPASTATS:\n";
  pa_fid << " N   FULL MVM   COMP MVM   COMP EVAL\n";

  for (k_SAMPLE=0;k_SAMPLE<N_SAMPLE;k_SAMPLE++) {

  //AASTATS TESTS
  for (k_N=0;k_N<N_RUN;k_N++) {

    printf("\n_______ AASTATS TESTS: SAMPLE #%d - RUN #%d _______\n",k_SAMPLE,k_N);

    N = N_[k_N];
    NX = N*N*2;
    data_size = sizeof(float)*NX;

    x = (float *)malloc( data_size );
    y = (float *)malloc( data_size );
    HANDLE_ERROR( cudaMalloc((void**)&d__x, data_size ) );
    HANDLE_ERROR( cudaMalloc((void**)&d__y, data_size ) );

    for (k=0;k<NX;k++) {
      x[k] = (float)k;
    }
    HANDLE_ERROR( cudaMemcpy( d__x, x, data_size, cudaMemcpyHostToDevice) );

    aa.setup(N,&atm,0.1,&src,1);
    C.setup(2,2,N,aa.d__cov);
    aa_comp_eval_et[k_N] = aa.cov_eval_et;
    
    //printf("\n AA VARIANCE: %.4E rd^2\n",aa.variance(0.1, &atm, 4, 2048));

    tid.tic();
    C.MVM(d__y,d__x);
    tid.toc(aa_comp_mvm_et + k_N,"Covariance MVM");
    HANDLE_ERROR( cudaMemcpy( y, d__y, data_size, cudaMemcpyDeviceToHost) );
    /* printf("\nMVM:\n"); */
    /* for (k=0;k<NX;k++) { */
    /*   printf("(%2d) %4.2E\n",k,y[k]); */
    /* } */
    FILE *aa_fid;
    char filename[100];
    sprintf(filename,"aa_mvm%03d.bin",N);
    aa_fid = fopen(filename,"wb");
    fwrite(y,sizeof(float),NX,aa_fid);
    fclose(aa_fid);

    if (N<=100) {
      M = 2*N*N;
      data_size = sizeof(float)*M*M;
      HANDLE_ERROR( cudaMalloc((void**)&d__A, data_size ) );
      gridDim = dim3(1+M/16,1+M/16);
      tid.tic();
      fill LLL gridDim,blockDim RRR (d__A, M, M);
      tid.toc("Filling of A");
      tid.tic();
      status = cublasSgemv(handle, CUBLAS_OP_N, M, M, &alpha, d__A, M, d__x, 1, &beta, d__y, 1);
      tid.toc(aa_full_mvm_et + k_N,"A MVM");
      if (status!=CUBLAS_STATUS_SUCCESS)
    	printf(">>> ERROR! CUBLAS FAILED! <<<\n");
      HANDLE_ERROR( cudaFree( d__A ) );
    }  else aa_full_mvm_et[k_N]=0;

    aa.cleanup();
    C.cleanup();
    HANDLE_ERROR( cudaFree( d__x ) );
    HANDLE_ERROR( cudaFree( d__y ) );
    free(x);
    free(y);
  }
  for (k_N=0;k_N<N_RUN;k_N++) {
    aa_fid << setw(3) << N_[k_N] << "   " << setw(6) << aa_full_mvm_et[k_N] << "     " << aa_comp_mvm_et[k_N] << "     " << aa_comp_eval_et[k_N] << "\n";
  }

  //PASTATS TESTS
  for (k_N=0;k_N<N_RUN;k_N++) {

    printf("\n_______ PASTATS TESTS: SAMPLE #%d - RUN #%d _______\n",k_SAMPLE,k_N);

    //    N = 2*N_[k_N]+1;
    NX = N_[k_N];
    NX *= NX*2;
    NY = N_[k_N] + 1;
    NY *= NY;
    data_size = sizeof(float);

    x = (float *)malloc( data_size*NX );
    y = (float *)malloc( data_size*NY );
    HANDLE_ERROR( cudaMalloc((void**)&d__x, data_size*NX ) );
    HANDLE_ERROR( cudaMalloc((void**)&d__y, data_size*NY ) );

    for (k=0;k<NX;k++) {
      x[k] = (float)k;
    }
    HANDLE_ERROR( cudaMemcpy( d__x, x, data_size*NX, cudaMemcpyHostToDevice) );
    pa.setup(N_[k_N]+1,N_[k_N],osf,&atm,0.1,&src,1,&src,1);
    S.setup(1,2,N_[k_N]+1,N_[k_N],pa.d__cov);
    pa_comp_eval_et[k_N] = pa.cov_eval_et;

    tid.tic();
    S.MVM(d__y,d__x);
    tid.toc(pa_comp_mvm_et + k_N,"Covariance MVM");
    HANDLE_ERROR( cudaMemcpy( y, d__y, data_size*NY, cudaMemcpyDeviceToHost) );
    /* printf("\nMVM:\n"); */
    /* for (k=0;k<NY;k++) { */
    /*   printf("(%3d) %+6.4E\n",k,y[k]); */
    /* } */
    FILE *aa_fid;
    char filename[100];
    sprintf(filename,"pa_mvm%03d.bin",N);
    aa_fid = fopen(filename,"wb");
    fwrite(y,sizeof(float),NY,aa_fid);
    fclose(aa_fid);

    if (N_[k_N]<=100) {
      MA = NY;
      NA = NX;
      data_size = sizeof(float)*MA*NA;
      HANDLE_ERROR( cudaMalloc((void**)&d__A, data_size ) );
      gridDim = dim3(1+MA/16,1+NA/16);
      tid.tic();
      fill LLL gridDim,blockDim RRR (d__A, MA , NA);
      tid.toc("Filling of A");
      tid.tic();
      status = cublasSgemv(handle, CUBLAS_OP_N, MA, NA, &alpha, d__A, MA, d__x, 1, &beta, d__y, 1);
      tid.toc(pa_full_mvm_et + k_N,"A MVM");
      if (status!=CUBLAS_STATUS_SUCCESS)
  	printf(">>> ERROR! CUBLAS FAILED! <<<\n");
      HANDLE_ERROR( cudaFree( d__A ) );
    } else pa_full_mvm_et[k_N]=0;

    pa.cleanup();
    S.cleanup();
    HANDLE_ERROR( cudaFree( d__x ) );
    HANDLE_ERROR( cudaFree( d__y ) );
    free(x);
    free(y);
  }
  for (k_N=0;k_N<N_RUN;k_N++) {
     pa_fid << setw(3) << N_[k_N] << "   " << setw(6) << pa_full_mvm_et[k_N] << "     " << pa_comp_mvm_et[k_N] << "     " << pa_comp_eval_et[k_N] << "\n";
  }

  } // SAMPLE LOOP

  atm.cleanup();
  cublasDestroy(handle);
  aa_fid.close();
  pa_fid.close();
}

@  The following is the Matlab script to test  the [[MVM_kernel]]:
<<MVM kernel test>>=
kern = parallel.gpu.CUDAKernel('aaStats.ptx','aaStats.cu','MVM_kernel');
kern.ThreadBlockSize = [16,16];
kern.GridSize = [2,2];
d1 = 2/3;
d2 = 2/3;
NU = 7;
N1 = 4;
N2 = 4;
g1 = 1;
g2 = 1;
[x,y] = meshgrid( linspace(-1,1,4) );
z = x(:) + 1i*y(:);
c = abs( bsxfun( @minus, z , z.') );
idx = [13 9 5 1 2 3 4];
cs = mat2cell( c , ones(1,4)*4,ones(1,4)*4);
cov = cell2mat(cellfun( @(x) x(idx)', cs(idx) , 'UniformOutput', false));
vout = gpuArray.zeros(16^2,1,'single');
vin = gpuArray.ones(16^2,1,'single');
sampling = 2/3;