#ifndef __UTILITIES_H__
#define __UTILITIES_H__
#include <sys/stat.h>
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <curand.h>
#include <curand_kernel.h>
#include <cufft.h>
#include "cublas_v2.h"
#include "cusparse_v2.h"
#include <cusolverDn.h>
#define PI 3.141592653589793
#define RADIAN2ARCSEC (180*3600/PI)
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define CUBLAS_VERBOSE 0
#define RAND_SEED 2013
#define N_THREAD 16
#define N_THREAD2 256
#define SQRT_DBL_MAX 1.3407807929942596e+154
#define DBL_EPSILON 2.2204460492503131e-16
static void Error( const char *errMsg,
const char *file,
int line ) {
fprintf(stderr,"\n\x1B[31m@(CEO)>ERROR: %s in %s at line %d\x1B[0m\n",
errMsg, file, line );
exit( EXIT_FAILURE );
}
#define ERROR( errMsg ) (Error( errMsg, __FILE__, __LINE__ ))
#ifdef SILENT
#define INFO( infoMsg, args... );
#else
#define INFO( infoMsg , args...) fprintf(stdout,infoMsg, ##args);
#endif
static void HandleError( cudaError_t err,
const char *file,
int line ) {
if (err != cudaSuccess) {
fprintf(stderr,"\n\x1B[31m@(CEO)>ERROR: %s in %s at line %d\x1B[0m\n",
cudaGetErrorString( err ), file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
static void HandleErrorCUFFT( cufftResult_t err,
const char *msg,
const char *file,
int line ) {
if (err != CUFFT_SUCCESS) {
fprintf(stderr,"\n\x1B[31m@(CEO)>CUFFT ERROR: %s in %s at line %d\x1B[0m\n",
msg, file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR_CUFFT( err, msg ) (HandleErrorCUFFT( err, msg, __FILE__, __LINE__ ))
static void HandleErrorCUSPARSE( cusparseStatus_t err,
const char *msg,
const char *file,
int line ) {
if (err != CUSPARSE_STATUS_SUCCESS) {
fprintf(stderr,"\n\x1B[31m@(CEO)>CUSPARSE ERROR: %s in %s at line %d\x1B[0m\n",
msg, file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR_CUSPARSE( err, msg ) (HandleErrorCUSPARSE( err, msg, __FILE__, __LINE__ ))
static void HandleErrorCURAND( curandStatus_t err,
const char *msg,
const char *file,
int line ) {
if (err != CURAND_STATUS_SUCCESS) {
fprintf(stderr,"\n\x1B[31m@(CEO)>CURAND ERROR: %s in %s at line %d\x1B[0m\n",
msg, file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR_CUSOLVER( err ) (HandleErrorCUSOLVER( err, __FILE__, __LINE__ ))
static void HandleErrorCUSOLVER( cusolverStatus_t err,
const char *file,
int line ) {
if (err != CUSOLVER_STATUS_SUCCESS) {
if (err==CUSOLVER_STATUS_NOT_INITIALIZED)
fprintf(stderr,"\n\x1B[31m@(CEO)>CUSOLVER ERROR: %s in %s at line %d:.\x1B[0m\n",
"the library was not initialized", file, line );
else if (err==CUSOLVER_STATUS_INVALID_VALUE)
fprintf(stderr,"\n\x1B[31m@(CEO)>CUSOLVER ERROR: %s in %s at line %d\x1B[0m\n",
"invalid parameters were passed (m,n<0 or lda<max(1,m))", file, line );
else if (err==CUSOLVER_STATUS_ARCH_MISMATCH)
fprintf(stderr,"\n\x1B[31m@(CEO)>CUSOLVER ERROR: %s in %s at line %d\x1B[0m\n",
"the device only supports compute capability 2.0 and above", file, line );
else
fprintf(stderr,"\n\x1B[31m@(CEO)>CURAND ERROR: %s in %s at line %d\x1B[0m\n",
"an internal operation failed.", file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR_CURAND( err, msg ) (HandleErrorCURAND( err, msg, __FILE__, __LINE__ ))
inline float sign(float x) {
return (float) ( (x > 0) - (x < 0) );
}
int round_up_to_nhp2(int n);
void set_device(int id);
#define ARCSEC(a) (a*PI/180/3600)
typedef double rtd;
struct vector{
rtd x;
rtd y;
rtd z;
__host__ __device__ vector(rtd x=0.0, rtd y=0.0, rtd z=0.0)
: x(x), y(y), z(z)
{
}
__host__ __device__ rtd rho2(void);
__host__ __device__ rtd rho2_shift(rtd x0, rtd y0);
__host__ __device__ rtd mag(void);
__host__ __device__ rtd mag(rtd R);
__host__ __device__ rtd angle(void);
__host__ __device__ rtd norm(void);
__host__ __device__ rtd dot(vector *u);
__host__ __device__ void unit(void);
__host__ __device__ void left_cross(vector *w, vector *u);
__host__ __device__ void right_cross(vector *w, vector *u);
__host__ __device__ vector& operator=(const vector& v){
x = v.x;
y = v.y;
z = v.z;
return *this;
}
__host__ __device__ vector& operator-=(const vector& rhs){
x -= rhs.x;
y -= rhs.y;
z -= rhs.z;
return *this;
}
__host__ __device__ vector& operator+=(const vector& rhs){
x += rhs.x;
y += rhs.y;
z += rhs.z;
return *this;
}
__host__ __device__ vector operator-(const vector& rhs) const
{
return vector(x-rhs.x,y-rhs.y,z-rhs.z);
}
__host__ __device__ vector operator+(const vector& rhs) const
{
return vector(x+rhs.x,y+rhs.y,z+rhs.z);
}
__host__ __device__ rtd operator*(const vector& rhs) const
{
return x*rhs.x + y*rhs.y + z*rhs.z;
}
__host__ __device__ vector operator*(const rtd& rhs) const
{
return vector(x*rhs,y*rhs,z*rhs);
}
};
__device__ inline int lenslet2array(int i_px_lenslet, int j_px_lenslet, int n_px_lenslet,
int i_lenslet, int j_lenslet, int n_lenslet, int i_source)
{
int index;
index = i_lenslet*n_px_lenslet + i_px_lenslet;
index += i_source*n_px_lenslet*n_lenslet;
index *= n_lenslet*n_px_lenslet;
index += j_lenslet*n_px_lenslet + j_px_lenslet;
return index;
}
__device__ inline int lenslet2array_overlap(int i_px_lenslet, int j_px_lenslet, int n_px_lenslet,
int i_lenslet, int j_lenslet, int n_lenslet, int i_source)
{
int index;
index = i_lenslet*(n_px_lenslet-1) + i_px_lenslet;
index += i_source*((n_px_lenslet-1)*n_lenslet+1);
index *= n_lenslet*(n_px_lenslet-1)+1;
index += j_lenslet*(n_px_lenslet-1) + j_px_lenslet;
return index;
}
__device__ inline int lenslet2row(int i_px_lenslet, int j_px_lenslet, int n_px_lenslet,
int i_lenslet, int j_lenslet, int n_lenslet,
int i_source)
{
int index;
index = i_lenslet*n_lenslet + j_lenslet;
index += i_source*n_lenslet*n_lenslet;
index *= n_px_lenslet;
index += i_px_lenslet;
index *= n_px_lenslet;
index += j_px_lenslet;
return index;
}
__device__ inline char threads2lenslet(dim3 threadIdx, dim3 blockIdx, dim3 blockDim,
int *i_px_lenslet, int *j_px_lenslet, int n_px_lenslet,
int *i_lenslet, int *j_lenslet, int n_lenslet)
{
*i_px_lenslet = threadIdx.x + blockIdx.x * blockDim.x;
*j_px_lenslet = threadIdx.y + blockIdx.y * blockDim.y;
*i_lenslet = *i_px_lenslet/n_px_lenslet;
if (*i_lenslet>=n_lenslet)
return 0;
*j_lenslet = *j_px_lenslet/n_px_lenslet;
if (*j_lenslet>=n_lenslet)
return 0;
*i_px_lenslet -= *i_lenslet*n_px_lenslet;
*j_px_lenslet -= *j_lenslet*n_px_lenslet;
return 1;
}
#ifdef SILENT
struct stopwatch{
float elapsedTime;
void tic(void) { elapsedTime=0.0; }
void toc(void) {}
void toc(const char *message) {}
void toc(float *elapsedTime, const char *message) {}
void toc(float *elapsedTime) {}
};
#else
struct stopwatch{
float elapsedTime;
cudaEvent_t start, stop;
void tic(void) {
HANDLE_ERROR( cudaEventCreate( &start ) );
HANDLE_ERROR( cudaEventCreate( &stop ) );
HANDLE_ERROR( cudaEventRecord( start, 0 ) );
}
void toc(void) {
HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
HANDLE_ERROR( cudaEventSynchronize( stop ) );
HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
start, stop ) );
printf("\x1B[33mElapsed time: %8.2E ms\x1B[0m\n", elapsedTime );
HANDLE_ERROR( cudaEventDestroy( start ) );
HANDLE_ERROR( cudaEventDestroy( stop ) );
}
void toc(const char *message) {
HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
HANDLE_ERROR( cudaEventSynchronize( stop ) );
HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
start, stop ) );
printf("\x1B[33m%s: Elapsed time: %8.2E ms\x1B[0m\n", message, elapsedTime );
HANDLE_ERROR( cudaEventDestroy( start ) );
HANDLE_ERROR( cudaEventDestroy( stop ) );
}
void toc(float *elapsedTime, const char *message) {
HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
HANDLE_ERROR( cudaEventSynchronize( stop ) );
HANDLE_ERROR( cudaEventElapsedTime( elapsedTime,
start, stop ) );
printf("\x1B[33m%s: Elapsed time: %8.2E ms\x1B[0m\n", message, *elapsedTime );
HANDLE_ERROR( cudaEventDestroy( start ) );
HANDLE_ERROR( cudaEventDestroy( stop ) );
}
void toc(float *elapsedTime) {
HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
HANDLE_ERROR( cudaEventSynchronize( stop ) );
HANDLE_ERROR( cudaEventElapsedTime( elapsedTime,
start, stop ) );
HANDLE_ERROR( cudaEventDestroy( start ) );
HANDLE_ERROR( cudaEventDestroy( stop ) );
}
};
#endif
void CUBLAS_ERROR(cublasStatus_t status);
struct mask {
char *m;
float *f;
int *idx;
int size_px[2];
int nel;
float nnz;
float size_m[2];
float area;
float delta[2];
cublasHandle_t handle;
int *d__piston_mask;
void setup(int n);
void setup(int n, float L);
void setup(int n, float L, int i_s, int j_s, int n_out);
void setup(float n, float L, float i_0, float j_0,
float theta, float i_s, float j_s, int n_out );
void setup_circular(int n);
void setup_circular(int n, float D);
void setup_circular(int n, float D, float scale);
void setup_GMT(int n, float S);
void set_filter(void);
void set_filter_quiet(void);
void set_index(void);
void set_gmt_piston(float *phase, float *d__p);
void alter(float *filter);
void add(mask *other);
void add(char *other, int other_nel);
void add(mask *other, int offset);
void add(char *other, int other_nel, int offset);
void reset(void);
void cleanup(void);
};
struct stats {
cublasHandle_t handle;
cublasStatus_t status;
void setup(void);
void cleanup(void);
float mean(const float *data, int n_data);
float mean(const float *data, mask *M, int n_data);
float var(const float *data, int n_data);
float std(const float *data, int n_data);
float diff_var(const float *data_1, const float *data_2, int n_data);
float diff_std(const float *data_1, const float *data_2, int n_data);
float var(const float *data1, mask *M, int n_data);
float std(const float *data1, mask *M, int n_data);
float diff_var(const float *data1, const float *data2, mask *M, int n_data);
float diff_std(const float *data1, const float *data2, mask *M, int n_data);
};
__device__ inline int lenset2actuator(int iL, int jL, int NL, int io, int jo)
{
int iL_p, jL_p, iL_pp, jL_pp, iA, jA;
iL_p = 2*iL + 1;
jL_p = 2*jL + 1;
iL_pp = iL_p + io;
jL_pp = jL_p + jo;
iA = iL_pp/2;
jA = jL_pp/2;
return iA*(NL+1) + jA;
}
__global__ void fill_ones_char(char *ones, int n_data) ;
__global__ void circular_pupil(char *pupil, int N, float scale);
__global__ void square_pupil(char *pupil, int N, int N_S, int I_S, int J_S);
__global__ void rot_square_pupil(char *pupil, int N, float N_S,
float I_0, float J_0, float theta, float I_S, float J_S);
__global__ void GMT_pupil(char *pupil, int *piston_mask , const int N,
const float S, float Dsh, const float c, const char v);
__global__ void fried_geometry(char *lenslet_mask, char *actuator_mask,
int NL, int n, float threshold);
__global__ void fried_geometry_vs_pupil(char *lenslet_mask, char *actuator_mask,
const int NL, const int n,
const float threshold, const char *pupil);
__global__ void wavefront_finite_difference_kernel(float *sx, float *sy, int NL,
float *phi, int n, float d);
__global__ void wavefront_finite_difference_kernel_masked(
float *sx, float *sy, int NL,
float *phi, int n, float d,
char *M);
__global__ void wavefront_finite_difference_sparse_matrix_kernel(
float *csrValH,
int *csrColIndH,
int *csrRowPtrH,
const int NL, const int n, const float d);
__global__ void set_gmt_piston_kernel(float *phase, float *p,
char *m, int *piston_mask,
const int N);
__global__ void double2float_kern(float *d__single_data, double *d__double_data, const int N);
char fried_geometry_setup(char *lenslet_mask, char *actuator_mask,
int NL, int n, float threshold);
char fried_geometry_setup_vs_pupil(char *lenslet_mask, char *actuator_mask,
int NL, int n, float threshold, char *pupil);
void dev2file(const char* filename, float* d__data, int n_data);
void dev2file(const char* filename, float* d__data, int n_data,
int n_data_page, int n_page);
void dev2file(const char* filename, float* d__data, int n_data,
int *n_data_page, int n_page);
void dev2file(const char* filename, int* d__data, int n_data);
void dev2file(const char* filename, float2* d__data, int n_data);
void dev2file(const char* filename, char* d__data, int n_data);
double atmosphere_refractive_index(float wavelength, float altitude, float temperature, float humidity);
double atmospheric_dispersion(float wavelength, float delta_wavelength,
float zenith, float altitude,
float temperature, float humidity);
void dev2host( float *host_data, float *dev_data, int N);
void dev2host_int( int *host_data, int *dev_data, int N);
void host2dev( float *dev_data, float *host_data, int N);
void host2dev_char( char *dev_data, char *host_data, int N);
void freedev( float **dev_data );
struct gpu_float {
float *dev_data, *host_data, *d_tau;
int N, nb;
stats S;
cublasStatus_t stat;
cublasHandle_t handle;
cusolverDnHandle_t cusolverH;
void setup(void);
void setup(int N_T);
void dev_malloc(void);
void free_dev(void);
void dev2host(void);
void host2dev(void);
void reset(void);
void qr(int m);
void dev2dev(gpu_float *other);
void axpy(const gpu_float *x, const float alpha);
void scale(const float alpha);
void mv(gpu_float *y, const gpu_float *x);
void qr_solve(gpu_float *x, gpu_float *b);
};
struct gpu_double {
double *dev_data, *host_data;
int N, nb;
stats S;
cublasStatus_t stat;
cublasHandle_t handle;
cusolverDnHandle_t cusolverH;
void setup(void);
void setup(int N_T);
void dev_malloc(void);
void free_dev(void);
void dev2host(void);
void host2dev(void);
void reset(void);
void qr(int m);
void mv(gpu_float *y, const gpu_float *x);
};
void geqrf(float *d_tau, float *a, int m, int n);
void ormqr(float *b, int m, float *q, float *tau, int n);
struct sparseMatrix {
int n_row, n_col;
int nnz;
float *csrValH;
int *csrColIndH;
int *csrRowPtrH;
float alpha, beta;
cudaError_t cudaStat;
cusparseStatus_t status;
cusparseHandle_t handle;
cusparseMatDescr_t descr;
void setup(const int _n_row_, const int _n_col_, const int _nnz_);
void gradient( int NL, int N_LENSLET_PX, float d);
void interpolation(int NI, int NP);
void interpolation(int NI, int NP, mask *pupil,float i0, float j0);
void cleanup(void);
void MVM(float *y, float *x);
void add(sparseMatrix *C, sparseMatrix *B);
};
void wavefront_finite_difference(float *sx, float *sy, int NL,
float *phi, int n, float d,
int N_GS);
void wavefront_finite_difference(float *sx, float *sy, int NL,
float *phi, int n, float d,
mask *valid_lenset, int N_GS);
struct cheb_series_struct {
double * c;
int order;
double a;
double b;
int order_sp;
};
typedef struct cheb_series_struct cheb_series;
__host__ __device__ void
cheb_eval_e(const cheb_series * cs,
const double x,
double * result);
__host__ __device__ void
temme_gamma(const double nu, double * g_1pnu, double * g_1mnu, double * g1, double * g2);
__host__ __device__ void
K_scaled_temme(const double nu, const double x,
double * K_nu, double * K_nup1, double * Kp_nu);
__host__ __device__ void
K_scaled_steed_temme_CF2(const double nu, const double x,
double * K_nu, double * K_nup1,
double * Kp_nu);
__host__ __device__ double _K_nu_(const double nu, const double x);
__host__ __device__ int polywind(double Px, double Py, double *Vx, double *Vy, int NV);
void polywinds(int *W, double *Px, double *Py, int NP, double *Vx, double *Vy, int NV);
#endif