#ifdef _OPENMP
/*
 * Compress a contiguous 1D array of Scalar in parallel using OpenMP.
 *
 * The (nx + 3) / 4 blocks of four samples are split into chunks.  Each chunk
 * is encoded into its own bit stream, bs[chunk], through a private copy of
 * *stream, so threads never write to the same bit stream.  Nothing is
 * encoded if compress_init_par() returns NULL.
 */
static void
_t2(compress_omp, Scalar, 1)(zfp_stream* stream, const zfp_field* field)
{
  /* array metadata */
  const Scalar* data = (const Scalar*)field->data;
  size_t nx = field->nx;

  /* decomposition of the block range into per-thread chunks */
  uint threads = thread_count_omp(stream);
  size_t blocks = (nx + 3) / 4;
  size_t chunks = chunk_count_omp(stream, blocks, threads);
  int chunk; /* signed index; NOTE(review): presumably for OpenMP loop-form compatibility */

  /* one bit stream per chunk */
  bitstream** bs = compress_init_par(stream, field, chunks, blocks);
  if (!bs)
    return;

  #pragma omp parallel for num_threads(threads)
  for (chunk = 0; chunk < (int)chunks; chunk++) {
    /* half-open range [first, last) of blocks owned by this chunk */
    size_t first = chunk_offset(blocks, chunks, chunk + 0);
    size_t last = chunk_offset(blocks, chunks, chunk + 1);
    size_t block;
    /* private stream copy that writes to this chunk's bit stream */
    zfp_stream s = *stream;
    zfp_stream_set_bit_stream(&s, bs[chunk]);
    for (block = first; block < last; block++) {
      size_t x = 4 * block;       /* index of the block's first sample */
      const Scalar* p = data + x; /* address of the block's first sample */
      if (nx - x >= 4u)
        _t2(zfp_encode_block, Scalar, 1)(&s, p);
      else /* fewer than four samples remain: encode a partial block */
        _t2(zfp_encode_partial_block_strided, Scalar, 1)(&s, p, nx - x, 1);
    }
  }

  /* hand the per-chunk bit streams back for finalization */
  compress_finish_par(stream, bs, chunks);
}
/*
 * Compress a strided 1D array of Scalar in parallel using OpenMP.
 *
 * Consecutive samples are sx elements apart; a zero field->sx is treated as
 * a stride of 1.  The (nx + 3) / 4 blocks are split into chunks, and each
 * chunk is encoded into its own bit stream, bs[chunk], through a private
 * copy of *stream.  Nothing is encoded if compress_init_par() returns NULL.
 */
static void
_t2(compress_strided_omp, Scalar, 1)(zfp_stream* stream, const zfp_field* field)
{
  /* array metadata */
  const Scalar* data = (const Scalar*)field->data;
  size_t nx = field->nx;
  ptrdiff_t sx = field->sx ? field->sx : 1;

  /* decomposition of the block range into per-thread chunks */
  uint threads = thread_count_omp(stream);
  size_t blocks = (nx + 3) / 4;
  size_t chunks = chunk_count_omp(stream, blocks, threads);
  int chunk; /* signed index; NOTE(review): presumably for OpenMP loop-form compatibility */

  /* one bit stream per chunk */
  bitstream** bs = compress_init_par(stream, field, chunks, blocks);
  if (!bs)
    return;

  #pragma omp parallel for num_threads(threads)
  for (chunk = 0; chunk < (int)chunks; chunk++) {
    /* half-open range [first, last) of blocks owned by this chunk */
    size_t first = chunk_offset(blocks, chunks, chunk + 0);
    size_t last = chunk_offset(blocks, chunks, chunk + 1);
    size_t block;
    /* private stream copy that writes to this chunk's bit stream */
    zfp_stream s = *stream;
    zfp_stream_set_bit_stream(&s, bs[chunk]);
    for (block = first; block < last; block++) {
      size_t x = 4 * block;                          /* index of the block's first sample */
      const Scalar* p = data + sx * (ptrdiff_t)x;    /* strided address of that sample */
      if (nx - x >= 4u)
        _t2(zfp_encode_block_strided, Scalar, 1)(&s, p, sx);
      else /* fewer than four samples remain: encode a partial block */
        _t2(zfp_encode_partial_block_strided, Scalar, 1)(&s, p, nx - x, sx);
    }
  }

  /* hand the per-chunk bit streams back for finalization */
  compress_finish_par(stream, bs, chunks);
}
/*
 * Compress a strided 2D array of Scalar in parallel using OpenMP.
 *
 * Zero strides in the field default to a contiguous layout (sx = 1,
 * sy = nx).  The bx * by blocks of 4x4 samples are numbered with x varying
 * fastest and split into chunks; each chunk is encoded into its own bit
 * stream, bs[chunk], through a private copy of *stream.  Nothing is encoded
 * if compress_init_par() returns NULL.
 */
static void
_t2(compress_strided_omp, Scalar, 2)(zfp_stream* stream, const zfp_field* field)
{
  /* array metadata */
  const Scalar* data = (const Scalar*)field->data;
  size_t nx = field->nx;
  size_t ny = field->ny;
  ptrdiff_t sx = field->sx ? field->sx : 1;
  ptrdiff_t sy = field->sy ? field->sy : (ptrdiff_t)nx;

  /* decomposition of the block range into per-thread chunks */
  uint threads = thread_count_omp(stream);
  size_t bx = (nx + 3) / 4; /* blocks along x */
  size_t by = (ny + 3) / 4; /* blocks along y */
  size_t blocks = bx * by;
  size_t chunks = chunk_count_omp(stream, blocks, threads);
  int chunk; /* signed index; NOTE(review): presumably for OpenMP loop-form compatibility */

  /* one bit stream per chunk */
  bitstream** bs = compress_init_par(stream, field, chunks, blocks);
  if (!bs)
    return;

  #pragma omp parallel for num_threads(threads)
  for (chunk = 0; chunk < (int)chunks; chunk++) {
    /* half-open range [first, last) of blocks owned by this chunk */
    size_t first = chunk_offset(blocks, chunks, chunk + 0);
    size_t last = chunk_offset(blocks, chunks, chunk + 1);
    size_t block;
    /* private stream copy that writes to this chunk's bit stream */
    zfp_stream s = *stream;
    zfp_stream_set_bit_stream(&s, bs[chunk]);
    for (block = first; block < last; block++) {
      /* convert the linear block index to the block's (x, y) origin */
      size_t x = 4 * (block % bx);
      size_t y = 4 * (block / bx);
      /* sum the signed offset first, then apply it to the base pointer */
      ptrdiff_t offset = sx * (ptrdiff_t)x + sy * (ptrdiff_t)y;
      const Scalar* p = data + offset;
      if (nx - x >= 4u && ny - y >= 4u)
        _t2(zfp_encode_block_strided, Scalar, 2)(&s, p, sx, sy);
      else /* block extends past the array in x or y: clamp extents to 4 */
        _t2(zfp_encode_partial_block_strided, Scalar, 2)(&s, p, MIN(nx - x, 4u), MIN(ny - y, 4u), sx, sy);
    }
  }

  /* hand the per-chunk bit streams back for finalization */
  compress_finish_par(stream, bs, chunks);
}
/*
 * Compress a strided 3D array of Scalar in parallel using OpenMP.
 *
 * Zero strides in the field default to a contiguous layout (sx = 1,
 * sy = nx, sz = nx * ny).  The bx * by * bz blocks of 4x4x4 samples are
 * numbered with x varying fastest and split into chunks; each chunk is
 * encoded into its own bit stream, bs[chunk], through a private copy of
 * *stream.  Nothing is encoded if compress_init_par() returns NULL.
 */
static void
_t2(compress_strided_omp, Scalar, 3)(zfp_stream* stream, const zfp_field* field)
{
  /* array metadata */
  const Scalar* data = (const Scalar*)field->data;
  size_t nx = field->nx;
  size_t ny = field->ny;
  size_t nz = field->nz;
  ptrdiff_t sx = field->sx ? field->sx : 1;
  ptrdiff_t sy = field->sy ? field->sy : (ptrdiff_t)nx;
  ptrdiff_t sz = field->sz ? field->sz : (ptrdiff_t)(nx * ny);

  /* decomposition of the block range into per-thread chunks */
  uint threads = thread_count_omp(stream);
  size_t bx = (nx + 3) / 4; /* blocks along x */
  size_t by = (ny + 3) / 4; /* blocks along y */
  size_t bz = (nz + 3) / 4; /* blocks along z */
  size_t blocks = bx * by * bz;
  size_t chunks = chunk_count_omp(stream, blocks, threads);
  int chunk; /* signed index; NOTE(review): presumably for OpenMP loop-form compatibility */

  /* one bit stream per chunk */
  bitstream** bs = compress_init_par(stream, field, chunks, blocks);
  if (!bs)
    return;

  #pragma omp parallel for num_threads(threads)
  for (chunk = 0; chunk < (int)chunks; chunk++) {
    /* half-open range [first, last) of blocks owned by this chunk */
    size_t first = chunk_offset(blocks, chunks, chunk + 0);
    size_t last = chunk_offset(blocks, chunks, chunk + 1);
    size_t block;
    /* private stream copy that writes to this chunk's bit stream */
    zfp_stream s = *stream;
    zfp_stream_set_bit_stream(&s, bs[chunk]);
    for (block = first; block < last; block++) {
      /* convert the linear block index to the block's (x, y, z) origin */
      size_t x = 4 * (block % bx);
      size_t y = 4 * ((block / bx) % by);
      size_t z = 4 * ((block / bx) / by);
      /* sum the signed offset first, then apply it to the base pointer */
      ptrdiff_t offset = sx * (ptrdiff_t)x + sy * (ptrdiff_t)y + sz * (ptrdiff_t)z;
      const Scalar* p = data + offset;
      if (nx - x >= 4u && ny - y >= 4u && nz - z >= 4u)
        _t2(zfp_encode_block_strided, Scalar, 3)(&s, p, sx, sy, sz);
      else /* block extends past the array in some dimension: clamp extents to 4 */
        _t2(zfp_encode_partial_block_strided, Scalar, 3)(&s, p, MIN(nx - x, 4u), MIN(ny - y, 4u), MIN(nz - z, 4u), sx, sy, sz);
    }
  }

  /* hand the per-chunk bit streams back for finalization */
  compress_finish_par(stream, bs, chunks);
}
/*
 * Compress a strided 4D array of Scalar in parallel using OpenMP.
 *
 * Zero strides in the field default to a contiguous layout (sx = 1,
 * sy = nx, sz = nx * ny, sw = nx * ny * nz).  The bx * by * bz * bw blocks
 * of 4x4x4x4 samples are numbered with x varying fastest and split into
 * chunks; each chunk is encoded into its own bit stream, bs[chunk], through
 * a private copy of *stream.  Nothing is encoded if compress_init_par()
 * returns NULL.
 */
static void
_t2(compress_strided_omp, Scalar, 4)(zfp_stream* stream, const zfp_field* field)
{
  /* array metadata; explicit cast matches the lower-dimensional variants */
  const Scalar* data = (const Scalar*)field->data;
  size_t nx = field->nx;
  size_t ny = field->ny;
  size_t nz = field->nz;
  size_t nw = field->nw;
  ptrdiff_t sx = field->sx ? field->sx : 1;
  ptrdiff_t sy = field->sy ? field->sy : (ptrdiff_t)nx;
  ptrdiff_t sz = field->sz ? field->sz : (ptrdiff_t)(nx * ny);
  ptrdiff_t sw = field->sw ? field->sw : (ptrdiff_t)(nx * ny * nz);

  /* decomposition of the block range into per-thread chunks */
  uint threads = thread_count_omp(stream);
  size_t bx = (nx + 3) / 4; /* blocks along x */
  size_t by = (ny + 3) / 4; /* blocks along y */
  size_t bz = (nz + 3) / 4; /* blocks along z */
  size_t bw = (nw + 3) / 4; /* blocks along w */
  size_t blocks = bx * by * bz * bw;
  size_t chunks = chunk_count_omp(stream, blocks, threads);
  int chunk; /* signed index; NOTE(review): presumably for OpenMP loop-form compatibility */

  /* one bit stream per chunk */
  bitstream** bs = compress_init_par(stream, field, chunks, blocks);
  if (!bs)
    return;

  #pragma omp parallel for num_threads(threads)
  for (chunk = 0; chunk < (int)chunks; chunk++) {
    /* half-open range [bmin, bmax) of blocks owned by this chunk */
    size_t bmin = chunk_offset(blocks, chunks, chunk + 0);
    size_t bmax = chunk_offset(blocks, chunks, chunk + 1);
    size_t block;
    /* private stream copy that writes to this chunk's bit stream */
    zfp_stream s = *stream;
    zfp_stream_set_bit_stream(&s, bs[chunk]);
    for (block = bmin; block < bmax; block++) {
      const Scalar* p = data;
      size_t b = block;
      size_t x, y, z, w;
      /* peel off one coordinate at a time, x fastest, to get the block origin */
      x = 4 * (b % bx); b /= bx;
      y = 4 * (b % by); b /= by;
      z = 4 * (b % bz); b /= bz;
      w = 4 * b;
      /* the signed offset is summed first, then applied to the base pointer */
      p += sx * (ptrdiff_t)x + sy * (ptrdiff_t)y + sz * (ptrdiff_t)z + sw * (ptrdiff_t)w;
      if (nx - x < 4u || ny - y < 4u || nz - z < 4u || nw - w < 4u)
        /* block extends past the array in some dimension: clamp extents to 4 */
        _t2(zfp_encode_partial_block_strided, Scalar, 4)(&s, p, MIN(nx - x, 4u), MIN(ny - y, 4u), MIN(nz - z, 4u), MIN(nw - w, 4u), sx, sy, sz, sw);
      else
        _t2(zfp_encode_block_strided, Scalar, 4)(&s, p, sx, sy, sz, sw);
    }
  }

  /* hand the per-chunk bit streams back for finalization */
  compress_finish_par(stream, bs, chunks);
}
#endif