const BLOCK_SIZE: usize = 16;
const NBR_SEGMENTS: usize = 4;
const RECURSIVE_LIMIT: usize = 128;
const SMALL_LEN: usize = 255;
const MEDIUM_LEN: usize = 1024*1024;
unsafe fn transpose_small<T: Copy>(input: &[T], output: &mut [T], width: usize, height: usize) {
for x in 0..width {
for y in 0..height {
let input_index = x + y * width;
let output_index = y + x * height;
*output.get_unchecked_mut(output_index) = *input.get_unchecked(input_index);
}
}
}
unsafe fn transpose_block<T: Copy>(input: &[T], output: &mut [T], width: usize, height: usize, start_x: usize, start_y: usize, block_width: usize, block_height: usize) {
for inner_x in 0..block_width {
for inner_y in 0..block_height {
let x = start_x + inner_x;
let y = start_y + inner_y;
let input_index = x + y * width;
let output_index = y + x * height;
*output.get_unchecked_mut(output_index) = *input.get_unchecked(input_index);
}
}
}
unsafe fn transpose_block_segmented<T: Copy>(input: &[T], output: &mut [T], width: usize, height: usize, start_x: usize, start_y: usize, block_width: usize, block_height: usize) {
let height_per_div = block_height/NBR_SEGMENTS;
for subblock in 0..NBR_SEGMENTS {
for inner_x in 0..block_width {
for inner_y in 0..height_per_div {
let x = start_x + inner_x;
let y = start_y + inner_y + subblock*height_per_div;
let input_index = x + y * width;
let output_index = y + x * height;
*output.get_unchecked_mut(output_index) = *input.get_unchecked(input_index);
}
}
}
}
fn transpose_tiled<T: Copy>(input: &[T], output: &mut [T], input_width: usize, input_height: usize) {
let x_block_count = input_width / BLOCK_SIZE;
let y_block_count = input_height / BLOCK_SIZE;
let remainder_x = input_width - x_block_count * BLOCK_SIZE;
let remainder_y = input_height - y_block_count * BLOCK_SIZE;
for y_block in 0..y_block_count {
for x_block in 0..x_block_count {
unsafe {
transpose_block(
input, output,
input_width, input_height,
x_block * BLOCK_SIZE, y_block * BLOCK_SIZE,
BLOCK_SIZE, BLOCK_SIZE,
);
}
}
if remainder_x > 0 {
unsafe {
transpose_block(
input, output,
input_width, input_height,
input_width - remainder_x, y_block * BLOCK_SIZE,
remainder_x, BLOCK_SIZE);
}
}
}
if remainder_y > 0 {
for x_block in 0..x_block_count {
unsafe {
transpose_block(
input, output,
input_width, input_height,
x_block * BLOCK_SIZE, input_height - remainder_y,
BLOCK_SIZE, remainder_y,
);
}
}
if remainder_x > 0 {
unsafe {
transpose_block(
input, output,
input_width, input_height,
input_width - remainder_x, input_height - remainder_y,
remainder_x, remainder_y);
}
}
}
}
fn transpose_recursive<T: Copy>(input: &[T], output: &mut [T], row_start: usize, row_end: usize, col_start: usize, col_end: usize, total_columns: usize, total_rows: usize) {
let nbr_rows = row_end - row_start;
let nbr_cols = col_end - col_start;
if (nbr_rows <= RECURSIVE_LIMIT && nbr_cols <= RECURSIVE_LIMIT) || nbr_rows<=2 || nbr_cols<=2 {
let x_block_count = nbr_cols / BLOCK_SIZE;
let y_block_count = nbr_rows / BLOCK_SIZE;
let remainder_x = nbr_cols - x_block_count * BLOCK_SIZE;
let remainder_y = nbr_rows - y_block_count * BLOCK_SIZE;
for y_block in 0..y_block_count {
for x_block in 0..x_block_count {
unsafe {
transpose_block_segmented(
input, output,
total_columns, total_rows,
col_start + x_block * BLOCK_SIZE, row_start + y_block * BLOCK_SIZE,
BLOCK_SIZE, BLOCK_SIZE,
);
}
}
if remainder_x > 0 {
unsafe {
transpose_block(
input, output,
total_columns, total_rows,
col_start + x_block_count * BLOCK_SIZE, row_start + y_block * BLOCK_SIZE,
remainder_x, BLOCK_SIZE);
}
}
}
if remainder_y > 0 {
for x_block in 0..x_block_count {
unsafe {
transpose_block(
input, output,
total_columns, total_rows,
col_start + x_block * BLOCK_SIZE, row_start + y_block_count * BLOCK_SIZE,
BLOCK_SIZE, remainder_y,
);
}
}
if remainder_x > 0 {
unsafe {
transpose_block(
input, output,
total_columns, total_rows,
col_start + x_block_count * BLOCK_SIZE, row_start + y_block_count * BLOCK_SIZE,
remainder_x, remainder_y);
}
}
}
} else if nbr_rows >= nbr_cols {
transpose_recursive(input, output, row_start, row_start + (nbr_rows / 2), col_start, col_end, total_columns, total_rows);
transpose_recursive(input, output, row_start + (nbr_rows / 2), row_end, col_start, col_end, total_columns, total_rows);
} else {
transpose_recursive(input, output, row_start, row_end, col_start, col_start + (nbr_cols / 2), total_columns, total_rows);
transpose_recursive(input, output, row_start, row_end, col_start + (nbr_cols / 2), col_end, total_columns, total_rows);
}
}
pub fn transpose<T: Copy>(input: &[T], output: &mut [T], input_width: usize, input_height: usize) {
assert_eq!(input_width.checked_mul(input_height), Some(input.len()));
assert_eq!(input.len(), output.len());
if input.len() <= SMALL_LEN {
unsafe { transpose_small(input, output, input_width, input_height) };
}
else if input.len() <= MEDIUM_LEN {
transpose_tiled(input, output, input_width, input_height);
}
else {
transpose_recursive(input, output, 0, input_height, 0, input_width, input_width, input_height);
}
}