#![allow(clippy::too_many_arguments)]
use super::ac_strategy::*;
use crate::debug_rect;
use crate::effort::EffortProfile;
/// Records the entropy of a chosen multi-block transform into the 8x8
/// `entropy_estimate` cache grid.
///
/// The transform covers `cx` x `cy` cells with its top-left cell at
/// (`ox`, `oy`). Every covered cell is cleared to zero, then the whole
/// transform's `entropy` is stored in the top-left cell only, so summing
/// over any covered region still yields the transform's total cost.
#[inline]
fn set_entropy_for_transform(
    entropy_estimate: &mut [f32; 64],
    ox: usize,
    oy: usize,
    cx: usize,
    cy: usize,
    entropy: f32,
) {
    for row in oy..oy + cy {
        let start = row * 8 + ox;
        entropy_estimate[start..start + cx].fill(0.0);
    }
    entropy_estimate[oy * 8 + ox] = entropy;
}
/// Picks the best AC strategy for the 2x2 group of 8x8 blocks at
/// (`bx0 + cx`, `by0 + cy`) and records the winners in `ac_strategy`.
///
/// Runtime-dispatch entry point: it summons the SIMD token for the
/// current architecture (x86-64-v3, NEON, or wasm SIMD128) and forwards
/// to the matching `#[archmage::arcane]` wrapper — presumably so the
/// shared `_impl` body is compiled with that target's features (see
/// archmage docs) — falling back to the plainly-compiled `_impl` when no
/// token is available. All arguments are forwarded unchanged.
#[allow(clippy::too_many_arguments, clippy::needless_range_loop)]
pub(super) fn find_best_16x16_transform(
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize, by0: usize, cx: usize, cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8, ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    favor_single_mul: f32,
    cache_offset: Option<(usize, usize)>,
    profile: &EffortProfile,
) {
    #[cfg(target_arch = "x86_64")]
    {
        use jxl_simd::SimdToken;
        if let Some(token) = jxl_simd::X64V3Token::summon() {
            find_best_16x16_transform_avx2(
                token, xyb, stride, bx0, by0, cx, cy, distance, quant_field,
                xsize_blocks, masking, ytox, ytob, mask1x1, mask1x1_stride,
                ac_strategy, scratch, favor_single_mul, cache_offset, profile,
            );
            return;
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        use jxl_simd::SimdToken;
        if let Some(token) = jxl_simd::NeonToken::summon() {
            find_best_16x16_transform_neon(
                token, xyb, stride, bx0, by0, cx, cy, distance, quant_field,
                xsize_blocks, masking, ytox, ytob, mask1x1, mask1x1_stride,
                ac_strategy, scratch, favor_single_mul, cache_offset, profile,
            );
            return;
        }
    }
    #[cfg(target_arch = "wasm32")]
    {
        use jxl_simd::SimdToken;
        if let Some(token) = jxl_simd::Wasm128Token::summon() {
            find_best_16x16_transform_wasm128(
                token, xyb, stride, bx0, by0, cx, cy, distance, quant_field,
                xsize_blocks, masking, ytox, ytob, mask1x1, mask1x1_stride,
                ac_strategy, scratch, favor_single_mul, cache_offset, profile,
            );
            return;
        }
    }
    // Scalar fallback (also used on architectures with no SIMD token).
    find_best_16x16_transform_impl(
        xyb, stride, bx0, by0, cx, cy, distance, quant_field, xsize_blocks,
        masking, ytox, ytob, mask1x1, mask1x1_stride, ac_strategy, scratch,
        favor_single_mul, cache_offset, profile,
    );
}
/// x86-64-v3 (AVX2) wrapper around [`find_best_16x16_transform_impl`].
///
/// `#[archmage::arcane]` presumably recompiles the inlined `_impl` body
/// for the token's target features; the unused token argument proves the
/// caller detected those features at runtime — confirm against archmage docs.
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
#[allow(clippy::too_many_arguments, clippy::needless_range_loop)]
fn find_best_16x16_transform_avx2(
    _token: jxl_simd::X64V3Token,
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize, by0: usize, cx: usize, cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8, ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    favor_single_mul: f32,
    cache_offset: Option<(usize, usize)>,
    profile: &EffortProfile,
) {
    find_best_16x16_transform_impl(
        xyb, stride, bx0, by0, cx, cy, distance, quant_field, xsize_blocks,
        masking, ytox, ytob, mask1x1, mask1x1_stride, ac_strategy, scratch,
        favor_single_mul, cache_offset, profile,
    );
}
/// NEON wrapper around [`find_best_16x16_transform_impl`]; same pattern
/// as the AVX2 wrapper but for aarch64.
#[cfg(target_arch = "aarch64")]
#[archmage::arcane]
#[allow(clippy::too_many_arguments, clippy::needless_range_loop)]
fn find_best_16x16_transform_neon(
    _token: jxl_simd::NeonToken,
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize, by0: usize, cx: usize, cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8, ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    favor_single_mul: f32,
    cache_offset: Option<(usize, usize)>,
    profile: &EffortProfile,
) {
    find_best_16x16_transform_impl(
        xyb, stride, bx0, by0, cx, cy, distance, quant_field, xsize_blocks,
        masking, ytox, ytob, mask1x1, mask1x1_stride, ac_strategy, scratch,
        favor_single_mul, cache_offset, profile,
    );
}
/// Wasm SIMD128 wrapper around [`find_best_16x16_transform_impl`]; same
/// pattern as the AVX2 wrapper but for wasm32.
#[cfg(target_arch = "wasm32")]
#[archmage::arcane]
#[allow(clippy::too_many_arguments, clippy::needless_range_loop)]
fn find_best_16x16_transform_wasm128(
    _token: jxl_simd::Wasm128Token,
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize, by0: usize, cx: usize, cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8, ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    favor_single_mul: f32,
    cache_offset: Option<(usize, usize)>,
    profile: &EffortProfile,
) {
    find_best_16x16_transform_impl(
        xyb, stride, bx0, by0, cx, cy, distance, quant_field, xsize_blocks,
        masking, ytox, ytob, mask1x1, mask1x1_stride, ac_strategy, scratch,
        favor_single_mul, cache_offset, profile,
    );
}
/// Scalar/SIMD-shared body of [`find_best_16x16_transform`].
///
/// For the 2x2 group of 8x8 blocks at (`bx0 + cx`, `by0 + cy`):
/// 1. picks the cheapest single-block strategy per 8x8 block
///    (DCT8 baseline, optionally DCT4x8/8x4/4x4 and the four AFV
///    variants, always IDENTITY and DCT2x2);
/// 2. if `profile.try_dct16` is set, compares merged 16x8 / 8x16 / 16x16
///    transforms against the per-block winners;
/// 3. writes the winning strategies into `ac_strategy` and, when
///    `cache_offset = (ox, oy)` is given, caches the chosen costs in
///    `scratch.entropy_estimate` (an 8x8 grid) for later merge passes.
#[inline(always)]
#[allow(clippy::too_many_arguments, clippy::needless_range_loop)]
fn find_best_16x16_transform_impl(
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize, by0: usize, cx: usize, cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8, ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    favor_single_mul: f32,
    cache_offset: Option<(usize, usize)>,
    profile: &EffortProfile,
) {
    // The presence of a 1x1 mask plane selects the pixel-domain cost model.
    let use_pixel_domain = mask1x1.is_some();
    // Distance-dependent cost multiplier; each k is a (mul, offset, base) triple.
    let compute_mul = |k: (f32, f32, f32)| k.1 + k.0 / (distance + k.2);
    let (mul8x8, mul16x8, mul16x16, mul4x8, mul4x4) = if use_pixel_domain {
        // Pixel domain: merged sizes are unweighted; all 8x8-class sizes
        // share one hard-coded multiplier.
        let mul8x8 = 1.0 + (-0.4) / (distance + 1.4);
        (mul8x8, 1.0_f32, 1.0_f32, mul8x8, mul8x8)
    } else {
        (
            compute_mul(profile.k8x8),
            compute_mul(profile.k16x8),
            compute_mul(profile.k16x16),
            compute_mul(profile.k4x8),
            compute_mul(profile.k4x4),
        )
    };
    // Flat per-block cost, applied only in the coefficient domain.
    let base_cost_8x8 = if use_pixel_domain { 0.0 } else { 3.0 * mul8x8 };
    // Bias toward IDENTITY/DCT2x2 at low distance, fading quadratically to 0 at d=5.
    let favor_weight = if distance < 5.0 {
        ((5.0 - distance) / 5.0_f32).powi(2)
    } else {
        0.0
    };
    let favor_2x2_adjust = profile.k_favor_2x2 * favor_weight;
    // Penalty discouraging non-DCT8 transforms at high distance.
    // NOTE(review): (12.0 - 4.0) / (distance - 4.0) grows without bound as
    // distance approaches 4 from above (it is 1.0 at d=12, matching the
    // else branch) — presumably an intentional hard push; confirm.
    let avoid_transforms_adjust = if distance > 4.0 {
        let mul = if distance < 12.0 {
            (12.0 - 4.0) / (distance - 4.0)
        } else {
            1.0
        };
        profile.k_avoid_transforms_base * mul
    } else {
        0.0
    };
    let abs_bx = bx0 + cx;
    let abs_by = by0 + cy;
    let scaled_constants = if use_pixel_domain {
        compute_scaled_constants(
            distance,
            (
                profile.k_info_loss_mul_base,
                profile.k_zeros_mul_base,
                profile.k_cost_delta_base,
            ),
        )
    } else {
        COEFF_DOMAIN_CONSTANTS
    };
    // Best cost and strategy for each of the four 8x8 blocks in the group.
    let mut entropy = [[0.0f32; 2]; 2];
    let mut best_single_strategy = [[RAW_STRATEGY_DCT8; 2]; 2];
    for (dy, (entropy_row, strat_row)) in entropy
        .iter_mut()
        .zip(best_single_strategy.iter_mut())
        .enumerate()
    {
        for (dx, (entropy_val, best_strat)) in
            entropy_row.iter_mut().zip(strat_row.iter_mut()).enumerate()
        {
            let block_x = abs_bx + dx;
            let block_y = abs_by + dy;
            // Shorthand: entropy estimate for one strategy at this block,
            // with a strategy-specific cost adjustment.
            macro_rules! eval {
                ($strategy:expr, $adjust:expr) => {
                    estimate_entropy_with_mask(
                        $strategy, xyb, stride, block_x, block_y, distance,
                        quant_field, xsize_blocks, masking, ytox, ytob,
                        mask1x1, mask1x1_stride, $adjust, scaled_constants,
                        &profile.entropy_mul_table, scratch,
                    )
                };
            }
            // DCT8 is the baseline every alternative must beat.
            let e8 = eval!(RAW_STRATEGY_DCT8, 0.0);
            let cost8 = base_cost_8x8 + mul8x8 * e8;
            *entropy_val = cost8;
            *best_strat = RAW_STRATEGY_DCT8;
            if profile.try_dct4x8_afv {
                // Sub-8x8 DCT variants share the 4x8 base cost for both orientations.
                let e4x8 = eval!(RAW_STRATEGY_DCT4X8, avoid_transforms_adjust);
                let base_cost_4x8 = if use_pixel_domain { 0.0 } else { 3.0 * mul4x8 };
                let cost4x8 = base_cost_4x8 + mul4x8 * e4x8;
                let e8x4 = eval!(RAW_STRATEGY_DCT8X4, avoid_transforms_adjust);
                let cost8x4 = base_cost_4x8 + mul4x8 * e8x4;
                let e4x4 = eval!(RAW_STRATEGY_DCT4X4, avoid_transforms_adjust);
                let base_cost_4x4 = if use_pixel_domain { 0.0 } else { 3.0 * mul4x4 };
                let cost4x4 = base_cost_4x4 + mul4x4 * e4x4;
                if cost4x8 < *entropy_val {
                    *entropy_val = cost4x8;
                    *best_strat = RAW_STRATEGY_DCT4X8;
                }
                if cost8x4 < *entropy_val {
                    *entropy_val = cost8x4;
                    *best_strat = RAW_STRATEGY_DCT8X4;
                }
                if cost4x4 < *entropy_val {
                    *entropy_val = cost4x4;
                    *best_strat = RAW_STRATEGY_DCT4X4;
                }
                // The four AFV variants occupy consecutive raw-strategy ids.
                let base_cost_afv = if use_pixel_domain { 0.0 } else { 3.0 * mul8x8 };
                for afv_idx in 0..4u8 {
                    let afv_strategy = RAW_STRATEGY_AFV0 + afv_idx;
                    let e_afv = eval!(afv_strategy, avoid_transforms_adjust);
                    let cost_afv = base_cost_afv + mul8x8 * e_afv;
                    if cost_afv < *entropy_val {
                        *entropy_val = cost_afv;
                        *best_strat = afv_strategy;
                    }
                }
            }
            // IDENTITY and DCT2x2 are always tried, with the low-distance bias.
            let e_identity = eval!(RAW_STRATEGY_IDENTITY, favor_2x2_adjust);
            let base_cost_identity = if use_pixel_domain { 0.0 } else { 3.0 * mul8x8 };
            let cost_identity = base_cost_identity + mul8x8 * e_identity;
            let e_dct2 = eval!(RAW_STRATEGY_DCT2X2, favor_2x2_adjust);
            let base_cost_dct2 = if use_pixel_domain { 0.0 } else { 3.0 * mul8x8 };
            let cost_dct2 = base_cost_dct2 + mul8x8 * e_dct2;
            if cost_identity < *entropy_val {
                *entropy_val = cost_identity;
                *best_strat = RAW_STRATEGY_IDENTITY;
            }
            if cost_dct2 < *entropy_val {
                *entropy_val = cost_dct2;
                *best_strat = RAW_STRATEGY_DCT2X2;
            }
        }
    }
    if !profile.try_dct16 {
        // No merging at this effort level: commit per-block winners
        // (DCT8 entries are skipped — presumably the map default; confirm).
        for dy in 0..2 {
            for dx in 0..2 {
                let strat = best_single_strategy[dy][dx];
                if strat != RAW_STRATEGY_DCT8 {
                    ac_strategy.set(abs_bx + dx, abs_by + dy, strat);
                }
            }
        }
        if let Some((ox, oy)) = cache_offset {
            for dy in 0..2 {
                for dx in 0..2 {
                    scratch.entropy_estimate[(oy + dy) * 8 + (ox + dx)] = entropy[dy][dx];
                }
            }
        }
        return;
    }
    // Merged-transform candidates: each half of the group, plus the whole 16x16.
    let entropy_16x8_left = mul16x8
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT16X8, xyb, stride, abs_bx, abs_by, distance,
            quant_field, xsize_blocks, masking, ytox, ytob, mask1x1,
            mask1x1_stride, 0.0, scaled_constants, &profile.entropy_mul_table,
            scratch,
        );
    let entropy_16x8_right = mul16x8
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT16X8, xyb, stride, abs_bx + 1, abs_by, distance,
            quant_field, xsize_blocks, masking, ytox, ytob, mask1x1,
            mask1x1_stride, 0.0, scaled_constants, &profile.entropy_mul_table,
            scratch,
        );
    let entropy_8x16_top = mul16x8
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT8X16, xyb, stride, abs_bx, abs_by, distance,
            quant_field, xsize_blocks, masking, ytox, ytob, mask1x1,
            mask1x1_stride, 0.0, scaled_constants, &profile.entropy_mul_table,
            scratch,
        );
    let entropy_8x16_bottom = mul16x8
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT8X16, xyb, stride, abs_bx, abs_by + 1, distance,
            quant_field, xsize_blocks, masking, ytox, ytob, mask1x1,
            mask1x1_stride, 0.0, scaled_constants, &profile.entropy_mul_table,
            scratch,
        );
    let entropy_16x16 = mul16x16
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT16X16, xyb, stride, abs_bx, abs_by, distance,
            quant_field, xsize_blocks, masking, ytox, ytob, mask1x1,
            mask1x1_stride, 0.0, scaled_constants, &profile.entropy_mul_table,
            scratch,
        );
    // Optional reweighting of the single-block costs before the comparison.
    if favor_single_mul != 1.0 {
        for row in &mut entropy {
            for val in row {
                *val *= favor_single_mul;
            }
        }
    }
    let cost_all_single = entropy[0][0] + entropy[0][1] + entropy[1][0] + entropy[1][1];
    // Each half independently chooses merged vs. its two singles.
    let cost16x8 = (entropy_16x8_left).min(entropy[0][0] + entropy[1][0])
        + (entropy_16x8_right).min(entropy[0][1] + entropy[1][1]);
    let cost8x16 = (entropy_8x16_top).min(entropy[0][0] + entropy[0][1])
        + (entropy_8x16_bottom).min(entropy[1][0] + entropy[1][1]);
    let cost16x16 = entropy_16x16;
    let best_rect = cost16x8.min(cost8x16);
    let best_large = best_rect.min(cost16x16);
    // Averages over the 2x2 group, used only by the debug overlay below;
    // the bounds check falls back to a single (or zero) sample at the edge.
    let qi = abs_by * xsize_blocks + abs_bx;
    let qf_avg = if qi + 1 + xsize_blocks < quant_field.len() {
        (quant_field[qi]
            + quant_field[qi + 1]
            + quant_field[qi + xsize_blocks]
            + quant_field[qi + 1 + xsize_blocks])
            / 4.0
    } else {
        quant_field.get(qi).copied().unwrap_or(0.0)
    };
    let mask_avg = if qi + 1 + xsize_blocks < masking.len() {
        (masking[qi]
            + masking[qi + 1]
            + masking[qi + xsize_blocks]
            + masking[qi + 1 + xsize_blocks])
            / 4.0
    } else {
        masking.get(qi).copied().unwrap_or(0.0)
    };
    debug_rect!(
        "acs/16x16",
        abs_bx * 8,
        abs_by * 8,
        16,
        16,
        "winner={} singles={:.0} 16x16={:.0} 16x8={:.0} 8x16={:.0} | qf={:.1} mask={:.2} mul8={:.2} mul16={:.2}",
        if best_large >= cost_all_single {
            "singles"
        } else if cost16x16 <= best_rect {
            "16x16"
        } else if cost16x8 < cost8x16 {
            "16x8"
        } else {
            "8x16"
        },
        cost_all_single,
        cost16x16,
        cost16x8,
        cost8x16,
        qf_avg,
        mask_avg,
        mul8x8,
        mul16x16
    );
    // Ties go to the singles here (>=) and to 16x16 over rects below (<=).
    if best_large >= cost_all_single {
        for dy in 0..2 {
            for dx in 0..2 {
                let strat = best_single_strategy[dy][dx];
                if strat != RAW_STRATEGY_DCT8 {
                    ac_strategy.set(abs_bx + dx, abs_by + dy, strat);
                }
            }
        }
        if let Some((ox, oy)) = cache_offset {
            for dy in 0..2 {
                for dx in 0..2 {
                    scratch.entropy_estimate[(oy + dy) * 8 + (ox + dx)] = entropy[dy][dx];
                }
            }
        }
        return;
    }
    if cost16x16 <= best_rect {
        ac_strategy.set(abs_bx, abs_by, RAW_STRATEGY_DCT16X16);
    } else if cost16x8 < cost8x16 {
        // 16x8 wins overall, but each column still compares merged vs. singles.
        if entropy_16x8_left < entropy[0][0] + entropy[1][0] {
            ac_strategy.set(abs_bx, abs_by, RAW_STRATEGY_DCT16X8);
        } else {
            for dy in 0..2 {
                let strat = best_single_strategy[dy][0];
                if strat != RAW_STRATEGY_DCT8 {
                    ac_strategy.set(abs_bx, abs_by + dy, strat);
                }
            }
        }
        if entropy_16x8_right < entropy[0][1] + entropy[1][1] {
            ac_strategy.set(abs_bx + 1, abs_by, RAW_STRATEGY_DCT16X8);
        } else {
            for dy in 0..2 {
                let strat = best_single_strategy[dy][1];
                if strat != RAW_STRATEGY_DCT8 {
                    ac_strategy.set(abs_bx + 1, abs_by + dy, strat);
                }
            }
        }
    } else {
        // 8x16 wins overall; each row compares merged vs. singles.
        if entropy_8x16_top < entropy[0][0] + entropy[0][1] {
            ac_strategy.set(abs_bx, abs_by, RAW_STRATEGY_DCT8X16);
        } else {
            for dx in 0..2 {
                let strat = best_single_strategy[0][dx];
                if strat != RAW_STRATEGY_DCT8 {
                    ac_strategy.set(abs_bx + dx, abs_by, strat);
                }
            }
        }
        if entropy_8x16_bottom < entropy[1][0] + entropy[1][1] {
            ac_strategy.set(abs_bx, abs_by + 1, RAW_STRATEGY_DCT8X16);
        } else {
            for dx in 0..2 {
                let strat = best_single_strategy[1][dx];
                if strat != RAW_STRATEGY_DCT8 {
                    ac_strategy.set(abs_bx + dx, abs_by + 1, strat);
                }
            }
        }
    }
    // Mirror the decision into the entropy cache (same comparisons as above).
    if let Some((ox, oy)) = cache_offset {
        if cost16x16 <= best_rect {
            set_entropy_for_transform(&mut scratch.entropy_estimate, ox, oy, 2, 2, entropy_16x16);
        } else if cost16x8 < cost8x16 {
            if entropy_16x8_left < entropy[0][0] + entropy[1][0] {
                set_entropy_for_transform(
                    &mut scratch.entropy_estimate,
                    ox, oy, 1, 2,
                    entropy_16x8_left,
                );
            } else {
                scratch.entropy_estimate[oy * 8 + ox] = entropy[0][0];
                scratch.entropy_estimate[(oy + 1) * 8 + ox] = entropy[1][0];
            }
            if entropy_16x8_right < entropy[0][1] + entropy[1][1] {
                set_entropy_for_transform(
                    &mut scratch.entropy_estimate,
                    ox + 1, oy, 1, 2,
                    entropy_16x8_right,
                );
            } else {
                scratch.entropy_estimate[oy * 8 + (ox + 1)] = entropy[0][1];
                scratch.entropy_estimate[(oy + 1) * 8 + (ox + 1)] = entropy[1][1];
            }
        } else {
            if entropy_8x16_top < entropy[0][0] + entropy[0][1] {
                set_entropy_for_transform(
                    &mut scratch.entropy_estimate,
                    ox, oy, 2, 1,
                    entropy_8x16_top,
                );
            } else {
                scratch.entropy_estimate[oy * 8 + ox] = entropy[0][0];
                scratch.entropy_estimate[oy * 8 + (ox + 1)] = entropy[0][1];
            }
            if entropy_8x16_bottom < entropy[1][0] + entropy[1][1] {
                set_entropy_for_transform(
                    &mut scratch.entropy_estimate,
                    ox, oy + 1, 2, 1,
                    entropy_8x16_bottom,
                );
            } else {
                scratch.entropy_estimate[(oy + 1) * 8 + ox] = entropy[1][0];
                scratch.entropy_estimate[(oy + 1) * 8 + (ox + 1)] = entropy[1][1];
            }
        }
    }
}
/// Attempts to merge the 2x2 block group at (`bx0 + cx`, `by0 + cy`)
/// into a 16x8 / 8x16 / 16x16 transform, using per-block costs already
/// cached in `scratch.entropy_estimate`. Returns `true` iff a merge was
/// applied.
///
/// Runtime-dispatch entry point; same token/forwarding pattern as
/// [`find_best_16x16_transform`].
#[allow(clippy::too_many_arguments)]
pub(super) fn try_merge_16x16(
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize, by0: usize, cx: usize, cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8, ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    profile: &EffortProfile,
) -> bool {
    #[cfg(target_arch = "x86_64")]
    {
        use jxl_simd::SimdToken;
        if let Some(token) = jxl_simd::X64V3Token::summon() {
            return try_merge_16x16_avx2(
                token, xyb, stride, bx0, by0, cx, cy, distance, quant_field,
                xsize_blocks, masking, ytox, ytob, mask1x1, mask1x1_stride,
                ac_strategy, scratch, profile,
            );
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        use jxl_simd::SimdToken;
        if let Some(token) = jxl_simd::NeonToken::summon() {
            return try_merge_16x16_neon(
                token, xyb, stride, bx0, by0, cx, cy, distance, quant_field,
                xsize_blocks, masking, ytox, ytob, mask1x1, mask1x1_stride,
                ac_strategy, scratch, profile,
            );
        }
    }
    #[cfg(target_arch = "wasm32")]
    {
        use jxl_simd::SimdToken;
        if let Some(token) = jxl_simd::Wasm128Token::summon() {
            return try_merge_16x16_wasm128(
                token, xyb, stride, bx0, by0, cx, cy, distance, quant_field,
                xsize_blocks, masking, ytox, ytob, mask1x1, mask1x1_stride,
                ac_strategy, scratch, profile,
            );
        }
    }
    // Scalar fallback.
    try_merge_16x16_impl(
        xyb, stride, bx0, by0, cx, cy, distance, quant_field, xsize_blocks,
        masking, ytox, ytob, mask1x1, mask1x1_stride, ac_strategy, scratch,
        profile,
    )
}
/// x86-64-v3 (AVX2) wrapper around [`try_merge_16x16_impl`];
/// same pattern as the other `#[archmage::arcane]` wrappers in this file.
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
#[allow(clippy::too_many_arguments)]
fn try_merge_16x16_avx2(
    _token: jxl_simd::X64V3Token,
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize, by0: usize, cx: usize, cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8, ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    profile: &EffortProfile,
) -> bool {
    try_merge_16x16_impl(
        xyb, stride, bx0, by0, cx, cy, distance, quant_field, xsize_blocks,
        masking, ytox, ytob, mask1x1, mask1x1_stride, ac_strategy, scratch,
        profile,
    )
}
/// NEON wrapper around [`try_merge_16x16_impl`] for aarch64.
#[cfg(target_arch = "aarch64")]
#[archmage::arcane]
#[allow(clippy::too_many_arguments)]
fn try_merge_16x16_neon(
    _token: jxl_simd::NeonToken,
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize, by0: usize, cx: usize, cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8, ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    profile: &EffortProfile,
) -> bool {
    try_merge_16x16_impl(
        xyb, stride, bx0, by0, cx, cy, distance, quant_field, xsize_blocks,
        masking, ytox, ytob, mask1x1, mask1x1_stride, ac_strategy, scratch,
        profile,
    )
}
/// Wasm SIMD128 wrapper around [`try_merge_16x16_impl`] for wasm32.
#[cfg(target_arch = "wasm32")]
#[archmage::arcane]
#[allow(clippy::too_many_arguments)]
fn try_merge_16x16_wasm128(
    _token: jxl_simd::Wasm128Token,
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize, by0: usize, cx: usize, cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8, ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    profile: &EffortProfile,
) -> bool {
    try_merge_16x16_impl(
        xyb, stride, bx0, by0, cx, cy, distance, quant_field, xsize_blocks,
        masking, ytox, ytob, mask1x1, mask1x1_stride, ac_strategy, scratch,
        profile,
    )
}
/// Scalar/SIMD-shared body of [`try_merge_16x16`].
///
/// Unlike [`find_best_16x16_transform_impl`], the per-block costs are
/// not recomputed here: they are read from `scratch.entropy_estimate`,
/// where `cx`/`cy` serve both as the block-group offset within the tile
/// and as the coordinates into the 8x8 cache grid. Returns `true` iff a
/// merged transform was committed (the 2x2 group is then reset to DCT8
/// and the winning merge written over it, with the cache updated).
#[inline(always)]
#[allow(clippy::too_many_arguments)]
fn try_merge_16x16_impl(
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize, by0: usize, cx: usize, cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8, ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    profile: &EffortProfile,
) -> bool {
    // Pixel-domain mode is signalled by the presence of a 1x1 mask.
    let use_pixel_domain = mask1x1.is_some();
    let scaled_constants = if use_pixel_domain {
        compute_scaled_constants(
            distance,
            (
                profile.k_info_loss_mul_base,
                profile.k_zeros_mul_base,
                profile.k_cost_delta_base,
            ),
        )
    } else {
        COEFF_DOMAIN_CONSTANTS
    };
    // Distance-dependent multipliers for the merged sizes (pixel domain: 1.0).
    let (mul16x8, mul16x16) = if use_pixel_domain {
        (1.0_f32, 1.0_f32)
    } else {
        let compute_mul = |k: (f32, f32, f32)| k.1 + k.0 / (distance + k.2);
        (compute_mul(profile.k16x8), compute_mul(profile.k16x16))
    };
    let abs_bx = bx0 + cx;
    let abs_by = by0 + cy;
    // Previously cached single-block costs for the 2x2 group.
    let cached = [
        [
            scratch.entropy_estimate[cy * 8 + cx],
            scratch.entropy_estimate[cy * 8 + (cx + 1)],
        ],
        [
            scratch.entropy_estimate[(cy + 1) * 8 + cx],
            scratch.entropy_estimate[(cy + 1) * 8 + (cx + 1)],
        ],
    ];
    let cost_all_single = cached[0][0] + cached[0][1] + cached[1][0] + cached[1][1];
    // Merged-transform candidates: whole group plus each half.
    let entropy_16x16 = mul16x16
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT16X16, xyb, stride, abs_bx, abs_by, distance,
            quant_field, xsize_blocks, masking, ytox, ytob, mask1x1,
            mask1x1_stride, 0.0, scaled_constants, &profile.entropy_mul_table,
            scratch,
        );
    let entropy_16x8_left = mul16x8
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT16X8, xyb, stride, abs_bx, abs_by, distance,
            quant_field, xsize_blocks, masking, ytox, ytob, mask1x1,
            mask1x1_stride, 0.0, scaled_constants, &profile.entropy_mul_table,
            scratch,
        );
    let entropy_16x8_right = mul16x8
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT16X8, xyb, stride, abs_bx + 1, abs_by, distance,
            quant_field, xsize_blocks, masking, ytox, ytob, mask1x1,
            mask1x1_stride, 0.0, scaled_constants, &profile.entropy_mul_table,
            scratch,
        );
    let entropy_8x16_top = mul16x8
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT8X16, xyb, stride, abs_bx, abs_by, distance,
            quant_field, xsize_blocks, masking, ytox, ytob, mask1x1,
            mask1x1_stride, 0.0, scaled_constants, &profile.entropy_mul_table,
            scratch,
        );
    let entropy_8x16_bottom = mul16x8
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT8X16, xyb, stride, abs_bx, abs_by + 1, distance,
            quant_field, xsize_blocks, masking, ytox, ytob, mask1x1,
            mask1x1_stride, 0.0, scaled_constants, &profile.entropy_mul_table,
            scratch,
        );
    let cost_single_left = cached[0][0] + cached[1][0];
    let cost_single_right = cached[0][1] + cached[1][1];
    let cost_single_top = cached[0][0] + cached[0][1];
    let cost_single_bottom = cached[1][0] + cached[1][1];
    // Each half independently picks merged vs. its two cached singles.
    let cost16x8 =
        entropy_16x8_left.min(cost_single_left) + entropy_16x8_right.min(cost_single_right);
    let cost8x16 =
        entropy_8x16_top.min(cost_single_top) + entropy_8x16_bottom.min(cost_single_bottom);
    let cost16x16 = entropy_16x16;
    let best_rect = cost16x8.min(cost8x16);
    let best_large = best_rect.min(cost16x16);
    // Ties keep the existing per-block strategies (no merge).
    if best_large >= cost_all_single {
        return false;
    }
    // Reset the 2x2 group to DCT8, then write the winning merge over it.
    for dy in 0..2usize {
        for dx in 0..2usize {
            ac_strategy.set(abs_bx + dx, abs_by + dy, RAW_STRATEGY_DCT8);
        }
    }
    if cost16x16 <= best_rect {
        ac_strategy.set(abs_bx, abs_by, RAW_STRATEGY_DCT16X16);
        set_entropy_for_transform(&mut scratch.entropy_estimate, cx, cy, 2, 2, entropy_16x16);
    } else if cost16x8 < cost8x16 {
        // Halves that keep their singles leave the cache (and DCT8) untouched.
        if entropy_16x8_left < cost_single_left {
            ac_strategy.set(abs_bx, abs_by, RAW_STRATEGY_DCT16X8);
            set_entropy_for_transform(
                &mut scratch.entropy_estimate,
                cx, cy, 1, 2,
                entropy_16x8_left,
            );
        }
        if entropy_16x8_right < cost_single_right {
            ac_strategy.set(abs_bx + 1, abs_by, RAW_STRATEGY_DCT16X8);
            set_entropy_for_transform(
                &mut scratch.entropy_estimate,
                cx + 1, cy, 1, 2,
                entropy_16x8_right,
            );
        }
    } else {
        if entropy_8x16_top < cost_single_top {
            ac_strategy.set(abs_bx, abs_by, RAW_STRATEGY_DCT8X16);
            set_entropy_for_transform(
                &mut scratch.entropy_estimate,
                cx, cy, 2, 1,
                entropy_8x16_top,
            );
        }
        if entropy_8x16_bottom < cost_single_bottom {
            ac_strategy.set(abs_bx, abs_by + 1, RAW_STRATEGY_DCT8X16);
            set_entropy_for_transform(
                &mut scratch.entropy_estimate,
                cx, cy + 1, 2, 1,
                entropy_8x16_bottom,
            );
        }
    }
    true
}
/// Attempts to merge the 4x4 block group at (`bx0 + cx`, `by0 + cy`)
/// into a 32x16 / 16x32 / 32x32 transform, using costs cached in
/// `scratch.entropy_estimate`. Returns `true` iff a merge was applied.
///
/// Runtime-dispatch entry point; same token/forwarding pattern as
/// [`find_best_16x16_transform`].
#[allow(clippy::too_many_arguments)]
pub(super) fn try_merge_32x32(
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize, by0: usize, cx: usize, cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8, ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    profile: &EffortProfile,
) -> bool {
    #[cfg(target_arch = "x86_64")]
    {
        use jxl_simd::SimdToken;
        if let Some(token) = jxl_simd::X64V3Token::summon() {
            return try_merge_32x32_avx2(
                token, xyb, stride, bx0, by0, cx, cy, distance, quant_field,
                xsize_blocks, masking, ytox, ytob, mask1x1, mask1x1_stride,
                ac_strategy, scratch, profile,
            );
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        use jxl_simd::SimdToken;
        if let Some(token) = jxl_simd::NeonToken::summon() {
            return try_merge_32x32_neon(
                token, xyb, stride, bx0, by0, cx, cy, distance, quant_field,
                xsize_blocks, masking, ytox, ytob, mask1x1, mask1x1_stride,
                ac_strategy, scratch, profile,
            );
        }
    }
    #[cfg(target_arch = "wasm32")]
    {
        use jxl_simd::SimdToken;
        if let Some(token) = jxl_simd::Wasm128Token::summon() {
            return try_merge_32x32_wasm128(
                token, xyb, stride, bx0, by0, cx, cy, distance, quant_field,
                xsize_blocks, masking, ytox, ytob, mask1x1, mask1x1_stride,
                ac_strategy, scratch, profile,
            );
        }
    }
    // Scalar fallback.
    try_merge_32x32_impl(
        xyb, stride, bx0, by0, cx, cy, distance, quant_field, xsize_blocks,
        masking, ytox, ytob, mask1x1, mask1x1_stride, ac_strategy, scratch,
        profile,
    )
}
/// x86-64-v3 (AVX2) wrapper around [`try_merge_32x32_impl`].
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
#[allow(clippy::too_many_arguments)]
fn try_merge_32x32_avx2(
    _token: jxl_simd::X64V3Token,
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize, by0: usize, cx: usize, cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8, ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    profile: &EffortProfile,
) -> bool {
    try_merge_32x32_impl(
        xyb, stride, bx0, by0, cx, cy, distance, quant_field, xsize_blocks,
        masking, ytox, ytob, mask1x1, mask1x1_stride, ac_strategy, scratch,
        profile,
    )
}
/// NEON wrapper around [`try_merge_32x32_impl`] for aarch64.
#[cfg(target_arch = "aarch64")]
#[archmage::arcane]
#[allow(clippy::too_many_arguments)]
fn try_merge_32x32_neon(
    _token: jxl_simd::NeonToken,
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize, by0: usize, cx: usize, cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8, ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    profile: &EffortProfile,
) -> bool {
    try_merge_32x32_impl(
        xyb, stride, bx0, by0, cx, cy, distance, quant_field, xsize_blocks,
        masking, ytox, ytob, mask1x1, mask1x1_stride, ac_strategy, scratch,
        profile,
    )
}
/// Wasm SIMD128 wrapper around [`try_merge_32x32_impl`] for wasm32.
#[cfg(target_arch = "wasm32")]
#[archmage::arcane]
#[allow(clippy::too_many_arguments)]
fn try_merge_32x32_wasm128(
    _token: jxl_simd::Wasm128Token,
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize, by0: usize, cx: usize, cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8, ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    profile: &EffortProfile,
) -> bool {
    try_merge_32x32_impl(
        xyb, stride, bx0, by0, cx, cy, distance, quant_field, xsize_blocks,
        masking, ytox, ytob, mask1x1, mask1x1_stride, ac_strategy, scratch,
        profile,
    )
}
/// Scalar/SIMD-shared body of [`try_merge_32x32`].
///
/// Sums the cached per-block costs of the 4x4 group at
/// (`bx0 + cx`, `by0 + cy`) into 2x2 quadrants (so already-merged 16x16
/// transforms, whose cost sits in their top-left cache cell, sum
/// correctly), then compares 32x32 / 32x16 / 16x32 merges against them.
/// Returns `true` iff a merge was committed; the whole group is then
/// reset to DCT8 and the winners written over it, with the cache updated.
#[inline(always)]
#[allow(clippy::too_many_arguments)]
fn try_merge_32x32_impl(
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize, by0: usize, cx: usize, cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8, ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    profile: &EffortProfile,
) -> bool {
    // Pixel-domain mode is signalled by the presence of a 1x1 mask.
    let use_pixel_domain = mask1x1.is_some();
    let scaled_constants = if use_pixel_domain {
        compute_scaled_constants(
            distance,
            (
                profile.k_info_loss_mul_base,
                profile.k_zeros_mul_base,
                profile.k_cost_delta_base,
            ),
        )
    } else {
        COEFF_DOMAIN_CONSTANTS
    };
    // 32x32-class multipliers are hard-coded here rather than read from
    // `profile` (unlike the 8x8/16x16 multipliers elsewhere in this file).
    let (mul32x32, mul32x16) = if use_pixel_domain {
        (1.0_f32, 1.0_f32)
    } else {
        let k32x32mul1: f32 = -0.75;
        let k32x32mul2: f32 = 1.2;
        let k32x32base: f32 = 2.0;
        let m32 = k32x32mul2 + k32x32mul1 / (distance + k32x32base);
        let k32x16mul1: f32 = -0.70;
        let k32x16mul2: f32 = 1.1;
        let k32x16base: f32 = 2.0;
        let m16 = k32x16mul2 + k32x16mul1 / (distance + k32x16base);
        (m32, m16)
    };
    let abs_bx = bx0 + cx;
    let abs_by = by0 + cy;
    // Fold the 4x4 cached costs into 2x2 quadrant totals.
    let mut quadrant_cost = [[0.0f32; 2]; 2];
    for iy in 0..4 {
        for ix in 0..4 {
            quadrant_cost[iy / 2][ix / 2] += scratch.entropy_estimate[(cy + iy) * 8 + (cx + ix)];
        }
    }
    let sub_left = quadrant_cost[0][0] + quadrant_cost[1][0];
    let sub_right = quadrant_cost[0][1] + quadrant_cost[1][1];
    let sub_top = quadrant_cost[0][0] + quadrant_cost[0][1];
    let sub_bottom = quadrant_cost[1][0] + quadrant_cost[1][1];
    let cost_sub = sub_left + sub_right;
    // Merged-transform candidates: whole group plus each half.
    let entropy_32x32 = mul32x32
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT32X32, xyb, stride, abs_bx, abs_by, distance,
            quant_field, xsize_blocks, masking, ytox, ytob, mask1x1,
            mask1x1_stride, 0.0, scaled_constants, &profile.entropy_mul_table,
            scratch,
        );
    let entropy_32x16_0 = mul32x16
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT32X16, xyb, stride, abs_bx, abs_by, distance,
            quant_field, xsize_blocks, masking, ytox, ytob, mask1x1,
            mask1x1_stride, 0.0, scaled_constants, &profile.entropy_mul_table,
            scratch,
        );
    let entropy_32x16_1 = mul32x16
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT32X16, xyb, stride, abs_bx + 2, abs_by, distance,
            quant_field, xsize_blocks, masking, ytox, ytob, mask1x1,
            mask1x1_stride, 0.0, scaled_constants, &profile.entropy_mul_table,
            scratch,
        );
    let entropy_16x32_0 = mul32x16
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT16X32, xyb, stride, abs_bx, abs_by, distance,
            quant_field, xsize_blocks, masking, ytox, ytob, mask1x1,
            mask1x1_stride, 0.0, scaled_constants, &profile.entropy_mul_table,
            scratch,
        );
    let entropy_16x32_1 = mul32x16
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT16X32, xyb, stride, abs_bx, abs_by + 2, distance,
            quant_field, xsize_blocks, masking, ytox, ytob, mask1x1,
            mask1x1_stride, 0.0, scaled_constants, &profile.entropy_mul_table,
            scratch,
        );
    // Each half independently picks merged vs. its cached sub-transforms.
    let cost_jxn = entropy_32x16_0.min(sub_left) + entropy_32x16_1.min(sub_right);
    let cost_nxj = entropy_16x32_0.min(sub_top) + entropy_16x32_1.min(sub_bottom);
    let best_large = entropy_32x32.min(cost_jxn).min(cost_nxj);
    // Ties keep the existing strategies (no merge).
    if best_large >= cost_sub {
        return false;
    }
    // Reset the 4x4 group to DCT8, then write the winning merge over it.
    for dy in 0..4usize {
        for dx in 0..4usize {
            ac_strategy.set(abs_bx + dx, abs_by + dy, RAW_STRATEGY_DCT8);
        }
    }
    if entropy_32x32 < cost_jxn && entropy_32x32 < cost_nxj {
        ac_strategy.set(abs_bx, abs_by, RAW_STRATEGY_DCT32X32);
        set_entropy_for_transform(&mut scratch.entropy_estimate, cx, cy, 4, 4, entropy_32x32);
    } else if cost_jxn < cost_nxj {
        // Halves that keep their sub-transforms leave the cache untouched.
        if entropy_32x16_0 < sub_left {
            ac_strategy.set(abs_bx, abs_by, RAW_STRATEGY_DCT32X16);
            set_entropy_for_transform(&mut scratch.entropy_estimate, cx, cy, 2, 4, entropy_32x16_0);
        }
        if entropy_32x16_1 < sub_right {
            ac_strategy.set(abs_bx + 2, abs_by, RAW_STRATEGY_DCT32X16);
            set_entropy_for_transform(
                &mut scratch.entropy_estimate,
                cx + 2, cy, 2, 4,
                entropy_32x16_1,
            );
        }
    } else {
        if entropy_16x32_0 < sub_top {
            ac_strategy.set(abs_bx, abs_by, RAW_STRATEGY_DCT16X32);
            set_entropy_for_transform(&mut scratch.entropy_estimate, cx, cy, 4, 2, entropy_16x32_0);
        }
        if entropy_16x32_1 < sub_bottom {
            ac_strategy.set(abs_bx, abs_by + 2, RAW_STRATEGY_DCT16X32);
            set_entropy_for_transform(
                &mut scratch.entropy_estimate,
                cx, cy + 2, 4, 2,
                entropy_16x32_1,
            );
        }
    }
    true
}
/// Selects the best 32x32-scale transform for the 4x4 block group at
/// (`bx0 + cx`, `by0 + cy`); the returned bool is whatever the shared
/// `_impl` reports (its tail is outside this view — see
/// `find_best_32x32_transform_impl`).
///
/// Runtime-dispatch entry point; same token/forwarding pattern as
/// [`find_best_16x16_transform`].
#[allow(
    clippy::too_many_arguments,
    clippy::needless_range_loop,
    unreachable_code
)]
pub(super) fn find_best_32x32_transform(
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize, by0: usize, cx: usize, cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8, ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    cache_offset: Option<(usize, usize)>,
    profile: &EffortProfile,
) -> bool {
    #[cfg(target_arch = "x86_64")]
    {
        use jxl_simd::SimdToken;
        if let Some(token) = jxl_simd::X64V3Token::summon() {
            return find_best_32x32_transform_avx2(
                token, xyb, stride, bx0, by0, cx, cy, distance, quant_field,
                xsize_blocks, masking, ytox, ytob, mask1x1, mask1x1_stride,
                ac_strategy, scratch, cache_offset, profile,
            );
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        use jxl_simd::SimdToken;
        if let Some(token) = jxl_simd::NeonToken::summon() {
            return find_best_32x32_transform_neon(
                token, xyb, stride, bx0, by0, cx, cy, distance, quant_field,
                xsize_blocks, masking, ytox, ytob, mask1x1, mask1x1_stride,
                ac_strategy, scratch, cache_offset, profile,
            );
        }
    }
    #[cfg(target_arch = "wasm32")]
    {
        use jxl_simd::SimdToken;
        if let Some(token) = jxl_simd::Wasm128Token::summon() {
            return find_best_32x32_transform_wasm128(
                token, xyb, stride, bx0, by0, cx, cy, distance, quant_field,
                xsize_blocks, masking, ytox, ytob, mask1x1, mask1x1_stride,
                ac_strategy, scratch, cache_offset, profile,
            );
        }
    }
    // Scalar fallback.
    find_best_32x32_transform_impl(
        xyb, stride, bx0, by0, cx, cy, distance, quant_field, xsize_blocks,
        masking, ytox, ytob, mask1x1, mask1x1_stride, ac_strategy, scratch,
        cache_offset, profile,
    )
}
/// x86-64-v3 (AVX2) wrapper around [`find_best_32x32_transform_impl`].
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
#[allow(
    clippy::too_many_arguments,
    clippy::needless_range_loop,
    unreachable_code
)]
fn find_best_32x32_transform_avx2(
    _token: jxl_simd::X64V3Token,
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize, by0: usize, cx: usize, cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8, ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    cache_offset: Option<(usize, usize)>,
    profile: &EffortProfile,
) -> bool {
    find_best_32x32_transform_impl(
        xyb, stride, bx0, by0, cx, cy, distance, quant_field, xsize_blocks,
        masking, ytox, ytob, mask1x1, mask1x1_stride, ac_strategy, scratch,
        cache_offset, profile,
    )
}
/// NEON wrapper around [`find_best_32x32_transform_impl`] for aarch64.
#[cfg(target_arch = "aarch64")]
#[archmage::arcane]
#[allow(
    clippy::too_many_arguments,
    clippy::needless_range_loop,
    unreachable_code
)]
fn find_best_32x32_transform_neon(
    _token: jxl_simd::NeonToken,
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize, by0: usize, cx: usize, cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8, ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    cache_offset: Option<(usize, usize)>,
    profile: &EffortProfile,
) -> bool {
    find_best_32x32_transform_impl(
        xyb, stride, bx0, by0, cx, cy, distance, quant_field, xsize_blocks,
        masking, ytox, ytob, mask1x1, mask1x1_stride, ac_strategy, scratch,
        cache_offset, profile,
    )
}
#[cfg(target_arch = "wasm32")]
#[archmage::arcane]
#[allow(
    clippy::too_many_arguments,
    clippy::needless_range_loop,
    unreachable_code
)]
/// SIMD128 (wasm32) variant of the 32x32 transform search.
///
/// Thin wrapper around the shared `#[inline(always)]` impl;
/// `#[archmage::arcane]` presumably enables the token's target features for
/// this compilation — TODO confirm macro semantics. `_token` only proves
/// SIMD128 support was verified by the caller.
fn find_best_32x32_transform_wasm128(
    _token: jxl_simd::Wasm128Token,
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize,
    by0: usize,
    cx: usize,
    cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8,
    ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    cache_offset: Option<(usize, usize)>,
    profile: &EffortProfile,
) -> bool {
    find_best_32x32_transform_impl(
        xyb,
        stride,
        bx0,
        by0,
        cx,
        cy,
        distance,
        quant_field,
        xsize_blocks,
        masking,
        ytox,
        ytob,
        mask1x1,
        mask1x1_stride,
        ac_strategy,
        scratch,
        cache_offset,
        profile,
    )
}
#[inline(always)]
#[allow(
    clippy::too_many_arguments,
    clippy::needless_range_loop,
    unreachable_code
)]
/// Scalar core of the 32x32 transform search.
///
/// First runs the 16x16 search on each of the four quadrants of the 32x32
/// region, then decides whether replacing the chosen sub-transforms with a
/// single DCT32X32, a pair of DCT32X16 (left/right halves), or a pair of
/// DCT16X32 (top/bottom halves) lowers the estimated entropy cost.
///
/// * `(bx0 + cx, by0 + cy)` is the absolute top-left 8x8-block coordinate of
///   the region.
/// * `cache_offset`: when `Some((ox, oy))`, per-8x8-block entropies are
///   exchanged with the caller through `scratch.entropy_estimate`, an 8x8
///   grid indexed as `(oy + iy) * 8 + (ox + ix)`; the 16x16 pass fills it and
///   the merge decisions below rewrite the covered cells.
/// * Returns `true` if any merge (32x32, 32x16 or 16x32) was written to
///   `ac_strategy`.
fn find_best_32x32_transform_impl(
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize,
    by0: usize,
    cx: usize,
    cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8,
    ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    cache_offset: Option<(usize, usize)>,
    profile: &EffortProfile,
) -> bool {
    // The pixel-domain cost model is active iff a 1x1 mask is supplied.
    let use_pixel_domain = mask1x1.is_some();
    let scaled_constants = if use_pixel_domain {
        compute_scaled_constants(
            distance,
            (
                profile.k_info_loss_mul_base,
                profile.k_zeros_mul_base,
                profile.k_cost_delta_base,
            ),
        )
    } else {
        COEFF_DOMAIN_CONSTANTS
    };
    // Distance-dependent bias multipliers applied to the merged-transform
    // entropies in the coefficient-domain model; the pixel-domain model uses
    // unbiased (1.0) weights.
    let (mul32x32, mul32x16) = if use_pixel_domain {
        (1.0_f32, 1.0_f32)
    } else {
        let k32x32mul1: f32 = -0.75;
        let k32x32mul2: f32 = 1.2;
        let k32x32base: f32 = 2.0;
        let m32 = k32x32mul2 + k32x32mul1 / (distance + k32x32base);
        let k32x16mul1: f32 = -0.70;
        let k32x16mul2: f32 = 1.1;
        let k32x16base: f32 = 2.0;
        let m16 = k32x16mul2 + k32x16mul1 / (distance + k32x16base);
        (m32, m16)
    };
    let abs_bx = bx0 + cx;
    let abs_by = by0 + cy;
    // Candidate costs for the merged transforms. `_0` covers the left/top
    // half; `_1` the right half (x offset +2 blocks) resp. bottom half
    // (y offset +2 blocks).
    let entropy_32x32 = mul32x32
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT32X32,
            xyb,
            stride,
            abs_bx,
            abs_by,
            distance,
            quant_field,
            xsize_blocks,
            masking,
            ytox,
            ytob,
            mask1x1,
            mask1x1_stride,
            0.0,
            scaled_constants,
            &profile.entropy_mul_table,
            scratch,
        );
    let entropy_32x16_0 = mul32x16
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT32X16,
            xyb,
            stride,
            abs_bx,
            abs_by,
            distance,
            quant_field,
            xsize_blocks,
            masking,
            ytox,
            ytob,
            mask1x1,
            mask1x1_stride,
            0.0,
            scaled_constants,
            &profile.entropy_mul_table,
            scratch,
        );
    let entropy_32x16_1 = mul32x16
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT32X16,
            xyb,
            stride,
            abs_bx + 2,
            abs_by,
            distance,
            quant_field,
            xsize_blocks,
            masking,
            ytox,
            ytob,
            mask1x1,
            mask1x1_stride,
            0.0,
            scaled_constants,
            &profile.entropy_mul_table,
            scratch,
        );
    let entropy_16x32_0 = mul32x16
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT16X32,
            xyb,
            stride,
            abs_bx,
            abs_by,
            distance,
            quant_field,
            xsize_blocks,
            masking,
            ytox,
            ytob,
            mask1x1,
            mask1x1_stride,
            0.0,
            scaled_constants,
            &profile.entropy_mul_table,
            scratch,
        );
    let entropy_16x32_1 = mul32x16
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT16X32,
            xyb,
            stride,
            abs_bx,
            abs_by + 2,
            distance,
            quant_field,
            xsize_blocks,
            masking,
            ytox,
            ytob,
            mask1x1,
            mask1x1_stride,
            0.0,
            scaled_constants,
            &profile.entropy_mul_table,
            scratch,
        );
    // Recurse: choose the best <=16x16 transforms for each 16x16 quadrant
    // (quadrant origin at block offset (qx, qy); favor_single_mul fixed at
    // 1.0 here). When caching, each quadrant writes its per-block entropies
    // at its own offset into the shared 8x8 grid.
    for qy in (0..4).step_by(2) {
        for qx in (0..4).step_by(2) {
            let sub_cache = cache_offset.map(|(ox, oy)| (ox + qx, oy + qy));
            find_best_16x16_transform(
                xyb,
                stride,
                bx0,
                by0,
                cx + qx,
                cy + qy,
                distance,
                quant_field,
                xsize_blocks,
                masking,
                ytox,
                ytob,
                mask1x1,
                mask1x1_stride,
                ac_strategy,
                scratch,
                1.0, sub_cache,
                profile,
            );
        }
    }
    // Cost of keeping the chosen sub-transforms, accumulated per 16x16
    // quadrant: quadrant_cost[row][col].
    let mut quadrant_cost = [[0.0f32; 2]; 2];
    if let Some((ox, oy)) = cache_offset {
        // Cached path: the 16x16 pass above already stored per-block
        // entropies in the scratch grid.
        for iy in 0..4 {
            for ix in 0..4 {
                quadrant_cost[iy / 2][ix / 2] +=
                    scratch.entropy_estimate[(oy + iy) * 8 + (ox + ix)];
            }
        }
    } else {
        // Uncached path: re-estimate every chosen sub-transform's entropy.
        // Per-size bias multipliers depend only on `distance`, so they are
        // computed once before the loop.
        let (sub_mul8x8, sub_mul16x8, sub_mul16x16) = if use_pixel_domain {
            let mul8x8 = 1.0 + (-0.4) / (distance + 1.4);
            (mul8x8, 1.0_f32, 1.0_f32)
        } else {
            let k8x8mul1: f32 = -0.55 * 0.75;
            let k8x8mul2: f32 = 1.073_575_8 * 0.75;
            let k8x8base: f32 = 1.4;
            let m8 = k8x8mul2 + k8x8mul1 / (distance + k8x8base);
            let k8x16mul1: f32 = -0.55;
            let k8x16mul2: f32 = 0.901_958_8;
            let k8x16base: f32 = 1.6;
            let m16x8 = k8x16mul2 + k8x16mul1 / (distance + k8x16base);
            let k16x16mul1: f32 = -0.65;
            let k16x16mul2: f32 = 0.88;
            let k16x16base: f32 = 1.8;
            let m16x16 = k16x16mul2 + k16x16mul1 / (distance + k16x16base);
            (m8, m16x8, m16x16)
        };
        // Bias favoring DCT2X2/IDENTITY, quadratically fading to zero as
        // distance approaches 5.
        let favor_weight = if distance < 5.0 {
            ((5.0 - distance) / 5.0_f32).powi(2)
        } else {
            0.0
        };
        let favor_2x2_adjust = profile.k_favor_2x2 * favor_weight;
        // Adjustment applied to small "exotic" transforms above distance 4;
        // the factor decays from large values just above 4 down to 1 at
        // distance >= 12.
        let avoid_transforms_adjust = if distance > 4.0 {
            let mul = if distance < 12.0 {
                (12.0 - 4.0) / (distance - 4.0)
            } else {
                1.0
            };
            profile.k_avoid_transforms_base * mul
        } else {
            0.0
        };
        for iy in 0..4 {
            for ix in 0..4 {
                // Only the top-left block of a multi-block transform carries
                // the strategy; skip covered blocks.
                if !ac_strategy.is_first(abs_bx + ix, abs_by + iy) {
                    continue;
                }
                let sub_raw = ac_strategy.raw_strategy(abs_bx + ix, abs_by + iy);
                let mul = match sub_raw {
                    RAW_STRATEGY_DCT8 => sub_mul8x8,
                    RAW_STRATEGY_DCT16X8 | RAW_STRATEGY_DCT8X16 => sub_mul16x8,
                    RAW_STRATEGY_DCT16X16 => sub_mul16x16,
                    _ => sub_mul8x8,
                };
                // Fixed per-block cost for plain DCT8 in the coefficient-
                // domain model only.
                let base = if !use_pixel_domain && sub_raw == RAW_STRATEGY_DCT8 {
                    3.0 * sub_mul8x8
                } else {
                    0.0
                };
                let adjust = match sub_raw {
                    RAW_STRATEGY_DCT2X2 | RAW_STRATEGY_IDENTITY => favor_2x2_adjust,
                    RAW_STRATEGY_DCT4X8 | RAW_STRATEGY_DCT8X4 | RAW_STRATEGY_DCT4X4
                    | RAW_STRATEGY_AFV0 | RAW_STRATEGY_AFV1 | RAW_STRATEGY_AFV2
                    | RAW_STRATEGY_AFV3 => avoid_transforms_adjust,
                    _ => 0.0,
                };
                let e = estimate_entropy_with_mask(
                    sub_raw,
                    xyb,
                    stride,
                    abs_bx + ix,
                    abs_by + iy,
                    distance,
                    quant_field,
                    xsize_blocks,
                    masking,
                    ytox,
                    ytob,
                    mask1x1,
                    mask1x1_stride,
                    adjust,
                    scaled_constants,
                    &profile.entropy_mul_table,
                    scratch,
                );
                let cost = base + mul * e;
                quadrant_cost[iy / 2][ix / 2] += cost;
            }
        }
    }
    // Aggregate quadrant costs into half-region costs for the split
    // candidates.
    let sub_left = quadrant_cost[0][0] + quadrant_cost[1][0];
    let sub_right = quadrant_cost[0][1] + quadrant_cost[1][1];
    let sub_top = quadrant_cost[0][0] + quadrant_cost[0][1];
    let sub_bottom = quadrant_cost[1][0] + quadrant_cost[1][1];
    let cost_sub = sub_left + sub_right;
    // Each half may independently keep its sub-transforms, hence the min().
    let cost_jxn = entropy_32x16_0.min(sub_left) + entropy_32x16_1.min(sub_right);
    let cost_nxj = entropy_16x32_0.min(sub_top) + entropy_16x32_1.min(sub_bottom);
    debug_rect!(
        "acs/32x32",
        abs_bx * 8,
        abs_by * 8,
        32,
        32,
        "sub={:.0} 32x32={:.0} costJxN={:.0}(32x16: {:.0}+{:.0}) costNxJ={:.0}(16x32: {:.0}+{:.0}) | mul32={:.2} mul16={:.2}",
        cost_sub,
        entropy_32x32,
        cost_jxn,
        entropy_32x16_0,
        entropy_32x16_1,
        cost_nxj,
        entropy_16x32_0,
        entropy_16x32_1,
        mul32x32,
        mul32x16
    );
    // Apply the cheapest option; when caching, record the merged transform's
    // entropy over the covered cells so the 64x64 caller sees updated costs.
    if entropy_32x32 < cost_jxn && entropy_32x32 < cost_nxj {
        ac_strategy.set(abs_bx, abs_by, RAW_STRATEGY_DCT32X32);
        if let Some((ox, oy)) = cache_offset {
            set_entropy_for_transform(&mut scratch.entropy_estimate, ox, oy, 4, 4, entropy_32x32);
        }
        true
    } else if cost_jxn < cost_nxj {
        // Horizontal split: each 32x16 half merges only if it beats its own
        // sub-transform cost.
        let mut any_merged = false;
        if entropy_32x16_0 < sub_left {
            ac_strategy.set(abs_bx, abs_by, RAW_STRATEGY_DCT32X16);
            if let Some((ox, oy)) = cache_offset {
                set_entropy_for_transform(
                    &mut scratch.entropy_estimate,
                    ox,
                    oy,
                    2,
                    4,
                    entropy_32x16_0,
                );
            }
            any_merged = true;
        }
        if entropy_32x16_1 < sub_right {
            ac_strategy.set(abs_bx + 2, abs_by, RAW_STRATEGY_DCT32X16);
            if let Some((ox, oy)) = cache_offset {
                set_entropy_for_transform(
                    &mut scratch.entropy_estimate,
                    ox + 2,
                    oy,
                    2,
                    4,
                    entropy_32x16_1,
                );
            }
            any_merged = true;
        }
        any_merged
    } else {
        // Vertical split: each 16x32 half merges independently as well.
        let mut any_merged = false;
        if entropy_16x32_0 < sub_top {
            ac_strategy.set(abs_bx, abs_by, RAW_STRATEGY_DCT16X32);
            if let Some((ox, oy)) = cache_offset {
                set_entropy_for_transform(
                    &mut scratch.entropy_estimate,
                    ox,
                    oy,
                    4,
                    2,
                    entropy_16x32_0,
                );
            }
            any_merged = true;
        }
        if entropy_16x32_1 < sub_bottom {
            ac_strategy.set(abs_bx, abs_by + 2, RAW_STRATEGY_DCT16X32);
            if let Some((ox, oy)) = cache_offset {
                set_entropy_for_transform(
                    &mut scratch.entropy_estimate,
                    ox,
                    oy + 2,
                    4,
                    2,
                    entropy_16x32_1,
                );
            }
            any_merged = true;
        }
        any_merged
    }
}
#[allow(clippy::too_many_arguments, clippy::needless_range_loop)]
/// Runtime-dispatched entry point for the 64x64 transform search.
///
/// Probes the best available SIMD instruction set for the current target
/// (AVX2/x86-64-v3, NEON/aarch64, SIMD128/wasm32) via `SimdToken::summon`
/// and forwards to the matching monomorphized wrapper; falls back to the
/// plain scalar impl when no token is available. All variants call the same
/// `find_best_64x64_transform_impl` body, so they are presumably intended to
/// be feature-specialized compilations of identical logic — TODO confirm
/// `archmage::arcane` semantics.
pub(super) fn find_best_64x64_transform(
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize,
    by0: usize,
    cx: usize,
    cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8,
    ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    profile: &EffortProfile,
) {
    #[cfg(target_arch = "x86_64")]
    {
        use jxl_simd::SimdToken;
        if let Some(token) = jxl_simd::X64V3Token::summon() {
            find_best_64x64_transform_avx2(
                token,
                xyb,
                stride,
                bx0,
                by0,
                cx,
                cy,
                distance,
                quant_field,
                xsize_blocks,
                masking,
                ytox,
                ytob,
                mask1x1,
                mask1x1_stride,
                ac_strategy,
                scratch,
                profile,
            );
            return;
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        use jxl_simd::SimdToken;
        if let Some(token) = jxl_simd::NeonToken::summon() {
            find_best_64x64_transform_neon(
                token,
                xyb,
                stride,
                bx0,
                by0,
                cx,
                cy,
                distance,
                quant_field,
                xsize_blocks,
                masking,
                ytox,
                ytob,
                mask1x1,
                mask1x1_stride,
                ac_strategy,
                scratch,
                profile,
            );
            return;
        }
    }
    #[cfg(target_arch = "wasm32")]
    {
        use jxl_simd::SimdToken;
        if let Some(token) = jxl_simd::Wasm128Token::summon() {
            find_best_64x64_transform_wasm128(
                token,
                xyb,
                stride,
                bx0,
                by0,
                cx,
                cy,
                distance,
                quant_field,
                xsize_blocks,
                masking,
                ytox,
                ytob,
                mask1x1,
                mask1x1_stride,
                ac_strategy,
                scratch,
                profile,
            );
            return;
        }
    }
    // Portable scalar fallback (also taken on unlisted architectures).
    find_best_64x64_transform_impl(
        xyb,
        stride,
        bx0,
        by0,
        cx,
        cy,
        distance,
        quant_field,
        xsize_blocks,
        masking,
        ytox,
        ytob,
        mask1x1,
        mask1x1_stride,
        ac_strategy,
        scratch,
        profile,
    );
}
#[cfg(target_arch = "x86_64")]
#[archmage::arcane]
#[allow(clippy::too_many_arguments, clippy::needless_range_loop)]
/// AVX2 (x86-64 v3) variant of the 64x64 transform search.
///
/// Thin wrapper around the shared `#[inline(always)]` impl;
/// `#[archmage::arcane]` presumably enables the token's target features for
/// this compilation — TODO confirm macro semantics. `_token` only proves CPU
/// support was verified by the caller.
fn find_best_64x64_transform_avx2(
    _token: jxl_simd::X64V3Token,
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize,
    by0: usize,
    cx: usize,
    cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8,
    ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    profile: &EffortProfile,
) {
    find_best_64x64_transform_impl(
        xyb,
        stride,
        bx0,
        by0,
        cx,
        cy,
        distance,
        quant_field,
        xsize_blocks,
        masking,
        ytox,
        ytob,
        mask1x1,
        mask1x1_stride,
        ac_strategy,
        scratch,
        profile,
    );
}
#[cfg(target_arch = "aarch64")]
#[archmage::arcane]
#[allow(clippy::too_many_arguments, clippy::needless_range_loop)]
/// NEON (aarch64) variant of the 64x64 transform search.
///
/// Thin wrapper around the shared `#[inline(always)]` impl;
/// `#[archmage::arcane]` presumably enables the token's target features for
/// this compilation — TODO confirm macro semantics. `_token` only proves CPU
/// support was verified by the caller.
fn find_best_64x64_transform_neon(
    _token: jxl_simd::NeonToken,
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize,
    by0: usize,
    cx: usize,
    cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8,
    ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    profile: &EffortProfile,
) {
    find_best_64x64_transform_impl(
        xyb,
        stride,
        bx0,
        by0,
        cx,
        cy,
        distance,
        quant_field,
        xsize_blocks,
        masking,
        ytox,
        ytob,
        mask1x1,
        mask1x1_stride,
        ac_strategy,
        scratch,
        profile,
    );
}
#[cfg(target_arch = "wasm32")]
#[archmage::arcane]
#[allow(clippy::too_many_arguments, clippy::needless_range_loop)]
/// SIMD128 (wasm32) variant of the 64x64 transform search.
///
/// Thin wrapper around the shared `#[inline(always)]` impl;
/// `#[archmage::arcane]` presumably enables the token's target features for
/// this compilation — TODO confirm macro semantics. `_token` only proves
/// SIMD128 support was verified by the caller.
fn find_best_64x64_transform_wasm128(
    _token: jxl_simd::Wasm128Token,
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize,
    by0: usize,
    cx: usize,
    cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8,
    ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    profile: &EffortProfile,
) {
    find_best_64x64_transform_impl(
        xyb,
        stride,
        bx0,
        by0,
        cx,
        cy,
        distance,
        quant_field,
        xsize_blocks,
        masking,
        ytox,
        ytob,
        mask1x1,
        mask1x1_stride,
        ac_strategy,
        scratch,
        profile,
    );
}
#[inline(always)]
#[allow(clippy::too_many_arguments, clippy::needless_range_loop)]
/// Scalar core of the 64x64 transform search.
///
/// Runs the 32x32 search on each of the four quadrants of the 64x64 region,
/// then decides whether replacing the chosen sub-transforms with a single
/// DCT64X64, a pair of DCT64X32 (left/right halves), or a pair of DCT32X64
/// (top/bottom halves) lowers the estimated entropy cost.
///
/// * `(bx0 + cx, by0 + cy)` is the absolute top-left 8x8-block coordinate of
///   the region.
/// * `scratch.entropy_estimate` is reset here; in the pixel-domain model
///   (i.e. when `mask1x1` is `Some`) it serves as an 8x8 per-block entropy
///   cache filled by the nested 32x32/16x16 passes.
///
/// Unlike the 32x32 level, this is the top of the recursion, so merge
/// decisions are written to `ac_strategy` without updating the cache and
/// nothing is returned.
fn find_best_64x64_transform_impl(
    xyb: [&[f32]; 3],
    stride: usize,
    bx0: usize,
    by0: usize,
    cx: usize,
    cy: usize,
    distance: f32,
    quant_field: &[f32],
    xsize_blocks: usize,
    masking: &[f32],
    ytox: i8,
    ytob: i8,
    mask1x1: Option<&[f32]>,
    mask1x1_stride: usize,
    ac_strategy: &mut AcStrategyMap,
    scratch: &mut EntropyEstScratch,
    profile: &EffortProfile,
) {
    // The pixel-domain cost model is active iff a 1x1 mask is supplied.
    let use_pixel_domain = mask1x1.is_some();
    let scaled_constants = if use_pixel_domain {
        compute_scaled_constants(
            distance,
            (
                profile.k_info_loss_mul_base,
                profile.k_zeros_mul_base,
                profile.k_cost_delta_base,
            ),
        )
    } else {
        COEFF_DOMAIN_CONSTANTS
    };
    // Distance-dependent bias multipliers applied to the merged-transform
    // entropies in the coefficient-domain model; the pixel-domain model uses
    // unbiased (1.0) weights.
    let (mul64x64, mul64x32) = if use_pixel_domain {
        (1.0_f32, 1.0_f32)
    } else {
        let k64x64mul1: f32 = -0.80;
        let k64x64mul2: f32 = 1.3;
        let k64x64base: f32 = 2.5;
        let m64 = k64x64mul2 + k64x64mul1 / (distance + k64x64base);
        let k64x32mul1: f32 = -0.75;
        let k64x32mul2: f32 = 1.2;
        let k64x32base: f32 = 2.5;
        let m32 = k64x32mul2 + k64x32mul1 / (distance + k64x32base);
        (m64, m32)
    };
    let abs_bx = bx0 + cx;
    let abs_by = by0 + cy;
    // Fresh cache: this 64x64 region owns the entire 8x8 entry grid.
    scratch.entropy_estimate = [0.0; 64];
    // Candidate costs for the merged transforms. `_0` covers the left/top
    // half; `_1` the right half (x offset +4 blocks) resp. bottom half
    // (y offset +4 blocks).
    let entropy_64x64 = mul64x64
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT64X64,
            xyb,
            stride,
            abs_bx,
            abs_by,
            distance,
            quant_field,
            xsize_blocks,
            masking,
            ytox,
            ytob,
            mask1x1,
            mask1x1_stride,
            0.0,
            scaled_constants,
            &profile.entropy_mul_table,
            scratch,
        );
    let entropy_64x32_0 = mul64x32
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT64X32,
            xyb,
            stride,
            abs_bx,
            abs_by,
            distance,
            quant_field,
            xsize_blocks,
            masking,
            ytox,
            ytob,
            mask1x1,
            mask1x1_stride,
            0.0,
            scaled_constants,
            &profile.entropy_mul_table,
            scratch,
        );
    let entropy_64x32_1 = mul64x32
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT64X32,
            xyb,
            stride,
            abs_bx + 4,
            abs_by,
            distance,
            quant_field,
            xsize_blocks,
            masking,
            ytox,
            ytob,
            mask1x1,
            mask1x1_stride,
            0.0,
            scaled_constants,
            &profile.entropy_mul_table,
            scratch,
        );
    let entropy_32x64_0 = mul64x32
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT32X64,
            xyb,
            stride,
            abs_bx,
            abs_by,
            distance,
            quant_field,
            xsize_blocks,
            masking,
            ytox,
            ytob,
            mask1x1,
            mask1x1_stride,
            0.0,
            scaled_constants,
            &profile.entropy_mul_table,
            scratch,
        );
    let entropy_32x64_1 = mul64x32
        * estimate_entropy_with_mask(
            RAW_STRATEGY_DCT32X64,
            xyb,
            stride,
            abs_bx,
            abs_by + 4,
            distance,
            quant_field,
            xsize_blocks,
            masking,
            ytox,
            ytob,
            mask1x1,
            mask1x1_stride,
            0.0,
            scaled_constants,
            &profile.entropy_mul_table,
            scratch,
        );
    // Per-block entropy caching is only used with the pixel-domain model.
    let use_cache = use_pixel_domain;
    // Recurse: choose the best <=32x32 transforms for each 32x32 quadrant
    // (quadrant origin at block offset (qx, qy)).
    for qy in (0..8).step_by(4) {
        for qx in (0..8).step_by(4) {
            let sub_cache = if use_cache { Some((qx, qy)) } else { None };
            find_best_32x32_transform(
                xyb,
                stride,
                bx0,
                by0,
                cx + qx,
                cy + qy,
                distance,
                quant_field,
                xsize_blocks,
                masking,
                ytox,
                ytob,
                mask1x1,
                mask1x1_stride,
                ac_strategy,
                scratch,
                sub_cache,
                profile,
            );
        }
    }
    // Cost of keeping the chosen sub-transforms, accumulated per 32x32
    // quadrant: quadrant_cost[row][col].
    let mut quadrant_cost = [[0.0f32; 2]; 2];
    if use_cache {
        // Cached path: the nested passes already stored per-block entropies.
        for iy in 0..8 {
            for ix in 0..8 {
                quadrant_cost[iy / 4][ix / 4] += scratch.entropy_estimate[iy * 8 + ix];
            }
        }
    } else {
        // Uncached (coefficient-domain) path: re-estimate every chosen
        // sub-transform's entropy.
        // Bias favoring DCT2X2/IDENTITY, quadratically fading to zero as
        // distance approaches 5.
        let favor_weight = if distance < 5.0 {
            ((5.0 - distance) / 5.0_f32).powi(2)
        } else {
            0.0
        };
        let favor_2x2_adjust = profile.k_favor_2x2 * favor_weight;
        // Adjustment applied to small "exotic" transforms above distance 4;
        // the factor decays from large values just above 4 down to 1 at
        // distance >= 12.
        let avoid_transforms_adjust = if distance > 4.0 {
            let mul = if distance < 12.0 {
                (12.0 - 4.0) / (distance - 4.0)
            } else {
                1.0
            };
            profile.k_avoid_transforms_base * mul
        } else {
            0.0
        };
        // Per-size bias multipliers depend only on `distance`; compute them
        // once here instead of recomputing them inside the 8x8 loop (they
        // were previously evaluated on every iteration). Values match the
        // constants used by the 16x16/32x32 levels.
        let k8x8mul1: f32 = -0.55 * 0.75;
        let k8x8mul2: f32 = 1.073_575_8 * 0.75;
        let k8x8base: f32 = 1.4;
        let mul8x8 = k8x8mul2 + k8x8mul1 / (distance + k8x8base);
        let k8x16mul1: f32 = -0.55;
        let k8x16mul2: f32 = 0.901_958_8;
        let k8x16base: f32 = 1.6;
        let mul16x8 = k8x16mul2 + k8x16mul1 / (distance + k8x16base);
        let k16x16mul1: f32 = -0.65;
        let k16x16mul2: f32 = 0.88;
        let k16x16base: f32 = 1.8;
        let mul16x16 = k16x16mul2 + k16x16mul1 / (distance + k16x16base);
        let k32x32mul1: f32 = -0.75;
        let k32x32mul2: f32 = 1.2;
        let k32x32base: f32 = 2.0;
        let mul32x32 = k32x32mul2 + k32x32mul1 / (distance + k32x32base);
        let k32x16mul1: f32 = -0.70;
        let k32x16mul2: f32 = 1.1;
        let k32x16base: f32 = 2.0;
        let mul32x16 = k32x16mul2 + k32x16mul1 / (distance + k32x16base);
        for iy in 0..8 {
            for ix in 0..8 {
                // Only the top-left block of a multi-block transform carries
                // the strategy; skip covered blocks.
                if !ac_strategy.is_first(abs_bx + ix, abs_by + iy) {
                    continue;
                }
                let sub_raw = ac_strategy.raw_strategy(abs_bx + ix, abs_by + iy);
                let mul = match sub_raw {
                    RAW_STRATEGY_DCT8 => mul8x8,
                    RAW_STRATEGY_DCT16X8 | RAW_STRATEGY_DCT8X16 => mul16x8,
                    RAW_STRATEGY_DCT16X16 => mul16x16,
                    RAW_STRATEGY_DCT32X32 => mul32x32,
                    RAW_STRATEGY_DCT32X16 | RAW_STRATEGY_DCT16X32 => mul32x16,
                    _ => mul8x8,
                };
                // Fixed per-block cost for plain DCT8. No `!use_pixel_domain`
                // guard is needed (cf. the 32x32 level): this branch only
                // runs when `use_cache == use_pixel_domain` is false.
                let base = if sub_raw == RAW_STRATEGY_DCT8 {
                    3.0 * mul8x8
                } else {
                    0.0
                };
                let adjust = match sub_raw {
                    RAW_STRATEGY_DCT2X2 | RAW_STRATEGY_IDENTITY => favor_2x2_adjust,
                    RAW_STRATEGY_DCT4X8 | RAW_STRATEGY_DCT8X4 | RAW_STRATEGY_DCT4X4
                    | RAW_STRATEGY_AFV0 | RAW_STRATEGY_AFV1 | RAW_STRATEGY_AFV2
                    | RAW_STRATEGY_AFV3 => avoid_transforms_adjust,
                    _ => 0.0,
                };
                let e = estimate_entropy_with_mask(
                    sub_raw,
                    xyb,
                    stride,
                    abs_bx + ix,
                    abs_by + iy,
                    distance,
                    quant_field,
                    xsize_blocks,
                    masking,
                    ytox,
                    ytob,
                    mask1x1,
                    mask1x1_stride,
                    adjust,
                    scaled_constants,
                    &profile.entropy_mul_table,
                    scratch,
                );
                let cost = base + mul * e;
                quadrant_cost[iy / 4][ix / 4] += cost;
            }
        }
    }
    // Aggregate quadrant costs into half-region costs for the split
    // candidates.
    let sub_left = quadrant_cost[0][0] + quadrant_cost[1][0];
    let sub_right = quadrant_cost[0][1] + quadrant_cost[1][1];
    let sub_top = quadrant_cost[0][0] + quadrant_cost[0][1];
    let sub_bottom = quadrant_cost[1][0] + quadrant_cost[1][1];
    let cost_sub = sub_left + sub_right;
    // Each half may independently keep its sub-transforms, hence the min().
    let cost_jxn = entropy_64x32_0.min(sub_left) + entropy_64x32_1.min(sub_right);
    let cost_nxj = entropy_32x64_0.min(sub_top) + entropy_32x64_1.min(sub_bottom);
    debug_rect!(
        "acs/64x64",
        abs_bx * 8,
        abs_by * 8,
        64,
        64,
        "sub={:.0} 64x64={:.0} costJxN={:.0}(64x32: {:.0}+{:.0}) costNxJ={:.0}(32x64: {:.0}+{:.0}) | mul64={:.2} mul32={:.2}",
        cost_sub,
        entropy_64x64,
        cost_jxn,
        entropy_64x32_0,
        entropy_64x32_1,
        cost_nxj,
        entropy_32x64_0,
        entropy_32x64_1,
        mul64x64,
        mul64x32
    );
    // Apply the cheapest option; each half of a split merges only when it
    // beats its own sub-transform cost.
    if entropy_64x64 < cost_jxn && entropy_64x64 < cost_nxj {
        ac_strategy.set(abs_bx, abs_by, RAW_STRATEGY_DCT64X64);
    } else if cost_jxn < cost_nxj {
        if entropy_64x32_0 < sub_left {
            ac_strategy.set(abs_bx, abs_by, RAW_STRATEGY_DCT64X32);
        }
        if entropy_64x32_1 < sub_right {
            ac_strategy.set(abs_bx + 4, abs_by, RAW_STRATEGY_DCT64X32);
        }
    } else {
        if entropy_32x64_0 < sub_top {
            ac_strategy.set(abs_bx, abs_by, RAW_STRATEGY_DCT32X64);
        }
        if entropy_32x64_1 < sub_bottom {
            ac_strategy.set(abs_bx, abs_by + 4, RAW_STRATEGY_DCT32X64);
        }
    }
}