use super::feature::RawAnalysis;
use super::row_stream::RowStream;
use archmage::{incant, magetypes};
pub(crate) const DEFAULT_HF_MAX_BLOCKS: usize = 1024;
pub fn populate_tier3(out: &mut RawAnalysis, stream: &mut RowStream<'_>, hf_max_blocks: usize) {
let h_stats = luma_histogram_stats(stream);
out.luma_histogram_entropy = h_stats.entropy;
#[cfg(feature = "composites")]
{
out.line_art_score = h_stats.line_art_score;
}
let _ = h_stats;
let dct = dct_stats(stream, hf_max_blocks);
out.high_freq_energy_ratio = dct.high_freq_ratio;
#[cfg(feature = "experimental")]
{
out.dct_compressibility_y = dct.compressibility_y;
out.dct_compressibility_uv = dct.compressibility_uv;
out.patch_fraction = dct.patch_fraction;
out.aq_map_mean = dct.aq_map_mean;
out.aq_map_std = dct.aq_map_std;
out.noise_floor_y = dct.noise_floor_y;
out.noise_floor_uv = dct.noise_floor_uv;
out.gradient_fraction = dct.gradient_fraction;
}
#[cfg(not(feature = "experimental"))]
{
let _ = (
dct.compressibility_y,
dct.compressibility_uv,
dct.patch_fraction,
dct.aq_map_mean,
dct.aq_map_std,
dct.noise_floor_y,
dct.noise_floor_uv,
dct.gradient_fraction,
);
}
}
struct Tier3DctStats {
high_freq_ratio: f32,
compressibility_y: f32,
compressibility_uv: f32,
patch_fraction: f32,
aq_map_mean: f32,
aq_map_std: f32,
noise_floor_y: f32,
noise_floor_uv: f32,
gradient_fraction: f32,
}
#[inline(always)]
fn block_alpha(coeffs: &[[f32; 8]; 8], bin_div: f32) -> u32 {
let mut histo = [0u32; 64];
let inv = 1.0 / bin_div;
for k in 1..64 {
let u = k % 8;
let v = k / 8;
let mag = (coeffs[v][u].abs() * inv) as i32;
let bin = mag.clamp(0, 63) as usize;
histo[bin] += 1;
}
let mut max_count: u32 = 0;
let mut last_non_zero: u32 = 1;
for (k, &c) in histo.iter().enumerate() {
if c > 0 {
if c > max_count {
max_count = c;
}
last_non_zero = k as u32;
}
}
if max_count > 1 {
(256 * last_non_zero) / max_count
} else {
0
}
}
const BIN_DIV_LUMA: f32 = 16.0;
const BIN_DIV_CHROMA: f32 = 8.0;
#[inline(always)]
fn block_signature(coeffs: &[[f32; 8]; 8]) -> u32 {
let mut peak: f32 = 0.0;
for k in 1..64 {
if RASTER_TO_ZIGZAG[k] >= 17 {
continue;
}
let u = k % 8;
let v = k / 8;
let m = coeffs[v][u].abs();
if m > peak {
peak = m;
}
}
let threshold = peak * 0.25;
let mut sig: u32 = 0;
let mut bit_pos: u32 = 0;
for k in 1..64 {
let zz = RASTER_TO_ZIGZAG[k];
if zz >= 17 {
continue;
}
let u = k % 8;
let v = k / 8;
let c = coeffs[v][u];
if c.abs() > threshold {
sig |= 1 << bit_pos;
}
bit_pos += 1;
if c < 0.0 {
sig |= 1 << bit_pos;
}
bit_pos += 1;
}
sig
}
struct LumaHistStats {
entropy: f32,
line_art_score: f32,
}
fn luma_histogram_stats(stream: &mut RowStream<'_>) -> LumaHistStats {
let width = stream.width() as usize;
let height = stream.height() as usize;
if width == 0 || height == 0 {
return LumaHistStats {
entropy: 0.0,
line_art_score: 0.0,
};
}
let w = crate::luma::LumaWeights::for_primaries(stream.primaries());
let qr = w.qr as u32;
let qg = w.qg as u32;
let qb = w.qb as u32;
let mut bins = [0u32; 32];
let mut n = 0u32;
let mut carry: usize = 0;
for yy in 0..height {
let row = stream.borrow_row(yy as u32);
let start = (4 - carry) % 4;
let mut x = start;
while x < width {
let off = x * 3;
let p = &row[off..off + 3];
let y = ((qr * p[0] as u32 + qg * p[1] as u32 + qb * p[2] as u32 + 128) >> 8) as u8;
bins[(y >> 3) as usize] += 1;
n += 1;
x += 4;
}
carry = (carry + width) % 4;
}
if n == 0 {
return LumaHistStats {
entropy: 0.0,
line_art_score: 0.0,
};
}
let n_f = n as f32;
let mut entropy = 0.0f32;
for &c in &bins {
if c > 0 {
let p = c as f32 / n_f;
entropy -= p * p.log2();
}
}
let mut total_sum: f64 = 0.0;
for (i, &c) in bins.iter().enumerate() {
total_sum += (i as f64) * (c as f64);
}
let mu_total = total_sum / n_f as f64;
let mut total_var: f64 = 0.0;
for (i, &c) in bins.iter().enumerate() {
let d = i as f64 - mu_total;
total_var += (c as f64) * d * d;
}
total_var /= n_f as f64;
let mut max_between_var: f64 = 0.0;
let mut sum0: f64 = 0.0;
let mut count0: f64 = 0.0;
for k in 0..31 {
sum0 += (k as f64) * (bins[k] as f64);
count0 += bins[k] as f64;
let count1 = n_f as f64 - count0;
if count0 < 1.0 || count1 < 1.0 {
continue;
}
let mu0 = sum0 / count0;
let mu1 = (total_sum - sum0) / count1;
let omega0 = count0 / n_f as f64;
let omega1 = count1 / n_f as f64;
let between_var = omega0 * omega1 * (mu0 - mu1) * (mu0 - mu1);
if between_var > max_between_var {
max_between_var = between_var;
}
}
let bimodality = if total_var > 1e-6 {
(max_between_var / total_var).clamp(0.0, 1.0) as f32
} else {
0.0
};
let mut top1 = 0u32;
let mut top2 = 0u32;
for &c in &bins {
if c > top1 {
top2 = top1;
top1 = c;
} else if c > top2 {
top2 = c;
}
}
let top2_coverage = (top1 + top2) as f32 / n_f;
let low_entropy = ((2.5 - entropy) / 2.5).clamp(0.0, 1.0);
let line_art_score = bimodality.min(top2_coverage).min(low_entropy);
LumaHistStats {
entropy,
line_art_score,
}
}
const RASTER_TO_ZIGZAG: [u8; 64] = [
0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42, 3, 8, 12, 17, 25, 30, 41, 43, 9, 11,
18, 24, 31, 40, 44, 53, 10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60, 21, 34,
37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63,
];
const DCT_COEF: [[f32; 8]; 8] = [
[
0.353_553_4,
0.353_553_4,
0.353_553_4,
0.353_553_4,
0.353_553_4,
0.353_553_4,
0.353_553_4,
0.353_553_4,
],
[
0.490_392_64,
0.415_734_8,
0.277_785_12,
0.097_545_16,
-0.097_545_16,
-0.277_785_12,
-0.415_734_8,
-0.490_392_64,
],
[
0.461_939_77,
0.191_341_72,
-0.191_341_72,
-0.461_939_77,
-0.461_939_77,
-0.191_341_72,
0.191_341_72,
0.461_939_77,
],
[
0.415_734_8,
-0.097_545_16,
-0.490_392_64,
-0.277_785_12,
0.277_785_12,
0.490_392_64,
0.097_545_16,
-0.415_734_8,
],
[
0.353_553_4,
-0.353_553_4,
-0.353_553_4,
0.353_553_4,
0.353_553_4,
-0.353_553_4,
-0.353_553_4,
0.353_553_4,
],
[
0.277_785_12,
-0.490_392_64,
0.097_545_16,
0.415_734_8,
-0.415_734_8,
-0.097_545_16,
0.490_392_64,
-0.277_785_12,
],
[
0.191_341_72,
-0.461_939_77,
0.461_939_77,
-0.191_341_72,
-0.191_341_72,
0.461_939_77,
-0.461_939_77,
0.191_341_72,
],
[
0.097_545_16,
-0.277_785_12,
0.415_734_8,
-0.490_392_64,
0.490_392_64,
-0.415_734_8,
0.277_785_12,
-0.097_545_16,
],
];
const DCT_COEF_T: [[f32; 8]; 8] = {
let mut t = [[0.0f32; 8]; 8];
let mut n = 0;
while n < 8 {
let mut k = 0;
while k < 8 {
t[n][k] = DCT_COEF[k][n];
k += 1;
}
n += 1;
}
t
};
fn dct_stats(stream: &mut RowStream<'_>, max_blocks: usize) -> Tier3DctStats {
let width = stream.width() as usize;
let height = stream.height() as usize;
if width < 8 || height < 8 {
return Tier3DctStats {
high_freq_ratio: 0.0,
compressibility_y: 0.0,
compressibility_uv: 0.0,
patch_fraction: 0.0,
aq_map_mean: 0.0,
aq_map_std: 0.0,
noise_floor_y: 0.0,
noise_floor_uv: 0.0,
gradient_fraction: 0.0,
};
}
let lw = crate::luma::LumaWeights::for_primaries(stream.primaries());
let qr = lw.qr;
let qg = lw.qg;
let qb = lw.qb;
let blocks_x = width / 8;
let blocks_y = height / 8;
let total_blocks = blocks_x * blocks_y;
if total_blocks == 0 {
return Tier3DctStats {
high_freq_ratio: 0.0,
compressibility_y: 0.0,
compressibility_uv: 0.0,
patch_fraction: 0.0,
aq_map_mean: 0.0,
aq_map_std: 0.0,
noise_floor_y: 0.0,
noise_floor_uv: 0.0,
gradient_fraction: 0.0,
};
}
let stride = (total_blocks / max_blocks).max(1);
let mut low_energy = 0.0f64;
let mut high_energy = 0.0f64;
let mut blocks_sampled: u32 = 0;
let mut alpha_y_sum: u64 = 0;
let mut alpha_uv_sum: u64 = 0;
let mut block_acs: Vec<f32> = Vec::with_capacity(max_blocks.min(4096));
let mut block_low_y: Vec<f32> = Vec::with_capacity(max_blocks.min(4096));
let mut block_low_cb: Vec<f32> = Vec::with_capacity(max_blocks.min(4096));
let mut block_low_cr: Vec<f32> = Vec::with_capacity(max_blocks.min(4096));
let mut gradient_blocks: u32 = 0;
let mut signatures: Vec<u32> = Vec::with_capacity(max_blocks.min(2048));
let row_bytes = width * 3;
let mut block_buf = vec![0u8; 8 * row_bytes]; let mut block_idx = 0usize;
for by in 0..blocks_y {
let row_start = by * blocks_x;
let row_end = row_start + blocks_x;
let any_sampled = (row_start..row_end).any(|k| k % stride == 0);
if !any_sampled {
block_idx += blocks_x;
continue;
}
for i in 0..8 {
stream.fetch_into(
(by * 8 + i) as u32,
&mut block_buf[i * row_bytes..(i + 1) * row_bytes],
);
}
for bx in 0..blocks_x {
if block_idx % stride != 0 {
block_idx += 1;
continue;
}
block_idx += 1;
let mut blk_y = [[0.0f32; 8]; 8];
let mut blk_cb = [[0.0f32; 8]; 8];
let mut blk_cr = [[0.0f32; 8]; 8];
for y in 0..8 {
let row = &block_buf[y * row_bytes..(y + 1) * row_bytes];
for x in 0..8 {
let off = (bx * 8 + x) * 3;
let p = &row[off..off + 3];
let r = p[0] as i32;
let g = p[1] as i32;
let b = p[2] as i32;
let l_i = (qr * r + qg * g + qb * b + 128) >> 8;
let cb_i = ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
let cr_i = ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
blk_y[y][x] = l_i as f32 - 128.0;
blk_cb[y][x] = cb_i as f32 - 128.0;
blk_cr[y][x] = cr_i as f32 - 128.0;
}
}
let mut coeffs_y = [[0.0f32; 8]; 8];
let mut coeffs_cb = [[0.0f32; 8]; 8];
let mut coeffs_cr = [[0.0f32; 8]; 8];
dct2d_8_three_planes(
&blk_y,
&blk_cb,
&blk_cr,
&mut coeffs_y,
&mut coeffs_cb,
&mut coeffs_cr,
);
blocks_sampled += 1;
let mut block_ac: f64 = 0.0;
let mut block_low_y_ac: f64 = 0.0;
for k in 1..64 {
let u = k % 8;
let v = k / 8;
let e = (coeffs_y[v][u] * coeffs_y[v][u]) as f64;
block_ac += e;
if RASTER_TO_ZIGZAG[k] < 16 {
low_energy += e;
block_low_y_ac += e;
} else {
high_energy += e;
}
}
block_acs.push(block_ac as f32);
block_low_y.push(block_low_y_ac as f32);
if block_ac > 16.0 && block_low_y_ac >= 0.9 * block_ac {
gradient_blocks += 1;
}
let mut low_cb: f64 = 0.0;
let mut low_cr: f64 = 0.0;
for k in 1..64 {
if RASTER_TO_ZIGZAG[k] < 16 {
let u = k % 8;
let v = k / 8;
low_cb += (coeffs_cb[v][u] * coeffs_cb[v][u]) as f64;
low_cr += (coeffs_cr[v][u] * coeffs_cr[v][u]) as f64;
}
}
block_low_cb.push(low_cb as f32);
block_low_cr.push(low_cr as f32);
alpha_y_sum += block_alpha(&coeffs_y, BIN_DIV_LUMA) as u64;
let a_cb = block_alpha(&coeffs_cb, BIN_DIV_CHROMA);
let a_cr = block_alpha(&coeffs_cr, BIN_DIV_CHROMA);
alpha_uv_sum += a_cb.max(a_cr) as u64;
signatures.push(block_signature(&coeffs_y));
}
}
let high_freq_ratio = if low_energy < 1e-6 {
0.0
} else {
(high_energy / low_energy) as f32
};
let compressibility_y = if blocks_sampled > 0 {
(alpha_y_sum as f64 / blocks_sampled as f64) as f32
} else {
0.0
};
let compressibility_uv = if blocks_sampled > 0 {
(alpha_uv_sum as f64 / blocks_sampled as f64) as f32
} else {
0.0
};
let patch_fraction = if blocks_sampled > 1 {
signatures.sort_unstable();
let mut matched: u32 = 0;
let mut i = 0;
while i < signatures.len() {
let mut j = i + 1;
while j < signatures.len() && signatures[j] == signatures[i] {
j += 1;
}
let run = (j - i) as u32;
if run > 1 {
matched += run;
}
i = j;
}
matched as f32 / blocks_sampled as f32
} else {
0.0
};
let (aq_log_sum, aq_log_sq_sum) =
log10_sum_and_sq_sum_dispatch(&block_acs);
let (aq_map_mean, aq_map_std) = if blocks_sampled > 0 {
let n = blocks_sampled as f64;
let mean = aq_log_sum / n;
let var = (aq_log_sq_sum / n - mean * mean).max(0.0);
(mean as f32, var.sqrt() as f32)
} else {
(0.0, 0.0)
};
let p10 = |arr: &mut [f32]| -> f32 {
if arr.is_empty() {
return 0.0;
}
arr.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap_or(core::cmp::Ordering::Equal));
let idx = (arr.len() / 10).min(arr.len() - 1);
arr[idx]
};
let noise_floor_y = ((p10(&mut block_low_y) / 15.0).sqrt() / 32.0).clamp(0.0, 1.0);
let p10_cb = ((p10(&mut block_low_cb) / 15.0).sqrt() / 32.0).clamp(0.0, 1.0);
let p10_cr = ((p10(&mut block_low_cr) / 15.0).sqrt() / 32.0).clamp(0.0, 1.0);
let noise_floor_uv = p10_cb.max(p10_cr);
let gradient_fraction = if blocks_sampled > 0 {
gradient_blocks as f32 / blocks_sampled as f32
} else {
0.0
};
Tier3DctStats {
high_freq_ratio,
compressibility_y,
compressibility_uv,
patch_fraction,
aq_map_mean,
aq_map_std,
noise_floor_y,
noise_floor_uv,
gradient_fraction,
}
}
#[inline]
fn dct2d_8_three_planes(
blk_y: &[[f32; 8]; 8],
blk_cb: &[[f32; 8]; 8],
blk_cr: &[[f32; 8]; 8],
coeffs_y: &mut [[f32; 8]; 8],
coeffs_cb: &mut [[f32; 8]; 8],
coeffs_cr: &mut [[f32; 8]; 8],
) {
incant!(dct2d_8_three_planes_simd(
blk_y, blk_cb, blk_cr, coeffs_y, coeffs_cb, coeffs_cr
));
}
#[magetypes(define(f32x8), v4, v3, neon, wasm128, scalar)]
fn dct2d_8_three_planes_simd(
token: Token,
blk_y: &[[f32; 8]; 8],
blk_cb: &[[f32; 8]; 8],
blk_cr: &[[f32; 8]; 8],
coeffs_y: &mut [[f32; 8]; 8],
coeffs_cb: &mut [[f32; 8]; 8],
coeffs_cr: &mut [[f32; 8]; 8],
) {
let d_col = [
f32x8::load(token, &DCT_COEF_T[0]),
f32x8::load(token, &DCT_COEF_T[1]),
f32x8::load(token, &DCT_COEF_T[2]),
f32x8::load(token, &DCT_COEF_T[3]),
f32x8::load(token, &DCT_COEF_T[4]),
f32x8::load(token, &DCT_COEF_T[5]),
f32x8::load(token, &DCT_COEF_T[6]),
f32x8::load(token, &DCT_COEF_T[7]),
];
macro_rules! dct2d_one_plane {
($blk:ident, $out:ident) => {{
let mut y_row: [f32x8; 8] = [
f32x8::zero(token),
f32x8::zero(token),
f32x8::zero(token),
f32x8::zero(token),
f32x8::zero(token),
f32x8::zero(token),
f32x8::zero(token),
f32x8::zero(token),
];
let mut v = 0;
while v < 8 {
let r = &$blk[v];
let mut acc = d_col[0] * f32x8::splat(token, r[0]);
acc = d_col[1].mul_add(f32x8::splat(token, r[1]), acc);
acc = d_col[2].mul_add(f32x8::splat(token, r[2]), acc);
acc = d_col[3].mul_add(f32x8::splat(token, r[3]), acc);
acc = d_col[4].mul_add(f32x8::splat(token, r[4]), acc);
acc = d_col[5].mul_add(f32x8::splat(token, r[5]), acc);
acc = d_col[6].mul_add(f32x8::splat(token, r[6]), acc);
acc = d_col[7].mul_add(f32x8::splat(token, r[7]), acc);
y_row[v] = acc;
v += 1;
}
let mut v = 0;
while v < 8 {
let d_row = &DCT_COEF[v];
let mut acc = y_row[0] * f32x8::splat(token, d_row[0]);
acc = y_row[1].mul_add(f32x8::splat(token, d_row[1]), acc);
acc = y_row[2].mul_add(f32x8::splat(token, d_row[2]), acc);
acc = y_row[3].mul_add(f32x8::splat(token, d_row[3]), acc);
acc = y_row[4].mul_add(f32x8::splat(token, d_row[4]), acc);
acc = y_row[5].mul_add(f32x8::splat(token, d_row[5]), acc);
acc = y_row[6].mul_add(f32x8::splat(token, d_row[6]), acc);
acc = y_row[7].mul_add(f32x8::splat(token, d_row[7]), acc);
acc.store(&mut $out[v]);
v += 1;
}
}};
}
dct2d_one_plane!(blk_y, coeffs_y);
dct2d_one_plane!(blk_cb, coeffs_cb);
dct2d_one_plane!(blk_cr, coeffs_cr);
}
#[cfg(feature = "composites")]
pub fn compute_derived_likelihoods<const T3: bool, const PAL: bool>(out: &mut RawAnalysis) {
let chroma_sh = out.cb_sharpness + out.cr_sharpness;
let chroma_lo = (0.005 - chroma_sh).clamp(0.0, 0.005) / 0.005;
let edge_hi = (out.edge_density / 0.25).min(1.0);
let flat_high = (out.flat_color_block_ratio / 0.5).min(1.0);
if T3 {
let entropy_low = (4.0 - out.luma_histogram_entropy).clamp(0.0, 4.0) / 4.0;
out.text_likelihood = (entropy_low * 0.4 + edge_hi * 0.3 + chroma_lo * 0.3).clamp(0.0, 1.0);
}
if PAL {
let palette_small = if out.distinct_color_bins == 0 {
0.0
} else {
(1.0 - (out.distinct_color_bins as f32 / 4000.0).min(1.0)).clamp(0.0, 1.0)
};
out.screen_content_likelihood =
(flat_high * 0.6 + palette_small * 0.3 + chroma_lo * 0.1).clamp(0.0, 1.0);
}
if T3 && PAL {
let entropy_hi = (out.luma_histogram_entropy - 3.5).clamp(0.0, 1.5) / 1.5;
let palette_large = if out.distinct_color_bins < 2000 {
0.0
} else {
((out.distinct_color_bins as f32 - 2000.0) / 8000.0).clamp(0.0, 1.0)
};
let chroma_moderate = (chroma_sh / 0.012).min(1.0);
let not_flat = (1.0 - (out.flat_color_block_ratio / 0.3).min(1.0)).clamp(0.0, 1.0);
out.natural_likelihood =
(entropy_hi * 0.3 + palette_large * 0.25 + chroma_moderate * 0.2 + not_flat * 0.25)
.clamp(0.0, 1.0);
}
}
#[cfg(not(feature = "composites"))]
pub fn compute_derived_likelihoods<const T3: bool, const PAL: bool>(_out: &mut RawAnalysis) {}
fn log10_sum_and_sq_sum_dispatch(block_acs: &[f32]) -> (f64, f64) {
incant!(log10_sum_and_sq_sum_simd(block_acs))
}
#[magetypes(define(f32x8), v4, v3, neon, wasm128, scalar)]
fn log10_sum_and_sq_sum_simd(token: Token, block_acs: &[f32]) -> (f64, f64) {
let one_v = f32x8::splat(token, 1.0);
let mut sum_v = f32x8::zero(token);
let mut sq_sum_v = f32x8::zero(token);
let mut sum_f64: f64 = 0.0;
let mut sq_sum_f64: f64 = 0.0;
const FLUSH: usize = 32;
let mut iters_since_flush = 0usize;
let chunks = block_acs.chunks_exact(8);
let remainder = chunks.remainder();
for chunk in chunks {
let arr: &[f32; 8] = chunk.try_into().unwrap();
let ac_v = f32x8::load(token, arr);
let log_v = (ac_v + one_v).log10_lowp();
sum_v += log_v;
sq_sum_v = log_v.mul_add(log_v, sq_sum_v);
iters_since_flush += 1;
if iters_since_flush >= FLUSH {
sum_f64 += sum_v.reduce_add() as f64;
sq_sum_f64 += sq_sum_v.reduce_add() as f64;
sum_v = f32x8::zero(token);
sq_sum_v = f32x8::zero(token);
iters_since_flush = 0;
}
}
sum_f64 += sum_v.reduce_add() as f64;
sq_sum_f64 += sq_sum_v.reduce_add() as f64;
for &ac in remainder {
let l = (ac + 1.0).log10();
sum_f64 += l as f64;
sq_sum_f64 += (l as f64) * (l as f64);
}
(sum_f64, sq_sum_f64)
}