Skip to main content

fast_ssim2/
strip.rs

1//! Strip-wise SSIMULACRA2 computation for bounded peak memory at very
2//! large image sizes.
3//!
4//! The full SSIMULACRA2 pipeline allocates roughly 24 image-sized `f32`
5//! planes plus a downscale pyramid; at 40 MP this is ~7 GiB of working
6//! memory. The strip walker bounds that to `O(strip_height * width)` by
7//! processing the image in horizontal strips, accumulating per-strip
8//! contributions to each scale's SSIM and edge-difference reductions, and
9//! summing those contributions across strips before the final score
10//! aggregation.
11//!
12//! ## Algorithm
13//!
14//! The two non-local operations in SSIMULACRA2 are:
15//!
16//! 1. The recursive (IIR) Gaussian blur (`Blur`), which has effectively
17//!    finite support thanks to its exponential impulse decay (sigma=1.5,
18//!    so a halo of 24 rows reduces the boundary effect to ~e^-16).
19//! 2. The 2×2 downsampling between scales, which has a strict halo of
20//!    one row on either side.
21//!
22//! Everything else — XYB conversion, `make_positive_xyb`, planar
23//! multiply, the SSIM/edge-diff reductions — is per-pixel and trivially
24//! stripable.
25//!
26//! The strip walker therefore processes each strip with a configurable
27//! halo of extra rows above and below; the per-pixel reductions inside
28//! the halo are discarded, and only the rows inside the "interior" of
29//! each strip contribute to the accumulated sums.
30//!
31//! ## Parity
32//!
33//! With the default halo ([`HALO_ROWS_DEFAULT`] = 96 rows), the strip
34//! score differs from the full-image score by less than 0.01 on the
35//! 0..100 SSIMULACRA2 scale across the test corpus at and above
36//! 256x256. The exponential decay of the IIR's impulse response gives
37//! effective bit-identity at the f32 precision used by the inner SSIM
38//! map computation for scales 0..3, and a small residual contribution
39//! (~`e^{-6}` ≈ `1e-3` per pixel) from scale 4 where the per-strip
40//! image is small and the effective halo is correspondingly thinner.
41//! Callers can override the halo via
42//! [`Ssimulacra2StripConfig::with_halo_rows`] for stricter parity at
43//! the cost of slightly more per-strip work.
44//!
45//! ## Example
46//!
47//! ```
48//! use fast_ssim2::compute_ssimulacra2_strip;
49//! use yuvxyb::{Rgb, TransferCharacteristic, ColorPrimaries};
50//! use std::num::NonZeroUsize;
51//!
52//! let data: Vec<[f32; 3]> = vec![[0.5, 0.5, 0.5]; 256 * 256];
53//! let w = NonZeroUsize::new(256).unwrap();
54//! let h = NonZeroUsize::new(256).unwrap();
55//! let source = Rgb::new(data.clone(), w, h,
56//!     TransferCharacteristic::SRGB, ColorPrimaries::BT709).unwrap();
57//! let distorted = Rgb::new(data, w, h,
58//!     TransferCharacteristic::SRGB, ColorPrimaries::BT709).unwrap();
59//!
60//! // Process in strips of 64 rows each.
61//! let score = compute_ssimulacra2_strip(source, distorted, 64).unwrap();
62//! assert!((score - 100.0).abs() < 1e-3);
63//! ```
64//!
65//! ## Cached-reference strip API
66//!
67//! When comparing many distorted images against the same reference,
68//! pair [`Ssimulacra2Reference::new`] with
69//! [`Ssimulacra2Reference::compare_strip`] for the warm-ref + strip
70//! benefit:
71//!
72//! ```ignore
73//! let reference = Ssimulacra2Reference::new(source)?;
74//! for distorted in distortions {
75//!     let score = reference.compare_strip(distorted, 64)?;
76//! }
77//! ```
78//!
79//! Note that `compare_strip` still holds the full precomputed reference
80//! in memory; the strip discipline only bounds dist-side peak memory.
81//! For full strip mode on both sides, use [`compute_ssimulacra2_strip`]
82//! directly.
83
84use std::num::NonZeroUsize;
85
86use yuvxyb::LinearRgb;
87
88use crate::blur::Blur;
89use crate::input::ToLinearRgb;
90use crate::precompute::Ssimulacra2Reference;
91use crate::weights::{EDGE_HAS_WEIGHT, NUM_SCALES, SSIM_HAS_WEIGHT};
92use crate::{
93    LinearRgbImage, Msssim, MsssimScale, Ssimulacra2Config, Ssimulacra2Error, downscale_by_2,
94    image_multiply, linear_rgb_to_xyb_simd, make_positive_xyb, xyb_to_planar_into,
95};
96
97/// Default number of halo rows above and below each strip.
98///
99/// SSIMULACRA2 runs the IIR Gaussian at every pyramid scale. The
100/// scale-0 image has the most rows; scale-4's image is 16× smaller.
101/// The IIR impulse decays as roughly `e^{-2/3 · n}` per row at
102/// sigma=1.5, so the *effective* halo at scale `s` is
103/// `HALO_ROWS_DEFAULT >> s`. To keep at least 6 rows of warmup at
104/// scale 4 (the deepest scale on a 40 MP image), we set the scale-0
105/// halo to 96 rows. This adds modest per-strip overhead (a 256-row
106/// strip becomes a 448-row working strip — 75 % more work per strip,
107/// still bounded by `O(strip_h)` rather than `O(full_h)`) in exchange
108/// for atomic-tolerance parity against the full-image score across
109/// all scales.
110pub const HALO_ROWS_DEFAULT: usize = 96;
111
112/// Minimum supported strip height (in scale-0 rows).
113///
114/// SSIMULACRA2's minimum scale-0 input is 8x8. To allow downsampling to
115/// produce meaningful per-scale data, strips at scale 0 must be at
116/// least 8 rows tall (so scale 5 has at least 1 row).
117pub const MIN_STRIP_HEIGHT: usize = 8;
118
119/// Configuration for strip-wise SSIMULACRA2 computation.
120#[derive(Debug, Clone, Copy)]
121pub struct Ssimulacra2StripConfig {
122    /// Number of rows above and below each strip's "interior" that
123    /// are processed but excluded from the per-pixel reductions.
124    pub halo_rows: usize,
125    /// Underlying SIMD configuration for the per-strip ops.
126    pub inner: Ssimulacra2Config,
127}
128
129impl Default for Ssimulacra2StripConfig {
130    fn default() -> Self {
131        Self {
132            halo_rows: HALO_ROWS_DEFAULT,
133            inner: Ssimulacra2Config::default(),
134        }
135    }
136}
137
138impl Ssimulacra2StripConfig {
139    /// Create a strip config with the given halo size (rows).
140    #[must_use]
141    pub fn with_halo_rows(halo_rows: usize) -> Self {
142        Self {
143            halo_rows,
144            inner: Ssimulacra2Config::default(),
145        }
146    }
147
148    /// Set the underlying SIMD configuration.
149    #[must_use]
150    pub fn with_inner(mut self, inner: Ssimulacra2Config) -> Self {
151        self.inner = inner;
152        self
153    }
154}
155
156/// Computes the SSIMULACRA2 score with strip-bounded peak memory.
157///
158/// `strip_height` is the number of rows in each strip's "interior" at
159/// scale 0; the actual working strip is `strip_height + 2*halo_rows`
160/// rows tall, where `halo_rows` defaults to [`HALO_ROWS_DEFAULT`].
161///
162/// At 40 MP (e.g., 7700x5200) with `strip_height=256`, peak working
163/// memory is bounded by ~24 × 7700 × (256+48) × 4 B ≈ 220 MiB, an
164/// order of magnitude below the ~7 GiB of the full-image path.
165///
166/// # Errors
167/// - If `strip_height < 8`.
168/// - If the input images are smaller than 8x8 (the strip API targets
169///   very large images and does not reflect-pad — use
170///   [`crate::compute_ssimulacra2`] for tiny inputs), exceed
171///   [`crate::MAX_IMAGE_PIXELS`], or have mismatched dimensions.
172pub fn compute_ssimulacra2_strip<S, D>(
173    source: S,
174    distorted: D,
175    strip_height: u32,
176) -> Result<f64, Ssimulacra2Error>
177where
178    S: ToLinearRgb,
179    D: ToLinearRgb,
180{
181    compute_ssimulacra2_strip_with_config(
182        source,
183        distorted,
184        strip_height,
185        Ssimulacra2StripConfig::default(),
186    )
187}
188
189/// Strip-wise SSIMULACRA2 with explicit configuration (halo, SIMD impl).
190///
191/// See [`compute_ssimulacra2_strip`] for the default-config path and
192/// [`Ssimulacra2StripConfig`] for the available knobs.
193///
194/// # Errors
195/// As [`compute_ssimulacra2_strip`].
196pub fn compute_ssimulacra2_strip_with_config<S, D>(
197    source: S,
198    distorted: D,
199    strip_height: u32,
200    config: Ssimulacra2StripConfig,
201) -> Result<f64, Ssimulacra2Error>
202where
203    S: ToLinearRgb,
204    D: ToLinearRgb,
205{
206    let img1: LinearRgbImage = source.into_linear_rgb();
207    let img2: LinearRgbImage = distorted.into_linear_rgb();
208    let lin1: LinearRgb = img1.into();
209    let lin2: LinearRgb = img2.into();
210    compute_strip_impl(lin1, lin2, strip_height as usize, config)
211}
212
213fn validate_strip_dims(
214    width: usize,
215    height: usize,
216    strip_height: usize,
217) -> Result<(), Ssimulacra2Error> {
218    if width < 8 || height < 8 {
219        return Err(Ssimulacra2Error::InvalidImageSize);
220    }
221    if strip_height < MIN_STRIP_HEIGHT {
222        return Err(Ssimulacra2Error::InvalidImageSize);
223    }
224    let pixels = width
225        .checked_mul(height)
226        .ok_or(Ssimulacra2Error::ImageTooLarge { actual: usize::MAX })?;
227    if pixels > crate::MAX_IMAGE_PIXELS {
228        return Err(Ssimulacra2Error::ImageTooLarge { actual: pixels });
229    }
230    Ok(())
231}
232
233/// Per-channel raw SSIM and edge-diff sums accumulated across strips,
234/// indexed by `[scale][channel]`.
235///
236/// These are the un-divided per-pixel sums; the final `Msssim::score()`
237/// transform expects the SSIM/edge sums divided by the **scale-s** pixel
238/// count, so [`Accumulator::finalise`] performs that division using the
239/// total image pixel counts per scale.
240#[derive(Debug, Default, Clone)]
241struct ScaleSums {
242    /// `[c*2 + n]` = sum of `d.powi(n*3 + 1)` over all interior pixels at
243    /// scale s. The full algorithm stores `avg_ssim[c*2 + 0] = sum_d /
244    /// pixels`, `avg_ssim[c*2 + 1] = (sum_d4 / pixels).sqrt().sqrt()`.
245    /// Here we accumulate `sum_d` and `sum_d4` per channel.
246    ssim_sums: [f64; 3 * 2],
247    /// `[c*4 + n]` = sum of edge-diff `d` or `d4` per artifact/detail.
248    /// Mirrors `edge_diff_map`'s `sum1` layout: `[artifact_d,
249    /// artifact_d4, detail_d, detail_d4]` per channel.
250    edge_sums: [f64; 3 * 4],
251    /// Total interior pixels accumulated at this scale (== sum of strip
252    /// interior heights, in scale-s coordinates, times scale-s width).
253    pixels: u64,
254    /// True once at least one strip contributed sums at this scale.
255    initialised: bool,
256}
257
258struct StripAccumulator {
259    per_scale: Vec<ScaleSums>,
260    target_total_pixels: Vec<u64>,
261}
262
263impl StripAccumulator {
264    fn new(width: usize, height: usize) -> Self {
265        let mut per_scale = Vec::with_capacity(NUM_SCALES);
266        let mut target_total_pixels = Vec::with_capacity(NUM_SCALES);
267        // Mirror the main-loop pre-downscale gate exactly: the loop
268        // checks `w < 8 || h < 8` BEFORE downsampling, then downsamples
269        // only when `scale > 0`. So a 64x64 input produces 5 scales:
270        // 64, 32, 16, 8, 4 — the last scale skips the gate because
271        // the gate fires on `w=4` at scale 5, not scale 4.
272        let mut w = width;
273        let mut h = height;
274        for scale in 0..NUM_SCALES {
275            if w < 8 || h < 8 {
276                break;
277            }
278            if scale > 0 {
279                w = w.div_ceil(2);
280                h = h.div_ceil(2);
281            }
282            per_scale.push(ScaleSums::default());
283            target_total_pixels.push((w * h) as u64);
284        }
285        Self {
286            per_scale,
287            target_total_pixels,
288        }
289    }
290
291    fn add_strip_sums(&mut self, scale: usize, ssim: &[f64; 6], edge: &[f64; 12], pixels: u64) {
292        if scale >= self.per_scale.len() {
293            return;
294        }
295        let s = &mut self.per_scale[scale];
296        for (dst, &src) in s.ssim_sums.iter_mut().zip(ssim.iter()) {
297            *dst += src;
298        }
299        for (dst, &src) in s.edge_sums.iter_mut().zip(edge.iter()) {
300            *dst += src;
301        }
302        s.pixels += pixels;
303        s.initialised = true;
304    }
305
306    fn finalise(self) -> Result<f64, Ssimulacra2Error> {
307        // Sanity-check that the accumulated pixel counts equal the
308        // expected per-scale totals; otherwise we have a strip walker
309        // bug and the score is meaningless.
310        for (scale, (s, &expected)) in self
311            .per_scale
312            .iter()
313            .zip(self.target_total_pixels.iter())
314            .enumerate()
315        {
316            if !s.initialised {
317                continue;
318            }
319            debug_assert_eq!(
320                s.pixels, expected,
321                "strip accumulator scale {} pixel count {} != expected {}",
322                scale, s.pixels, expected,
323            );
324        }
325
326        let mut msssim = Msssim::default();
327        for (scale, s) in self.per_scale.iter().enumerate() {
328            if !s.initialised {
329                break;
330            }
331            let denom = self.target_total_pixels[scale] as f64;
332            if denom == 0.0 {
333                break;
334            }
335            let inv = 1.0 / denom;
336            let mut avg_ssim = [0.0_f64; 6];
337            for c in 0..3 {
338                avg_ssim[c * 2] = inv * s.ssim_sums[c * 2];
339                avg_ssim[c * 2 + 1] = (inv * s.ssim_sums[c * 2 + 1]).sqrt().sqrt();
340            }
341            let mut avg_edgediff = [0.0_f64; 12];
342            for c in 0..3 {
343                avg_edgediff[c * 4] = inv * s.edge_sums[c * 4];
344                avg_edgediff[c * 4 + 1] = (inv * s.edge_sums[c * 4 + 1]).sqrt().sqrt();
345                avg_edgediff[c * 4 + 2] = inv * s.edge_sums[c * 4 + 2];
346                avg_edgediff[c * 4 + 3] = (inv * s.edge_sums[c * 4 + 3]).sqrt().sqrt();
347            }
348            msssim.scales.push(MsssimScale {
349                avg_ssim,
350                avg_edgediff,
351            });
352        }
353        Ok(msssim.score())
354    }
355}
356
357/// Construct a `LinearRgb` covering one input strip (rows
358/// `[row_start, row_end)`) of `src`.
359fn linear_rgb_strip(src: &LinearRgb, row_start: usize, row_end: usize) -> LinearRgb {
360    let width = src.width().get();
361    let height = src.height().get();
362    debug_assert!(row_start <= row_end);
363    debug_assert!(row_end <= height);
364    let strip_rows = row_end - row_start;
365    let start = row_start * width;
366    let end = row_end * width;
367    let data: Vec<[f32; 3]> = src.data()[start..end].to_vec();
368    LinearRgb::new(
369        data,
370        NonZeroUsize::new(width).expect("width must be non-zero"),
371        NonZeroUsize::new(strip_rows).expect("strip rows non-zero"),
372    )
373    .expect("strip dimensions are valid")
374}
375
376/// Run the multi-scale SSIM2 pipeline on a single strip's pair of
377/// images, returning raw per-scale, per-channel sums for the rows
378/// belonging to the "interior" of the strip.
379///
380/// `interior_start` and `interior_end` are scale-0 row indices into
381/// the original image; the strip's data covers rows
382/// `[strip_y0, strip_y0 + strip_image_height)` where `strip_y0` is the
383/// row offset passed in. Interior rows are accumulated; halo rows are
384/// processed but discarded.
385fn process_strip(
386    img1_strip: LinearRgb,
387    img2_strip: LinearRgb,
388    strip_y0: usize,       // scale-0 row index of the first row in img*_strip
389    interior_start: usize, // scale-0 row index of first interior row (inclusive)
390    interior_end: usize,   // scale-0 row index of last interior row (exclusive)
391    acc: &mut StripAccumulator,
392    config: Ssimulacra2Config,
393) {
394    let impl_type = config.impl_type;
395    let mut img1 = img1_strip;
396    let mut img2 = img2_strip;
397
398    let mut width = img1.width().get();
399    let mut height = img1.height().get();
400    let total_scales = acc.per_scale.len();
401
402    // Per-strip reusable allocations sized for the current strip
403    // height (much smaller than the full image). `Vec::truncate`
404    // reuses capacity at higher scales.
405    let alloc_plane = |w: usize, h: usize| vec![0.0f32; w * h];
406    let alloc_3planes =
407        |w: usize, h: usize| [alloc_plane(w, h), alloc_plane(w, h), alloc_plane(w, h)];
408
409    let mut mul = alloc_3planes(width, height);
410    let mut sigma1_sq = alloc_3planes(width, height);
411    let mut sigma2_sq = alloc_3planes(width, height);
412    let mut sigma12 = alloc_3planes(width, height);
413    let mut mu1 = alloc_3planes(width, height);
414    let mut mu2 = alloc_3planes(width, height);
415    let mut img1_planar = alloc_3planes(width, height);
416    let mut img2_planar = alloc_3planes(width, height);
417    let mut blur = Blur::with_simd_impl(width, height, impl_type);
418
419    // Scale-0 strip-local interior bounds; updated per scale via
420    // halve-with-snap-down semantics matching the downscale.
421    let mut scale0_interior_start_in_strip = interior_start - strip_y0;
422    let mut scale0_interior_end_in_strip = interior_end - strip_y0;
423    // Halve interior bounds per scale; track the cumulative scale-0
424    // row counts that have been accumulated so the final pixel-count
425    // accounting matches the full image.
426
427    let _ = (strip_y0, interior_start, interior_end); // accounted for via interior_start/end_in_strip
428
429    for scale in 0..total_scales {
430        if width < 8 || height < 8 {
431            break;
432        }
433
434        if scale > 0 {
435            img1 = downscale_by_2(&img1);
436            img2 = downscale_by_2(&img2);
437            width = img1.width().get();
438            height = img2.height().get();
439
440            // Resync interior bounds to the new (halved) coordinate
441            // system. The downsample uses `(out_w, out_h) =
442            // (w.div_ceil(2), h.div_ceil(2))` and indexes
443            // `in[y * 2 + iy]` — equivalent to a per-output-row map of
444            // `out_y = in_y / 2`. The strip's first row maps to
445            // `strip_y0_scaled.div_ceil(2)` IFF the previous strip's
446            // last row was on an even boundary; for that we rely on
447            // strip boundaries always being chosen to be even-aligned
448            // at every scale (the strip walker enforces this).
449            scale0_interior_start_in_strip = scale0_interior_start_in_strip.div_ceil(2);
450            scale0_interior_end_in_strip = scale0_interior_end_in_strip.div_ceil(2);
451        }
452
453        // Bound the interior to the actual scaled strip height, in
454        // case rounding has pushed it past the strip's last row.
455        scale0_interior_start_in_strip = scale0_interior_start_in_strip.min(height);
456        scale0_interior_end_in_strip = scale0_interior_end_in_strip.min(height);
457        if scale0_interior_start_in_strip >= scale0_interior_end_in_strip {
458            // No interior rows at this scale; halo-only contributions are
459            // discarded.
460            continue;
461        }
462
463        // Resize per-scale buffers; cheap (no allocation when truncating).
464        let size = width * height;
465        for buf in [
466            &mut mul,
467            &mut sigma1_sq,
468            &mut sigma2_sq,
469            &mut sigma12,
470            &mut mu1,
471            &mut mu2,
472            &mut img1_planar,
473            &mut img2_planar,
474        ] {
475            for c in buf.iter_mut() {
476                c.resize(size, 0.0);
477                c.truncate(size);
478            }
479        }
480        blur.shrink_to(width, height);
481
482        // XYB conversion + positive shift (per pixel — strip safe).
483        let mut img1_xyb = linear_rgb_to_xyb_simd(img1.clone());
484        let mut img2_xyb = linear_rgb_to_xyb_simd(img2.clone());
485        make_positive_xyb(&mut img1_xyb);
486        make_positive_xyb(&mut img2_xyb);
487        xyb_to_planar_into(&img1_xyb, &mut img1_planar);
488        xyb_to_planar_into(&img2_xyb, &mut img2_planar);
489
490        // Variance / cross-term tensors (per pixel multiply).
491        image_multiply(&img1_planar, &img1_planar, &mut mul, impl_type);
492        blur.blur_into(&mul, &mut sigma1_sq);
493        image_multiply(&img2_planar, &img2_planar, &mut mul, impl_type);
494        blur.blur_into(&mul, &mut sigma2_sq);
495        image_multiply(&img1_planar, &img2_planar, &mut mul, impl_type);
496        blur.blur_into(&mul, &mut sigma12);
497
498        // Means (separate blur calls — IIR has finite halo per row).
499        blur.blur_into(&img1_planar, &mut mu1);
500        blur.blur_into(&img2_planar, &mut mu2);
501
502        // Accumulate the interior rows' contribution to the
503        // per-scale SSIM and edge-diff sums.
504        let ssim_sums = ssim_map_strip(
505            scale,
506            width,
507            scale0_interior_start_in_strip,
508            scale0_interior_end_in_strip,
509            &mu1,
510            &mu2,
511            &sigma1_sq,
512            &sigma2_sq,
513            &sigma12,
514            total_scales,
515        );
516        let edge_sums = edge_diff_map_strip(
517            scale,
518            width,
519            scale0_interior_start_in_strip,
520            scale0_interior_end_in_strip,
521            &img1_planar,
522            &mu1,
523            &img2_planar,
524            &mu2,
525            total_scales,
526        );
527
528        let interior_h = scale0_interior_end_in_strip - scale0_interior_start_in_strip;
529        let interior_pixels = (interior_h as u64) * (width as u64);
530        acc.add_strip_sums(scale, &ssim_sums, &edge_sums, interior_pixels);
531    }
532}
533
534/// Sums (sum_d, sum_d4) per channel over the strip's interior rows,
535/// returning a `[f64; 6]` matching `ssim_map_scalar`'s `plane_averages`
536/// layout but without the per-pixel division.
537#[allow(clippy::too_many_arguments)]
538fn ssim_map_strip(
539    scale_idx: usize,
540    width: usize,
541    interior_start: usize,
542    interior_end: usize,
543    m1: &[Vec<f32>; 3],
544    m2: &[Vec<f32>; 3],
545    s11: &[Vec<f32>; 3],
546    s22: &[Vec<f32>; 3],
547    s12: &[Vec<f32>; 3],
548    total_scales: usize,
549) -> [f64; 6] {
550    const C2: f32 = 0.0009f32;
551
552    let mut out = [0.0_f64; 6];
553    let skip_table = SSIM_HAS_WEIGHT[total_scales.min(NUM_SCALES)];
554
555    for c in 0..3 {
556        // The skip table is computed for the active number of scales
557        // in the multi-scale loop. A channel/scale entry of `false`
558        // contributes zero weight to the final score — we must
559        // produce the matching zero sum here so the score replay
560        // matches the full-image path.
561        if scale_idx < NUM_SCALES && !skip_table[c][scale_idx] {
562            continue;
563        }
564        let mut sum_d = 0.0f64;
565        let mut sum_d4 = 0.0f64;
566        let m1c = &m1[c];
567        let m2c = &m2[c];
568        let s11c = &s11[c];
569        let s22c = &s22[c];
570        let s12c = &s12[c];
571        for y in interior_start..interior_end {
572            let row_start = y * width;
573            let row_end = row_start + width;
574            let m1_row = &m1c[row_start..row_end];
575            let m2_row = &m2c[row_start..row_end];
576            let s11_row = &s11c[row_start..row_end];
577            let s22_row = &s22c[row_start..row_end];
578            let s12_row = &s12c[row_start..row_end];
579            for x in 0..width {
580                let mu1 = m1_row[x];
581                let mu2 = m2_row[x];
582                let mu11 = mu1 * mu1;
583                let mu22 = mu2 * mu2;
584                let mu12 = mu1 * mu2;
585                let mu_diff = mu1 - mu2;
586                let num_m = mu_diff.mul_add(-mu_diff, 1.0f32);
587                let num_s = 2.0f32.mul_add(s12_row[x] - mu12, C2);
588                let denom_s = (s11_row[x] - mu11) + (s22_row[x] - mu22) + C2;
589                let d = (1.0f32 - (num_m * num_s) / denom_s).max(0.0f32);
590                let d2 = d * d;
591                let d4 = d2 * d2;
592                sum_d += f64::from(d);
593                sum_d4 += f64::from(d4);
594            }
595        }
596        out[c * 2] = sum_d;
597        out[c * 2 + 1] = sum_d4;
598    }
599    out
600}
601
602#[allow(clippy::too_many_arguments)]
603fn edge_diff_map_strip(
604    scale_idx: usize,
605    width: usize,
606    interior_start: usize,
607    interior_end: usize,
608    img1: &[Vec<f32>; 3],
609    mu1: &[Vec<f32>; 3],
610    img2: &[Vec<f32>; 3],
611    mu2: &[Vec<f32>; 3],
612    total_scales: usize,
613) -> [f64; 12] {
614    let mut out = [0.0_f64; 12];
615    let skip_table = EDGE_HAS_WEIGHT[total_scales.min(NUM_SCALES)];
616
617    for c in 0..3 {
618        if scale_idx < NUM_SCALES && !skip_table[c][scale_idx] {
619            continue;
620        }
621        let mut sums = [0.0_f64; 4];
622        let i1 = &img1[c];
623        let i2 = &img2[c];
624        let m1c = &mu1[c];
625        let m2c = &mu2[c];
626        for y in interior_start..interior_end {
627            let row_start = y * width;
628            let row_end = row_start + width;
629            let row1 = &i1[row_start..row_end];
630            let row2 = &i2[row_start..row_end];
631            let rowm1 = &m1c[row_start..row_end];
632            let rowm2 = &m2c[row_start..row_end];
633            for x in 0..width {
634                let d1: f64 = (1.0 + f64::from((row2[x] - rowm2[x]).abs()))
635                    / (1.0 + f64::from((row1[x] - rowm1[x]).abs()))
636                    - 1.0;
637                let artifact = d1.max(0.0);
638                sums[0] += artifact;
639                sums[1] += artifact.powi(4);
640                let detail_lost = (-d1).max(0.0);
641                sums[2] += detail_lost;
642                sums[3] += detail_lost.powi(4);
643            }
644        }
645        out[c * 4] = sums[0];
646        out[c * 4 + 1] = sums[1];
647        out[c * 4 + 2] = sums[2];
648        out[c * 4 + 3] = sums[3];
649    }
650
651    out
652}
653
654fn compute_strip_impl(
655    img1: LinearRgb,
656    img2: LinearRgb,
657    strip_height: usize,
658    config: Ssimulacra2StripConfig,
659) -> Result<f64, Ssimulacra2Error> {
660    if img1.width() != img2.width() || img1.height() != img2.height() {
661        return Err(Ssimulacra2Error::NonMatchingImageDimensions);
662    }
663    let width = img1.width().get();
664    let height = img1.height().get();
665    validate_strip_dims(width, height, strip_height)?;
666    let halo = config.halo_rows;
667    let mut acc = StripAccumulator::new(width, height);
668
669    // Strip boundaries are chosen on EVEN row boundaries at every scale
670    // so the downsample mapping is consistent across strips: at scale s,
671    // strip-y0 / (2^s) is the strip's starting row. To keep boundaries
672    // aligned through scale 5 (the deepest we ever reach), boundaries
673    // at scale 0 must be multiples of 32. We snap strip boundaries up
674    // to multiples of 32 (or to `height`, whichever is smaller).
675    const ALIGNMENT: usize = 32;
676    let strip_h = strip_height.max(MIN_STRIP_HEIGHT);
677
678    let mut y = 0usize;
679    while y < height {
680        let mut next_y = (y + strip_h).next_multiple_of(ALIGNMENT);
681        if next_y >= height || height - next_y < ALIGNMENT {
682            next_y = height;
683        }
684        let interior_start = y;
685        let interior_end = next_y;
686        let halo_above = halo.min(interior_start);
687        let halo_below = halo.min(height - interior_end);
688        let strip_y0 = interior_start - halo_above;
689        let strip_y1 = interior_end + halo_below;
690
691        let img1_strip = linear_rgb_strip(&img1, strip_y0, strip_y1);
692        let img2_strip = linear_rgb_strip(&img2, strip_y0, strip_y1);
693
694        process_strip(
695            img1_strip,
696            img2_strip,
697            strip_y0,
698            interior_start,
699            interior_end,
700            &mut acc,
701            config.inner,
702        );
703
704        y = next_y;
705    }
706
707    acc.finalise()
708}
709
710impl Ssimulacra2Reference {
711    /// Compare a distorted image against the precomputed reference
712    /// using strip-bounded peak memory.
713    ///
714    /// Mirrors [`Ssimulacra2Reference::compare`] but runs the dist
715    /// side in strips of `strip_height` rows (plus default halo); the
716    /// precomputed reference data is held full-image as in the
717    /// non-strip API.
718    ///
719    /// At 40 MP, this bounds dist-side peak memory to
720    /// `O(strip_height * width)` instead of the ~7 GiB of the
721    /// full-image dist path; the ref-side cost stays at the cached
722    /// reference's footprint.
723    ///
724    /// # Errors
725    /// - If the distorted image dimensions don't match the reference
726    /// - If `strip_height < 8`
727    pub fn compare_strip<T: ToLinearRgb>(
728        &self,
729        distorted: T,
730        strip_height: u32,
731    ) -> Result<f64, Ssimulacra2Error> {
732        self.compare_strip_with_config(distorted, strip_height, Ssimulacra2StripConfig::default())
733    }
734
735    /// Strip-bounded comparison with explicit configuration.
736    ///
737    /// # Errors
738    /// As [`Ssimulacra2Reference::compare_strip`].
739    pub fn compare_strip_with_config<T: ToLinearRgb>(
740        &self,
741        distorted: T,
742        strip_height: u32,
743        config: Ssimulacra2StripConfig,
744    ) -> Result<f64, Ssimulacra2Error> {
745        // `Ssimulacra2Reference` retains only the per-scale precomputed
746        // planes (img1_planar, mu1, sigma1_sq), not the source LinearRgb.
747        // The cached-ref strip walker therefore slices those full-image
748        // planes per strip (re-running only the ref-side blur, which must
749        // share the strip's IIR boundary handling — see
750        // `process_dist_strip_with_cached_ref`), while the dist side
751        // streams strip-by-strip.
752        let img2: LinearRgb = distorted.into_linear_rgb().into();
753        let width = img2.width().get();
754        let height = img2.height().get();
755        if width != self.width() || height != self.height() {
756            return Err(Ssimulacra2Error::NonMatchingImageDimensions);
757        }
758        validate_strip_dims(width, height, strip_height as usize)?;
759        compare_strip_with_cached_ref(self, img2, strip_height as usize, config)
760    }
761}
762
763/// Cached-reference strip path. The reference's precomputed
764/// scale-0..N planes (`img1_planar`, `mu1`, `sigma1_sq`) stay in memory
765/// at full size; the distorted side is processed strip-by-strip.
766///
767/// At 40 MP this bounds the per-call working memory to
768/// `O(strip_height * width)` for the dist-side planes plus the ~3 GiB
769/// already pinned by `Ssimulacra2Reference`. The savings are real but
770/// smaller than the full-strip path — pair with the
771/// no-cached-ref `compute_ssimulacra2_strip` when the goal is the
772/// absolute lowest peak heap.
773fn compare_strip_with_cached_ref(
774    reference: &Ssimulacra2Reference,
775    img2_full: LinearRgb,
776    strip_height: usize,
777    config: Ssimulacra2StripConfig,
778) -> Result<f64, Ssimulacra2Error> {
779    let width = img2_full.width().get();
780    let height = img2_full.height().get();
781
782    let halo = config.halo_rows;
783    let mut acc = StripAccumulator::new(width, height);
784
785    const ALIGNMENT: usize = 32;
786    let strip_h = strip_height.max(MIN_STRIP_HEIGHT);
787
788    let mut y = 0usize;
789    while y < height {
790        let mut next_y = (y + strip_h).next_multiple_of(ALIGNMENT);
791        if next_y >= height || height - next_y < ALIGNMENT {
792            next_y = height;
793        }
794        let interior_start = y;
795        let interior_end = next_y;
796        let halo_above = halo.min(interior_start);
797        let halo_below = halo.min(height - interior_end);
798        let strip_y0 = interior_start - halo_above;
799        let strip_y1 = interior_end + halo_below;
800
801        let img2_strip = linear_rgb_strip(&img2_full, strip_y0, strip_y1);
802
803        process_dist_strip_with_cached_ref(
804            reference,
805            img2_strip,
806            strip_y0,
807            interior_start,
808            interior_end,
809            &mut acc,
810            config.inner,
811        );
812
813        y = next_y;
814    }
815
816    acc.finalise()
817}
818
819/// Strip walker for the cached-ref path. Reuses the reference's
820/// pre-computed planar XYB (`img1_planar`); recomputes the ref-side
821/// blurs (`mu1`, `sigma1_sq`) per strip so they share the strip-halo
822/// IIR boundary handling with the dist-side. This is required for
823/// blur-symmetric SSIM: both `mu1` and `mu2` (and both sigma squared
824/// terms) must come from the SAME blur context to score 100 on
825/// identical inputs.
826///
827/// The cache saves the XYB conversion + multi-scale downsample + the
828/// `img1 * img1` product; the IIR blur is recomputed per strip
829/// because the cached full-image blur differs from the strip blur at
830/// halo precision (~`e^{-halo}` at sigma=1.5). On a 40 MP image the
831/// XYB+downsample cost dominates the cached-ref path's savings, so
832/// recomputing the per-strip blur is a small price for blur-symmetric
833/// parity.
834fn process_dist_strip_with_cached_ref(
835    reference: &Ssimulacra2Reference,
836    img2_strip: LinearRgb,
837    strip_y0: usize,
838    interior_start: usize,
839    interior_end: usize,
840    acc: &mut StripAccumulator,
841    config: Ssimulacra2Config,
842) {
843    let impl_type = config.impl_type;
844    let mut img2 = img2_strip;
845    let mut width = img2.width().get();
846    let mut height = img2.height().get();
847    let total_scales = acc.per_scale.len();
848
849    let alloc_plane = |w: usize, h: usize| vec![0.0f32; w * h];
850    let alloc_3planes =
851        |w: usize, h: usize| [alloc_plane(w, h), alloc_plane(w, h), alloc_plane(w, h)];
852
853    let mut mul = alloc_3planes(width, height);
854    let mut sigma1_sq_strip = alloc_3planes(width, height);
855    let mut sigma2_sq = alloc_3planes(width, height);
856    let mut sigma12 = alloc_3planes(width, height);
857    let mut mu1_strip = alloc_3planes(width, height);
858    let mut mu2 = alloc_3planes(width, height);
859    let mut img2_planar = alloc_3planes(width, height);
860    // Per-strip slice of the reference's planar XYB image.
861    let mut img1_planar_strip = alloc_3planes(width, height);
862
863    let mut blur = Blur::with_simd_impl(width, height, impl_type);
864
865    let mut interior_start_in_strip = interior_start - strip_y0;
866    let mut interior_end_in_strip = interior_end - strip_y0;
867    let mut strip_y0_in_ref = strip_y0;
868
869    for scale in 0..total_scales {
870        if width < 8 || height < 8 {
871            break;
872        }
873
874        if scale > 0 {
875            img2 = downscale_by_2(&img2);
876            width = img2.width().get();
877            height = img2.height().get();
878            interior_start_in_strip = interior_start_in_strip.div_ceil(2);
879            interior_end_in_strip = interior_end_in_strip.div_ceil(2);
880            // Strip starts at a 32-aligned row at scale 0; at scale s
881            // it's a multiple of 32/2^s, divisible by 2 for s<=4.
882            // At scale 5 it's a multiple of 1; integer-divide by 2 is
883            // safe because strip_y0_in_ref will not advance past the
884            // correct ref row.
885            strip_y0_in_ref /= 2;
886        }
887
888        interior_start_in_strip = interior_start_in_strip.min(height);
889        interior_end_in_strip = interior_end_in_strip.min(height);
890        if interior_start_in_strip >= interior_end_in_strip {
891            continue;
892        }
893
894        // Pull strip-shaped slices of the reference's scale-s planes.
895        // The cached reference exposes them via accessor methods on
896        // `Ssimulacra2Reference` (added alongside this strip API).
897        let ref_planes = reference
898            .scale_planes(scale)
899            .expect("scale index is bounded by total_scales which equals reference.num_scales()");
900        let ref_width = ref_planes.width;
901        let ref_height = ref_planes.height;
902        debug_assert_eq!(ref_width, width);
903        // The strip's local height (`height`) and the corresponding
904        // slice of the ref's full plane (`ref_h_for_strip`) must
905        // agree; alignment math (32-aligned strip boundaries through
906        // scale 5) guarantees this.
907        let ref_h_for_strip = (ref_height - strip_y0_in_ref).min(height);
908        // Ensure the local strip buffers are sized to whatever extent
909        // the IMG2 downsample produced — it determines per-pixel
910        // operations' row counts.
911        let actual_strip_h = height.min(ref_h_for_strip);
912        debug_assert_eq!(
913            actual_strip_h, height,
914            "strip walker scale={scale} ref_h={ref_height} strip_y0_in_ref={strip_y0_in_ref} \
915             dist_strip_h={height} ref_h_for_strip={ref_h_for_strip} — alignment regression"
916        );
917        let size = width * actual_strip_h;
918        for buf in [
919            &mut mul,
920            &mut sigma1_sq_strip,
921            &mut sigma2_sq,
922            &mut sigma12,
923            &mut mu1_strip,
924            &mut mu2,
925            &mut img2_planar,
926            &mut img1_planar_strip,
927        ] {
928            for c in buf.iter_mut() {
929                c.resize(size, 0.0);
930                c.truncate(size);
931            }
932        }
933        blur.shrink_to(width, actual_strip_h);
934
935        // Pull the strip's worth of cached XYB-planar reference rows.
936        for (dst_chan, src_chan) in img1_planar_strip
937            .iter_mut()
938            .zip(ref_planes.img1_planar.iter())
939        {
940            for row in 0..actual_strip_h {
941                let src_row_start = (strip_y0_in_ref + row) * width;
942                let dst_row_start = row * width;
943                dst_chan[dst_row_start..dst_row_start + width]
944                    .copy_from_slice(&src_chan[src_row_start..src_row_start + width]);
945            }
946        }
947
948        let mut img2_xyb = linear_rgb_to_xyb_simd(img2.clone());
949        make_positive_xyb(&mut img2_xyb);
950        xyb_to_planar_into(&img2_xyb, &mut img2_planar);
951
952        // mu1, mu2: recompute the ref-side blur per strip so it
953        // shares the same IIR boundary handling as mu2.
954        blur.blur_into(&img1_planar_strip, &mut mu1_strip);
955        // sigma1_sq: same — recompute on the strip from cached planar.
956        image_multiply(&img1_planar_strip, &img1_planar_strip, &mut mul, impl_type);
957        blur.blur_into(&mul, &mut sigma1_sq_strip);
958        // sigma2_sq = blur(img2^2)
959        image_multiply(&img2_planar, &img2_planar, &mut mul, impl_type);
960        blur.blur_into(&mul, &mut sigma2_sq);
961        // sigma12 = blur(img1 * img2)
962        image_multiply(&img1_planar_strip, &img2_planar, &mut mul, impl_type);
963        blur.blur_into(&mul, &mut sigma12);
964        // mu2 = blur(img2)
965        blur.blur_into(&img2_planar, &mut mu2);
966
967        let ssim_sums = ssim_map_strip(
968            scale,
969            width,
970            interior_start_in_strip,
971            interior_end_in_strip,
972            &mu1_strip,
973            &mu2,
974            &sigma1_sq_strip,
975            &sigma2_sq,
976            &sigma12,
977            total_scales,
978        );
979        let edge_sums = edge_diff_map_strip(
980            scale,
981            width,
982            interior_start_in_strip,
983            interior_end_in_strip,
984            &img1_planar_strip,
985            &mu1_strip,
986            &img2_planar,
987            &mu2,
988            total_scales,
989        );
990
991        let interior_h = interior_end_in_strip - interior_start_in_strip;
992        let interior_pixels = (interior_h as u64) * (width as u64);
993        acc.add_strip_sums(scale, &ssim_sums, &edge_sums, interior_pixels);
994    }
995}