oxideav-webp 0.0.8

Pure-Rust WebP image codec for oxideav — RIFF VP8 lossy + VP8L lossless + VP8X extended + ALPH + animation decode, plus VP8 lossy and VP8L lossless single-frame encode
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
//! VP8L transforms — predictor, colour, subtract-green, colour-indexing.
//!
//! Each transform is parsed once from the bitstream and later applied in
//! reverse order during the final image assembly. The predictor and
//! colour transforms carry their own sub-image (a small tiled image of
//! transform parameters); colour-indexing carries a 1D palette; subtract-
//! green has no parameters.

use oxideav_core::{Error, Result};

use super::bit_reader::BitReader;
use super::decode_image_stream;

/// A parsed VP8L transform together with the parameters needed to undo it.
///
/// Values are produced by [`Transform::read`] and inverted by
/// [`Transform::apply`].
#[derive(Debug)]
pub enum Transform {
    /// Predictor transform (bitstream type tag 0).
    Predictor {
        /// log2 of the tile side length in pixels (a 3-bit field plus 2).
        tile_bits: u32,
        /// Tile sub-image; the low four bits of each entry's green byte
        /// select the predictor mode used inside that tile.
        sub_image: Vec<u32>,
        /// Width of `sub_image` in tiles.
        sub_w: u32,
        /// Height of `sub_image` in tiles.
        // Recorded at parse time; the inverse transforms visible in this
        // file only ever index by `sub_w`, hence the lint allowance.
        #[allow(dead_code)]
        sub_h: u32,
        /// Image width that was passed to `Transform::read`.
        xsize: u32,
    },
    /// Colour (cross-channel decorrelation) transform (type tag 1).
    Color {
        /// log2 of the tile side length in pixels (a 3-bit field plus 2).
        tile_bits: u32,
        /// Tile sub-image; each entry packs the three signed 8-bit
        /// coefficients as red = red_to_blue, green = green_to_blue,
        /// blue = green_to_red.
        sub_image: Vec<u32>,
        /// Width of `sub_image` in tiles.
        sub_w: u32,
        /// Height of `sub_image` in tiles.
        // Recorded at parse time; the inverse transforms visible in this
        // file only ever index by `sub_w`, hence the lint allowance.
        #[allow(dead_code)]
        sub_h: u32,
        /// Image width that was passed to `Transform::read`.
        xsize: u32,
    },
    /// Subtract-green transform (type tag 2); carries no parameters.
    SubtractGreen,
    /// Colour-indexing (palette) transform (type tag 3).
    ColorIndex {
        /// Palette entries, already delta-decoded into absolute ARGB values.
        colors: Vec<u32>,
        /// Bits used per palette index in the packed stream: 1, 2, 4 or 8,
        /// chosen from the palette size.
        bits_per_pixel: u32,
        /// Unpacked image width (the width passed to `Transform::read`).
        orig_xsize: u32,
    },
}

impl Transform {
    /// Reads one transform description from `br`.
    ///
    /// A 2-bit tag selects the variant: `0` predictor, `1` colour,
    /// `2` subtract-green, `3` colour-indexing. `xsize` / `ysize` are the
    /// dimensions of the image the transform will later be applied to;
    /// they size the tile sub-image of the predictor and colour
    /// transforms and are remembered as the unpacked width for
    /// colour-indexing.
    ///
    /// # Errors
    ///
    /// Propagates any failure from the bit reader or from
    /// `decode_image_stream`.
    pub fn read(br: &mut BitReader<'_>, xsize: u32, ysize: u32) -> Result<Self> {
        let kind = br.read_bits(2)?;
        match kind {
            0 | 1 => {
                // Predictor and colour transforms share one wire layout:
                // a 3-bit tile-size exponent (biased by 2) followed by a
                // sub-image holding one entry per tile.
                let tile_bits = br.read_bits(3)? + 2;
                let sub_w = subsampled_size(xsize, tile_bits);
                let sub_h = subsampled_size(ysize, tile_bits);
                let sub_image = decode_image_stream(br, sub_w, sub_h, false)?;
                Ok(if kind == 0 {
                    Transform::Predictor {
                        tile_bits,
                        sub_image,
                        sub_w,
                        sub_h,
                        xsize,
                    }
                } else {
                    Transform::Color {
                        tile_bits,
                        sub_image,
                        sub_w,
                        sub_h,
                        xsize,
                    }
                })
            }
            2 => Ok(Transform::SubtractGreen),
            3 => {
                let num_colors = br.read_bits(8)? + 1;
                let mut colors = decode_image_stream(br, num_colors, 1, false)?;
                // The palette is stored as per-channel (mod 256) deltas
                // against the preceding entry; the first entry is absolute.
                let mut previous = None;
                for entry in colors.iter_mut() {
                    if let Some(prev) = previous {
                        *entry = add_argb(*entry, prev);
                    }
                    previous = Some(*entry);
                }
                // Small palettes pack several indices into each green byte.
                let bits_per_pixel = match num_colors {
                    0..=2 => 1,
                    3..=4 => 2,
                    5..=16 => 4,
                    _ => 8,
                };
                Ok(Transform::ColorIndex {
                    colors,
                    bits_per_pixel,
                    orig_xsize: xsize,
                })
            }
            _ => Err(Error::invalid("VP8L: invalid transform type")),
        }
    }

    /// Width of the pixel stream that follows this transform in the
    /// bitstream, used while parsing later transforms.
    ///
    /// Colour-indexing packs `8 / bits_per_pixel` indices into each
    /// stream pixel, so the stream is narrower than the image (rounded
    /// up). Every other transform leaves the width alone and returns
    /// `default_w`, the caller's current width.
    pub fn image_width_or_default(&self, default_w: u32) -> u32 {
        if let Transform::ColorIndex {
            bits_per_pixel,
            orig_xsize,
            ..
        } = self
        {
            let per_pixel = 8 / *bits_per_pixel;
            (*orig_xsize + per_pixel - 1) / per_pixel
        } else {
            default_w
        }
    }

    /// Width of the image once this transform has been inverted.
    ///
    /// Colour-indexing unpacks back to its recorded `orig_xsize`; all
    /// other transforms return `input_w` unchanged.
    pub fn output_width(&self, input_w: u32) -> u32 {
        if let Transform::ColorIndex { orig_xsize, .. } = self {
            *orig_xsize
        } else {
            input_w
        }
    }

    /// Inverts this transform over `pixels` (`width` x `height`, raster
    /// order) and returns the resulting image as a new buffer.
    ///
    /// # Errors
    ///
    /// Only the colour-indexing path can fail; it forwards the error
    /// from `apply_color_index`.
    pub fn apply(&self, pixels: &[u32], width: u32, height: u32) -> Result<Vec<u32>> {
        match self {
            Transform::SubtractGreen => Ok(apply_subtract_green(pixels)),
            Transform::Predictor {
                tile_bits,
                sub_image,
                sub_w,
                ..
            } => {
                let decoded =
                    apply_predictor(pixels, width, height, *tile_bits, sub_image, *sub_w);
                Ok(decoded)
            }
            Transform::Color {
                tile_bits,
                sub_image,
                sub_w,
                ..
            } => {
                let decoded =
                    apply_color_transform(pixels, width, height, *tile_bits, sub_image, *sub_w);
                Ok(decoded)
            }
            Transform::ColorIndex {
                colors,
                bits_per_pixel,
                orig_xsize,
            } => apply_color_index(pixels, width, height, colors, *bits_per_pixel, *orig_xsize),
        }
    }
}

/// Number of tiles of side `1 << bits` needed to cover `size` pixels,
/// i.e. `size` divided by the tile side, rounded up.
fn subsampled_size(size: u32, bits: u32) -> u32 {
    let tile_side = 1u32 << bits;
    let padded = size + tile_side - 1;
    padded >> bits
}

/// Adds two ARGB pixels channel by channel, each channel wrapping
/// modulo 256. This is how residual-coding transforms recombine a
/// prediction with its residual.
///
/// The four byte lanes are summed in a single 32-bit add (SWAR): the
/// top bit of every lane is cleared first so a lane sum is at most
/// 0x7f + 0x7f = 0xfe and cannot carry into its neighbour; the top bit
/// of each result lane is then restored by XOR-ing in `a ^ b`
/// restricted to the bit-7 positions. It sits on the per-pixel path of
/// the predictor transform and the per-entry path of the palette
/// delta-decode, so it is kept branch-free and unpack-free.
#[inline]
fn add_argb(a: u32, b: u32) -> u32 {
    const LOW_SEVEN: u32 = 0x7f7f_7f7f;
    const TOP_BIT: u32 = 0x8080_8080;
    let carry_free = (a & LOW_SEVEN).wrapping_add(b & LOW_SEVEN);
    let top_bits = (a ^ b) & TOP_BIT;
    carry_free ^ top_bits
}

// ── Predictor transform ───────────────────────────────────────────────
//
// Each tile gets a predictor mode 0..13 from the sub-image's green
// channel. The decoded pixel is `pred + residual` per-component mod 256,
// where `pred` is computed from the already-decoded neighbourhood.

/// Inverts the predictor transform: every output pixel is
/// `prediction + residual` (per channel, mod 256), where the prediction
/// is derived from already-decoded neighbours.
///
/// * `residual` — `width * height` residuals in raster order.
/// * `tile_bits` — log2 of the tile side length.
/// * `sub_image` / `sub_w` — per-tile predictor modes (low four bits of
///   the green byte) and the sub-image row stride in tiles.
///
/// Returns the decoded image in raster order. A zero-area image
/// (`width == 0` or `height == 0`) yields an empty vector.
///
/// # Panics
///
/// Panics if `residual` holds fewer than `width * height` entries or if
/// `sub_image` is too small for the tiles that are looked up.
fn apply_predictor(
    residual: &[u32],
    width: u32,
    height: u32,
    tile_bits: u32,
    sub_image: &[u32],
    sub_w: u32,
) -> Vec<u32> {
    let w = width as usize;
    let h = height as usize;
    // Nothing to decode. Without this guard the top-left pixel access
    // below would index into an empty residual buffer.
    if w == 0 || h == 0 {
        return Vec::new();
    }
    let sub_stride = sub_w as usize;

    // The output is built by `push` in raster order: every neighbour a
    // predictor reads (L / T / TL / TR) has a smaller raster index than
    // the pixel being produced, so it is already present in `out`, and
    // no up-front copy of `residual` is needed.
    let mut out: Vec<u32> = Vec::with_capacity(residual.len());

    // Row 0: the top-left pixel predicts from opaque black, the rest of
    // the row from the left neighbour. No tile lookup is involved.
    out.push(add_argb(residual[0], 0xff00_0000));
    for x in 1..w {
        let pred = out[x - 1];
        out.push(add_argb(residual[x], pred));
    }

    // Rows 1..height. Row offsets are computed in `usize` so the product
    // cannot wrap in 32-bit arithmetic.
    for y in 1..h {
        let ty = y >> tile_bits;
        let row_base = y * w;

        // Column 0 always predicts from the pixel directly above.
        let pred0 = out[row_base - w];
        out.push(add_argb(residual[row_base], pred0));

        // Columns 1..width are walked in tile-wide spans so the mode is
        // fetched once per tile crossing instead of once per pixel.
        let mut x: u32 = 1;
        while x < width {
            let tile_col = x >> tile_bits;
            let mode = (sub_image[ty * sub_stride + tile_col as usize] >> 8) & 0x0f;
            // First column of the next tile, capped at the row end.
            let tile_end = ((tile_col + 1) << tile_bits).min(width);
            apply_predictor_tile_row(&mut out, residual, w, y, x, tile_end, mode);
            x = tile_end;
        }
    }
    out
}

/// Decodes the pixels `(x_start..x_end, y)` — one row of one tile — all
/// of which share the predictor `mode`, appending them to `out`.
///
/// `out` must already contain every pixel that precedes `(x_start, y)`
/// in raster order. The caller only uses this for `y >= 1` and
/// `x_start >= 1`, so the left, top and top-left neighbours always
/// exist. Dispatching on `mode` once per span keeps each inner loop
/// free of per-pixel branching.
#[inline]
fn apply_predictor_tile_row(
    out: &mut Vec<u32>,
    residual: &[u32],
    w: usize,
    y: usize,
    x_start: u32,
    x_end: u32,
    mode: u32,
) {
    // Work in flat raster indices: the span covers `first..last`.
    let row_start = y * w;
    let first = row_start + x_start as usize;
    let last = row_start + x_end as usize;
    match mode {
        // Mode 0: constant opaque black, no neighbour reads at all.
        0 => {
            for idx in first..last {
                out.push(add_argb(residual[idx], 0xff00_0000));
            }
        }
        // Mode 1: left neighbour.
        1 => {
            for idx in first..last {
                let left = out[idx - 1];
                out.push(add_argb(residual[idx], left));
            }
        }
        // Mode 2: top neighbour.
        2 => {
            for idx in first..last {
                let top = out[idx - w];
                out.push(add_argb(residual[idx], top));
            }
        }
        // Mode 4: top-left neighbour.
        4 => {
            for idx in first..last {
                let top_left = out[idx - w - 1];
                out.push(add_argb(residual[idx], top_left));
            }
        }
        // Every other mode goes through the generic predictor, which
        // also takes care of the top-right special case on the last
        // column of a row.
        _ => {
            for idx in first..last {
                let pred = predict_argb(out, w, idx - row_start, y, mode);
                out.push(add_argb(residual[idx], pred));
            }
        }
    }
}

/// Computes the prediction for pixel `(x, y)` under predictor `mode`
/// from the already-decoded pixels in `out` (raster order, row width
/// `w`).
///
/// All four neighbours are fetched up front, so the caller must ensure
/// `x >= 1` and `y >= 1`. Modes outside `0..=13` predict opaque black,
/// the same as mode 0.
fn predict_argb(out: &[u32], w: usize, x: usize, y: usize, mode: u32) -> u32 {
    const OPAQUE_BLACK: u32 = 0xff00_0000;

    let this_row = y * w;
    let prev_row = (y - 1) * w;

    let left = out[this_row + x - 1];
    let top = out[prev_row + x];
    let top_left = out[prev_row + x - 1];
    let on_last_column = x + 1 >= w;
    let top_right = if on_last_column {
        // RFC 9649 §4.1: on the rightmost column there is no pixel above
        // and to the right, so the leftmost pixel of the *current* row
        // stands in for it. This is column 0 of row `y`, not the left
        // neighbour at `x - 1`; using the latter corrupts narrow images
        // because the error propagates through every later prediction.
        out[this_row]
    } else {
        out[prev_row + x + 1]
    };

    match mode {
        1 => left,
        2 => top,
        3 => top_right,
        4 => top_left,
        5 => avg3(left, top_right, top),
        6 => avg2(left, top_left),
        7 => avg2(left, top),
        8 => avg2(top_left, top),
        9 => avg2(top, top_right),
        10 => avg2(avg2(left, top_left), avg2(top, top_right)),
        11 => select_argb(left, top, top_left),
        12 => clamp_add_sub_argb(left, top, top_left),
        13 => clamp_add_sub_half_argb(avg2(left, top), top_left),
        // Mode 0 and every unassigned mode number.
        _ => OPAQUE_BLACK,
    }
}

/// Channel-wise floor average of two ARGB pixels: each result byte is
/// `(a_byte + b_byte) >> 1`. The averaging predictor modes are built
/// from it, some of them nesting several calls per output pixel.
///
/// Computed on all four lanes at once using `(a + b) / 2 ==
/// (a & b) + ((a ^ b) >> 1)`. After shifting `a ^ b` right by one, the
/// top bit of every lane holds the bit that leaked in from the lane
/// above; masking with 0x7f7f7f7f discards it. Each lane of the final
/// sum is at most 0xff, so the add never carries across lanes.
#[inline]
fn avg2(a: u32, b: u32) -> u32 {
    let common = a & b;
    let half_difference = ((a ^ b) >> 1) & 0x7f7f_7f7f;
    common.wrapping_add(half_difference)
}

/// Three-way channel-wise average used by predictor mode 5.
///
/// The WebP lossless specification defines mode 5 as
/// `Average2(Average2(L, TR), T)`, and the predictor calls this helper
/// as `avg3(l, tr, t)`. The first two arguments are therefore averaged
/// first and the third is folded in last. The order matters: floor
/// averaging is not associative, so nesting the other way round
/// (`avg2(a, avg2(b, c))`) weights the inputs differently and yields
/// different predictions.
#[inline]
fn avg3(a: u32, b: u32, c: u32) -> u32 {
    avg2(avg2(a, b), c)
}

/// Predictor mode 11 ("select"): returns either `l` or `t` as a whole
/// pixel, never a per-channel blend.
///
/// The choice favours the neighbour closer to the gradient estimate
/// `l + t - tl`. Per channel, the estimate's distance from `l` is
/// `|t - tl|` and its distance from `t` is `|l - tl|`, so only two
/// channel-summed absolute differences against `tl` are needed. `l` is
/// returned when the first sum is strictly smaller; ties go to `t`.
///
/// The pixels are split into byte arrays once and the differences are
/// taken with `u8::abs_diff`, leaving a plain sum-of-absolute-
/// differences loop for the optimiser to work with.
#[inline]
fn select_argb(l: u32, t: u32, tl: u32) -> u32 {
    let corner = tl.to_le_bytes();
    // Channel-summed |pixel - tl|.
    let distance_to_corner = |pixel: u32| -> u32 {
        pixel
            .to_le_bytes()
            .iter()
            .zip(corner.iter())
            .map(|(&p, &c)| u32::from(p.abs_diff(c)))
            .sum()
    };
    let cost_of_left = distance_to_corner(t);
    let cost_of_top = distance_to_corner(l);
    if cost_of_left < cost_of_top {
        l
    } else {
        t
    }
}

/// Predictor mode 12: the gradient estimate `l + t - tl`, computed per
/// channel in signed arithmetic and clamped to `0..=255`.
///
/// The pixels are handled as little-endian byte arrays so the four
/// lanes form one short, uniform loop without manual shift-and-mask
/// unpacking and repacking.
#[inline]
fn clamp_add_sub_argb(l: u32, t: u32, tl: u32) -> u32 {
    let left = l.to_le_bytes();
    let top = t.to_le_bytes();
    let corner = tl.to_le_bytes();
    let mut lanes = [0u8; 4];
    let inputs = left.iter().zip(top.iter()).zip(corner.iter());
    for (lane, ((&lv, &tv), &cv)) in lanes.iter_mut().zip(inputs) {
        let gradient = i32::from(lv) + i32::from(tv) - i32::from(cv);
        *lane = gradient.clamp(0, 255) as u8;
    }
    u32::from_le_bytes(lanes)
}

/// Predictor mode 13: `a + (a - b) / 2` per channel, clamped to
/// `0..=255`. The division is a signed integer division that truncates
/// toward zero.
///
/// Operates on little-endian byte arrays so all four lanes share one
/// short loop.
#[inline]
fn clamp_add_sub_half_argb(a: u32, b: u32) -> u32 {
    let base = a.to_le_bytes();
    let other = b.to_le_bytes();
    let mut lanes = [0u8; 4];
    for (lane, (&x, &y)) in lanes.iter_mut().zip(base.iter().zip(other.iter())) {
        let x = i32::from(x);
        let y = i32::from(y);
        let extrapolated = x + (x - y) / 2;
        *lane = extrapolated.clamp(0, 255) as u8;
    }
    u32::from_le_bytes(lanes)
}

// ── Colour transform ──────────────────────────────────────────────────
//
// Spec §4.2. Removes correlation between R/B channels by subtracting
// scaled versions of G and of (post-subtract) R.

/// Inverts the colour transform: adds back to red and blue the
/// contributions of green (and, for blue, of the restored red) that
/// were removed from them. Alpha and green pass through untouched.
///
/// * `pixels` — `width * height` transformed pixels in raster order.
/// * `tile_bits` — log2 of the tile side length.
/// * `sub_image` / `sub_w` — per-tile coefficients and the sub-image
///   row stride in tiles.
///
/// Each coefficient entry is an ARGB value holding three signed 8-bit
/// multipliers: red byte = red_to_blue, green byte = green_to_blue,
/// blue byte = green_to_red (alpha is unused). A correction is
/// `(multiplier * sign_extended_channel) >> 5`.
fn apply_color_transform(
    pixels: &[u32],
    width: u32,
    height: u32,
    tile_bits: u32,
    sub_image: &[u32],
    sub_w: u32,
) -> Vec<u32> {
    // Reinterprets the low byte of `v` as a signed 8-bit value.
    let signed_byte = |v: u32| -> i32 { v as u8 as i8 as i32 };

    let mut out = Vec::with_capacity(pixels.len());
    let stride = sub_w as usize;
    for y in 0..height {
        let tile_row = (y >> tile_bits) as usize * stride;
        let row_start = (y * width) as usize;

        // Walk the row in tile-wide spans: the coefficients (and their
        // sign extension) only change when a tile boundary is crossed.
        let mut span_start: u32 = 0;
        while span_start < width {
            let tile_col = span_start >> tile_bits;
            let element = sub_image[tile_row + tile_col as usize];
            let red_to_blue = signed_byte(element >> 16);
            let green_to_blue = signed_byte(element >> 8);
            let green_to_red = signed_byte(element);
            let span_end = ((tile_col + 1) << tile_bits).min(width);

            for x in span_start..span_end {
                let px = pixels[row_start + x as usize];
                let green = signed_byte(px >> 8);

                // Red first: blue's second correction uses the restored red.
                let coded_red = ((px >> 16) & 0xff) as i32;
                let red = (coded_red + ((green_to_red * green) >> 5)) & 0xff;

                let coded_blue = (px & 0xff) as i32;
                let blue = (coded_blue + ((green_to_blue * green) >> 5)) & 0xff;
                let blue = (blue + ((red_to_blue * (red as i8 as i32)) >> 5)) & 0xff;

                // Alpha and green are carried over from the input pixel.
                out.push((px & 0xff00_ff00) | ((red as u32) << 16) | (blue as u32));
            }
            span_start = span_end;
        }
    }
    out
}

// ── Subtract-green transform ──────────────────────────────────────────

/// Undoes the encoder's "subtract green" step: green is added back to
/// the red and blue channels (each mod 256) while alpha and green are
/// left as they are.
///
/// Green is replicated into the red and blue byte lanes of an addend
/// whose alpha and green lanes are zero, and the addend is combined
/// with the pixel by a lane-wise add: top bits are masked off before
/// the add so no lane can carry into its neighbour, then restored with
/// an XOR.
fn apply_subtract_green(pixels: &[u32]) -> Vec<u32> {
    const LOW_SEVEN: u32 = 0x7f7f_7f7f;
    const TOP_BIT: u32 = 0x8080_8080;

    let mut restored = Vec::with_capacity(pixels.len());
    for &px in pixels {
        let green = (px >> 8) & 0xff;
        // Green copied into bits 16..24 (red) and bits 0..8 (blue).
        let addend = green * 0x0001_0001;
        let carry_free = (px & LOW_SEVEN).wrapping_add(addend & LOW_SEVEN);
        restored.push(carry_free ^ ((px ^ addend) & TOP_BIT));
    }
    restored
}

// ── Colour indexing transform ─────────────────────────────────────────
//
// The decoded pixel stream is an "index image": each pixel's green
// channel is an index into `colors`. When there are ≤16 colours the
// stream is bit-packed — `8 / bits_per_pixel` indices per green byte.

/// Inverts the colour-indexing transform: expands the (possibly
/// bit-packed) index image `packed` into ARGB pixels looked up in
/// `colors`.
///
/// * `width` — width of `packed` in stream pixels.
/// * `_height` — unused; the row count is derived from
///   `packed.len() / width`.
/// * `bits_per_pixel` — bits per palette index; must be 1, 2, 4 or 8.
/// * `orig_xsize` — width of the unpacked output image.
///
/// Indices beyond the end of the palette decode to `0`. A zero `width`
/// is treated as an image with no rows and produces an empty result.
///
/// # Errors
///
/// Returns an "invalid" error when `bits_per_pixel` is not one of
/// 1, 2, 4 or 8.
fn apply_color_index(
    packed: &[u32],
    width: u32,
    _height: u32,
    colors: &[u32],
    bits_per_pixel: u32,
    orig_xsize: u32,
) -> Result<Vec<u32>> {
    let w = width as usize;
    let ox_end = orig_xsize as usize;
    // A plain `/` would panic with a division by zero on a zero-width
    // stream; such a stream simply has no rows.
    let rows = packed.len().checked_div(w).unwrap_or(0);
    let mut out = Vec::with_capacity(ox_end * rows.max(1));

    // One specialisation per legal `bits_per_pixel`, so the number of
    // indices per byte, the shift step and the mask are compile-time
    // constants inside each unpacking loop.
    match bits_per_pixel {
        8 => apply_color_index_pack1(&mut out, packed, w, rows, colors, ox_end),
        4 => apply_color_index_packed::<2, 4, 0x0f>(&mut out, packed, w, rows, colors, ox_end),
        2 => apply_color_index_packed::<4, 2, 0x03>(&mut out, packed, w, rows, colors, ox_end),
        1 => apply_color_index_packed::<8, 1, 0x01>(&mut out, packed, w, rows, colors, ox_end),
        _ => return Err(Error::invalid("VP8L: invalid bits_per_pixel")),
    }
    Ok(out)
}

/// Colour-index expansion for `bits_per_pixel == 8`: every stream pixel
/// carries exactly one palette index in its green byte, so each of the
/// `rows * width` stream pixels yields exactly one output pixel and no
/// right-edge handling is needed (`_ox_end` is ignored).
///
/// Results are appended to `out`; an index beyond the palette decodes
/// to `0`.
#[inline]
fn apply_color_index_pack1(
    out: &mut Vec<u32>,
    packed: &[u32],
    width: usize,
    rows: usize,
    colors: &[u32],
    _ox_end: usize,
) {
    for y in 0..rows {
        let row = &packed[y * width..(y + 1) * width];
        out.extend(row.iter().map(|&px| {
            let index = ((px >> 8) & 0xff) as usize;
            colors.get(index).copied().unwrap_or(0)
        }));
    }
}

/// Colour-index expansion for bit-packed streams (`bits_per_pixel` of
/// 1, 2 or 4).
///
/// Const parameters: `PACK = 8 / bits_per_pixel` indices per green
/// byte, `BITS = bits_per_pixel`, `MASK = (1 << BITS) - 1`. Being
/// constants, they let every monomorphised copy unroll its inner loop.
/// Indices are stored least-significant bits first.
///
/// Each of the `rows` rows of `width` stream pixels expands to at most
/// `ox_end` output pixels, appended to `out`. The row is split into a
/// bulk part, where every stream pixel yields a full `PACK` outputs
/// with no edge test, and a tail that stops at `ox_end`. An index
/// beyond the palette decodes to `0`.
///
/// # Panics
///
/// Panics if `packed` holds fewer than `rows * width` entries.
#[inline]
fn apply_color_index_packed<const PACK: usize, const BITS: u32, const MASK: u32>(
    out: &mut Vec<u32>,
    packed: &[u32],
    width: usize,
    rows: usize,
    colors: &[u32],
    ox_end: usize,
) {
    // Palette lookup shared by the bulk and tail paths.
    let lookup = |index: u32| colors.get(index as usize).copied().unwrap_or(0);

    // Stream columns that expand to a full PACK outputs. Clamped to
    // `width` so that an `ox_end` larger than the row can supply never
    // makes the bulk loop read pixels belonging to the next row (or
    // past the end of `packed`).
    let bulk_end = (ox_end / PACK).min(width);

    for y in 0..rows {
        let row_base = y * width;

        // Bulk path: no right-edge test needed.
        for xp in 0..bulk_end {
            let g = (packed[row_base + xp] >> 8) & 0xff;
            for sub in 0..PACK {
                out.push(lookup((g >> (BITS * sub as u32)) & MASK));
            }
        }

        // Tail path: the remaining columns emit only the outputs that
        // still fit below `ox_end` (none at all once it is reached).
        for xp in bulk_end..width {
            let g = (packed[row_base + xp] >> 8) & 0xff;
            let remaining = ox_end.saturating_sub(xp * PACK).min(PACK);
            for sub in 0..remaining {
                out.push(lookup((g >> (BITS * sub as u32)) & MASK));
            }
        }
    }
}