agg-gui 0.2.0

Immediate-mode Rust GUI library with AGG rendering, Y-up layout, widgets, text, SVG, and native/WASM adapters
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
//! LCD subpixel text as a **per-channel coverage mask** that composites
//! onto arbitrary backgrounds — no bg pre-fill, no destination-color
//! knowledge required at rasterization time.
//!
//! # Why this replaces the pre-fill approach
//!
//! The older `PixfmtRgba32Lcd` path baked the caller's background colour
//! into the rasterised output via a per-channel src-over against the
//! pre-filled framebuffer.  That coupled the LCD glyphs to one specific
//! destination and forced us to know that destination everywhere text is
//! drawn — driving the walk / sample / push / pop complexity.
//!
//! Instead, we keep the **three subpixel coverage values independent**:
//! the output of the rasteriser is three 8-bit channels per pixel
//! `(cov_r, cov_g, cov_b)` describing how much of each subpixel the glyph
//! covered.  At composite time a per-channel Porter-Duff `over` blend
//! mixes the TEXT COLOUR into the live destination:
//!
//! ```text
//! dst.r = src.r * cov.r + dst.r * (1 - cov.r)
//! dst.g = src.g * cov.g + dst.g * (1 - cov.g)
//! dst.b = src.b * cov.b + dst.b * (1 - cov.b)
//! ```
//!
//! The coverage mask is the same regardless of where it lands; the blend
//! naturally produces the correct LCD chroma against any background.
//!
//! See `lcd-subpixel-compositing.md` at the repository root for the full
//! derivation.
//!
//! # Pipeline
//!
//! ```text
//! shape_text (rustybuzz kerning + fallback chain — unchanged)
//!   ↓
//! per-glyph PathStorage → ConvTransform(scale_x_3) → PixfmtGray8
//!   (8-bit grayscale coverage at 3× horizontal resolution)
//!   ↓
//! 5-tap low-pass filter per output channel
//!   ↓
//! packed (cov_r, cov_g, cov_b) 3-byte mask
//! ```

use agg_rust::path_storage::PathStorage;
use agg_rust::trans_affine::TransAffine;

use crate::color::Color;
use crate::draw_ctx::FillRule;

// ---------------------------------------------------------------------------
// LcdBuffer — 3-byte-per-pixel LCD render target (colour + alpha planes)
// ---------------------------------------------------------------------------
//
// Analogue of `Framebuffer` for widgets that opt into
// [`crate::widget::BackbufferMode::LcdCoverage`].  Path and mask fills into
// an `LcdBuffer` go through the 3× horizontal supersample + 5-tap filter
// pipeline and composite per-channel via premultiplied Porter-Duff src-over.
// The buffer has no single per-pixel alpha channel; instead it keeps a
// second 3-byte-per-pixel plane holding one alpha value per subpixel
// channel, so areas that were never painted stay transparent when the
// buffer is composited onto a destination.

/// LCD coverage buffer, row 0 = bottom (matches `Framebuffer` convention).
///
/// **Two planes, 3 bytes per pixel each:**
///
/// - `color`: per-channel **premultiplied** RGB colour accumulated from
///   every paint so far.  `(R_color, G_color, B_color)` where each byte
///   is `channel_color * channel_alpha`.
/// - `alpha`: per-channel alpha/coverage accumulated from every paint so
///   far.  `(R_alpha, G_alpha, B_alpha)` where each byte is the combined
///   opacity of that subpixel column (0 = untouched, 255 = fully opaque).
///
/// **Why per-channel alpha?**  LCD subpixel rendering produces a distinct
/// coverage value per R/G/B channel, so a single per-pixel alpha can't
/// represent the output correctly at glyph edges and fractional image
/// boundaries.  Splitting alpha per-channel gives each subpixel its own
/// Porter-Duff state: paints accumulate independently through the same
/// premultiplied src-over math you'd use for a normal RGBA surface, just
/// three streams instead of one.  A cached `LcdBuffer` with partial
/// coverage can be composited onto any destination without the "black
/// rect where unpainted" failure mode that killed the first-cut design.
pub struct LcdBuffer {
    /// Premultiplied colour plane: 3 bytes `(R, G, B)` per pixel, each byte
    /// being `channel_color * channel_alpha`.
    color: Vec<u8>,
    /// Per-channel alpha plane: 3 bytes `(R_alpha, G_alpha, B_alpha)` per
    /// pixel (0 = untouched, 255 = fully opaque).
    alpha: Vec<u8>,
    /// Buffer width in pixels.
    width: u32,
    /// Buffer height in pixels (rows); row 0 is the bottom row.
    height: u32,
}

impl LcdBuffer {
    /// Create a fully-transparent buffer of `width` × `height` pixels.
    ///
    /// Both planes start out zeroed, i.e. every per-channel alpha is 0, so
    /// compositing a fresh buffer onto a destination leaves the destination
    /// unchanged wherever nothing has been painted yet.
    ///
    /// If a single plane (`width * height * 3` bytes, computed with
    /// saturating arithmetic) would exceed 512 MB, the request is not
    /// honoured: a 1×1 buffer is returned instead.  Debug builds report the
    /// rejected dimensions on stderr; release builds clamp silently.
    pub fn new(width: u32, height: u32) -> Self {
        // Safety net against pathological sizes: better to hand back a tiny
        // buffer (nothing useful renders into it this frame) than to let the
        // allocator attempt a multi-gigabyte allocation.
        const MAX_BYTES: usize = 512 * 1024 * 1024; // 512 MB per plane

        let plane_len = (width as usize)
            .saturating_mul(height as usize)
            .saturating_mul(3);

        // Pick the effective geometry once, then build the buffer in a
        // single place.
        let (width, height, plane_len) = if plane_len <= MAX_BYTES {
            (width, height, plane_len)
        } else {
            #[cfg(debug_assertions)]
            eprintln!(
                "[LcdBuffer] clamped pathological size ({}, {}); \
                 widget bounds likely skipped a size cap",
                width, height,
            );
            (1, 1, 3)
        };

        Self {
            color: vec![0u8; plane_len],
            alpha: vec![0u8; plane_len],
            width,
            height,
        }
    }

    /// Width of the buffer in pixels, as actually allocated (this is 1 when
    /// [`Self::new`] clamped an oversized request).
    #[inline]
    pub fn width(&self) -> u32 {
        self.width
    }
    /// Height of the buffer in pixel rows, as actually allocated (this is 1
    /// when [`Self::new`] clamped an oversized request).
    #[inline]
    pub fn height(&self) -> u32 {
        self.height
    }

    /// Read-only view of the premultiplied colour plane: 3 bytes per pixel,
    /// rows stored bottom-first (row 0 = bottom).
    #[inline]
    pub fn color_plane(&self) -> &[u8] {
        &self.color
    }
    /// Read-only view of the per-channel alpha plane: 3 bytes per pixel,
    /// same row order and pixel layout as the colour plane.
    #[inline]
    pub fn alpha_plane(&self) -> &[u8] {
        &self.alpha
    }
    /// Mutable view of the premultiplied colour plane.  The slice length is
    /// fixed; only the byte contents can be changed.
    #[inline]
    pub fn color_plane_mut(&mut self) -> &mut [u8] {
        &mut self.color
    }
    /// Mutable view of the per-channel alpha plane.  The slice length is
    /// fixed; only the byte contents can be changed.
    #[inline]
    pub fn alpha_plane_mut(&mut self) -> &mut [u8] {
        &mut self.alpha
    }

    /// Both planes mutably in one borrow — for inner loops that update
    /// a pixel's colour and alpha together (image blit, manual composite).
    #[inline]
    pub fn planes_mut(&mut self) -> (&mut [u8], &mut [u8]) {
        (&mut self.color, &mut self.alpha)
    }

    /// Consume the buffer, returning the owned `(color, alpha)` planes
    /// as a pair — used when moving the painted pixels into `Arc`s for
    /// a widget's backbuffer cache or for GPU texture upload.
    pub fn into_planes(self) -> (Vec<u8>, Vec<u8>) {
        (self.color, self.alpha)
    }

    /// Top-row-first copy of the colour plane, suitable for a plain
    /// RGB8 upload or CPU blit.  Row 0 of the output is the VISUAL
    /// top of the buffer (Y-up → Y-down flip).  Allocates a new `Vec`
    /// of the same length as the plane; the buffer itself is unchanged.
    pub fn color_plane_flipped(&self) -> Vec<u8> {
        flip_plane(&self.color, self.width, self.height)
    }

    /// Top-row-first copy of the alpha plane: the same Y-up → Y-down row
    /// flip as [`Self::color_plane_flipped`], applied to the per-channel
    /// alpha bytes.  Allocates a new `Vec`; the buffer itself is unchanged.
    pub fn alpha_plane_flipped(&self) -> Vec<u8> {
        flip_plane(&self.alpha, self.width, self.height)
    }

    /// Flatten the two 3-byte planes into one top-row-first, straight-alpha
    /// RGBA8 image that the existing single-texture blit pipeline
    /// (`SRC_ALPHA, ONE_MINUS_SRC_ALPHA`) can consume.
    ///
    /// Per pixel, the output alpha is `max(R_alpha, G_alpha, B_alpha)` and
    /// each colour byte is the premultiplied colour divided by that maximum
    /// (rounded and clamped to 0..=255).  Pixels whose three alphas are all
    /// zero stay `(0, 0, 0, 0)`.
    ///
    /// The collapse is **lossy** whenever the three subpixel alphas differ:
    /// the per-channel information is exactly what gets thrown away.  It is
    /// exact for the common case where all three alphas agree; the
    /// two-plane blit path (Phase 5.2) is the one that keeps the full
    /// per-channel data through upload and shader.
    pub fn to_rgba8_top_down_collapsed(&self) -> Vec<u8> {
        let w = self.width as usize;
        let h = self.height as usize;
        let mut out = vec![0u8; w * h * 4];
        if w == 0 || h == 0 {
            // Nothing to convert (and a zero chunk size is not allowed below).
            return out;
        }

        // Premultiplied byte → straight-alpha byte for a given pixel alpha.
        let unpremultiply = |premult: u8, af: f32| -> u8 {
            let c = premult as f32 / 255.0;
            ((c / af) * 255.0 + 0.5).clamp(0.0, 255.0) as u8
        };

        let src_stride = w * 3;
        let dst_stride = w * 4;
        // Source rows are stored bottom-first; walking them in reverse while
        // writing output rows in order performs the Y flip.
        let color_rows = self.color.chunks_exact(src_stride).rev();
        let alpha_rows = self.alpha.chunks_exact(src_stride).rev();
        let out_rows = out.chunks_exact_mut(dst_stride);

        for ((out_row, color_row), alpha_row) in out_rows.zip(color_rows).zip(alpha_rows) {
            let pixels = out_row
                .chunks_exact_mut(4)
                .zip(color_row.chunks_exact(3))
                .zip(alpha_row.chunks_exact(3));
            for ((rgba, premult), alphas) in pixels {
                let a = alphas[0].max(alphas[1]).max(alphas[2]);
                if a == 0 {
                    // Fully transparent → leave the RGBA bytes at zero.
                    continue;
                }
                let af = a as f32 / 255.0;
                rgba[0] = unpremultiply(premult[0], af);
                rgba[1] = unpremultiply(premult[1], af);
                rgba[2] = unpremultiply(premult[2], af);
                rgba[3] = a;
            }
        }
        out
    }

    // ── Paint primitives ────────────────────────────────────────────────────
    //
    // These are the foundation operations every higher layer (LcdGfxCtx,
    // path-fill helpers, image blit) eventually composes into.  They write
    // directly into the 3-byte-per-pixel coverage store with no intermediate
    // allocation.

    /// Overwrite the whole buffer with one solid colour.
    ///
    /// A flat clear has no per-subpixel variation, so every pixel receives
    /// the same premultiplied colour triple (`color.rgb * color.a`) and the
    /// same alpha (`color.a`) on all three channels.  Components are clamped
    /// to `0.0..=1.0` before being quantised to bytes.  Previous contents
    /// are replaced, not blended.
    pub fn clear(&mut self, color: Color) {
        // Round-to-nearest quantisation of a unit-range value to a byte.
        let to_byte = |v: f32| (v * 255.0 + 0.5) as u8;

        let alpha = color.a.clamp(0.0, 1.0);
        let premult_rgb = [
            to_byte(color.r.clamp(0.0, 1.0) * alpha),
            to_byte(color.g.clamp(0.0, 1.0) * alpha),
            to_byte(color.b.clamp(0.0, 1.0) * alpha),
        ];
        let alpha_byte = to_byte(alpha);

        self.color
            .chunks_exact_mut(3)
            .for_each(|px| px.copy_from_slice(&premult_rgb));
        self.alpha
            .chunks_exact_mut(3)
            .for_each(|px| px.fill(alpha_byte));
    }

    /// Fill an AGG path through the LCD pipeline: rasterize at 3× X
    /// resolution → 5-tap filter → per-channel src-over composite into
    /// this buffer.  `transform` is applied to `path` before the 3× X
    /// scale (typically the caller's CTM); the path's coordinates are
    /// in the buffer's pixel space (Y-up, origin = bottom-left).
    /// Optional `clip` is a screen-space rect (post-CTM, in mask pixel
    /// coords) — pixels outside it are unaffected.
    ///
    /// Parameters:
    /// - `path`: the path to fill; handed to the mask builder as-is.
    /// - `color`: source colour composited through the resulting mask.
    /// - `transform`: affine transform passed to the mask builder.
    /// - `clip`: optional clip rect, applied both by the mask builder and
    ///   again (converted to integer pixels) at composite time.
    /// - `fill_rule`: fill rule forwarded to the mask builder.
    ///
    /// Does nothing when the buffer has zero width or height.
    ///
    /// First non-text primitive on the buffer.  Future fill / stroke /
    /// image-blit entry points either call this directly (for solid
    /// fills / outlines) or open their own `LcdMaskBuilder` scope when
    /// they need to batch many paths into one mask.
    ///
    /// First-cut implementation: rasterizes at the buffer's full size.
    /// A later optimization can compute the path's bbox and size the
    /// scratch tightly — measurable win for small paths in large
    /// buffers, but architecturally identical and not required for
    /// correctness.
    pub fn fill_path(
        &mut self,
        path: &mut PathStorage,
        color: Color,
        transform: &TransAffine,
        clip: Option<(f64, f64, f64, f64)>,
        fill_rule: FillRule,
    ) {
        if self.width == 0 || self.height == 0 {
            return;
        }
        // The mask is built at the full buffer size, so it is composited
        // below at offset (0, 0).
        let mut builder = LcdMaskBuilder::new(self.width, self.height)
            .with_clip(clip)
            .with_fill_rule(fill_rule);
        builder.with_paths(transform, |add| {
            add(path);
        });
        let mask = builder.finalize();
        // Convert clip → integer pixel rect for composite-time enforcement.
        // The gray-buffer raster clip should already have zeroed coverage
        // outside, but the 5-tap filter can leak ±2 subpixels at clip
        // edges; composite-time clip catches that.
        let clip_i = clip.map(rect_to_pixel_clip);
        self.composite_mask(&mask, color, 0, 0, clip_i);
    }

    /// Composite an [`LcdMask`] into this buffer using per-channel
    /// **premultiplied** Porter-Duff src-over.  Each subpixel column's
    /// effective alpha is `src.a × mask.channel_coverage`, and colour +
    /// alpha both accumulate under the standard premult src-over:
    ///
    /// ```text
    /// eff_a_c        = src.a * mask.c
    /// buf.color_c   := src.c * eff_a_c + buf.color_c * (1 - eff_a_c)
    /// buf.alpha_c   := eff_a_c         + buf.alpha_c * (1 - eff_a_c)
    /// ```
    ///
    /// Parameters:
    /// - `mask`: 3-byte-per-pixel coverage mask to paint through.
    /// - `src`: source colour; components are clamped to `0.0..=1.0`.
    /// - `(dst_x, dst_y)`: the mask's bottom-left in this buffer's Y-up
    ///   pixel grid; mask row `my` writes to buffer row `dst_y + my`.
    ///   Any `i32` value is accepted — parts of the mask that fall outside
    ///   the buffer are simply not drawn.
    /// - `clip`: optional rect in this buffer's integer pixel coords,
    ///   `(x1, y1, x2, y2)`, half-open; writes outside it are suppressed —
    ///   used by widgets that paint inside a clipping parent.
    ///
    /// Only the part of the mask that intersects the clipped buffer area is
    /// visited, and pixels with zero effective alpha on all three channels
    /// are left untouched.
    pub fn composite_mask(
        &mut self,
        mask: &LcdMask,
        src: Color,
        dst_x: i32,
        dst_y: i32,
        clip: Option<(i32, i32, i32, i32)>,
    ) {
        /// One channel of premultiplied src-over, quantised back to a byte:
        /// `src_term + dst * (1 - eff_a)`.
        #[inline]
        fn blend(src_term: f32, eff_a: f32, dst: u8) -> u8 {
            let d = dst as f32 / 255.0;
            ((src_term + d * (1.0 - eff_a)) * 255.0 + 0.5).clamp(0.0, 255.0) as u8
        }

        if mask.width == 0 || mask.height == 0 {
            return;
        }
        let sa = src.a.clamp(0.0, 1.0);
        if sa == 0.0 {
            // Every effective alpha would be zero: nothing can change.
            return;
        }
        let sr = src.r.clamp(0.0, 1.0);
        let sg = src.g.clamp(0.0, 1.0);
        let sb = src.b.clamp(0.0, 1.0);

        // Clip rect intersected with the buffer bounds.
        let dst_w_i = self.width as i32;
        let dst_h_i = self.height as i32;
        let (cx1, cy1, cx2, cy2) = match clip {
            Some((x1, y1, x2, y2)) => (x1.max(0), y1.max(0), x2.min(dst_w_i), y2.min(dst_h_i)),
            None => (0, 0, dst_w_i, dst_h_i),
        };
        if cx1 >= cx2 || cy1 >= cy2 {
            return;
        }

        // Visible window of the mask, in mask coordinates.  Computed in i64
        // so that extreme offsets cannot overflow; the loops below then only
        // touch pixels that are actually drawn.
        let (ox, oy) = (i64::from(dst_x), i64::from(dst_y));
        let mx_lo = (i64::from(cx1) - ox).max(0);
        let mx_hi = (i64::from(cx2) - ox).min(mask.width as i64);
        let my_lo = (i64::from(cy1) - oy).max(0);
        let my_hi = (i64::from(cy2) - oy).min(mask.height as i64);
        if mx_lo >= mx_hi || my_lo >= my_hi {
            return;
        }

        let mask_w = mask.width as usize;
        let dst_w = self.width as usize;
        for my in my_lo..my_hi {
            // `oy + my` lies in `cy1..cy2`, hence is non-negative.
            let dst_row = (oy + my) as usize * dst_w;
            let mask_row = my as usize * mask_w;
            for mx in mx_lo..mx_hi {
                let mi = (mask_row + mx as usize) * 3;
                // Per-channel effective alpha = src colour alpha × mask coverage.
                let ea_r = sa * (mask.data[mi] as f32 / 255.0);
                let ea_g = sa * (mask.data[mi + 1] as f32 / 255.0);
                let ea_b = sa * (mask.data[mi + 2] as f32 / 255.0);
                if ea_r == 0.0 && ea_g == 0.0 && ea_b == 0.0 {
                    continue;
                }

                // `ox + mx` lies in `cx1..cx2`, hence is non-negative.
                let di = (dst_row + (ox + mx) as usize) * 3;
                // Colour stream: premultiplied source contribution over the
                // existing premultiplied colour.
                self.color[di] = blend(sr * ea_r, ea_r, self.color[di]);
                self.color[di + 1] = blend(sg * ea_g, ea_g, self.color[di + 1]);
                self.color[di + 2] = blend(sb * ea_b, ea_b, self.color[di + 2]);
                // Alpha stream: same Porter-Duff composite, independently
                // per channel.
                self.alpha[di] = blend(ea_r, ea_r, self.alpha[di]);
                self.alpha[di + 1] = blend(ea_g, ea_g, self.alpha[di + 1]);
                self.alpha[di + 2] = blend(ea_b, ea_b, self.alpha[di + 2]);
            }
        }
    }

    /// Composite an [`LcdMask`] using a per-pixel source colour callback.
    ///
    /// Same per-channel premultiplied src-over as a solid-colour mask
    /// composite, except that the source colour is obtained from `color_at`
    /// for every pixel.  This keeps the LCD coverage pipeline shared for
    /// solid and gradient fills while allowing colour to vary across the
    /// mask.
    ///
    /// Parameters:
    /// - `mask`: 3-byte-per-pixel coverage mask to paint through.
    /// - `(dst_x, dst_y)`: the mask's bottom-left in this buffer's Y-up
    ///   pixel grid.  Any `i32` value is accepted — parts of the mask that
    ///   fall outside the buffer are simply not drawn.
    /// - `clip`: optional half-open `(x1, y1, x2, y2)` rect in this buffer's
    ///   integer pixel coords; writes outside it are suppressed.
    /// - `color_at`: called with destination pixel coordinates `(x, y)` in
    ///   this buffer's Y-up pixel space.  It is invoked once for every mask
    ///   pixel that lands inside the clipped buffer area — bottom row first,
    ///   left to right — including pixels whose coverage turns out to be
    ///   zero.  Returned components are clamped to `0.0..=1.0`.
    pub fn composite_mask_with_color<F>(
        &mut self,
        mask: &LcdMask,
        dst_x: i32,
        dst_y: i32,
        clip: Option<(i32, i32, i32, i32)>,
        mut color_at: F,
    ) where
        F: FnMut(i32, i32) -> Color,
    {
        /// One channel of premultiplied src-over, quantised back to a byte:
        /// `src_term + dst * (1 - eff_a)`.
        #[inline]
        fn blend(src_term: f32, eff_a: f32, dst: u8) -> u8 {
            let d = dst as f32 / 255.0;
            ((src_term + d * (1.0 - eff_a)) * 255.0 + 0.5).clamp(0.0, 255.0) as u8
        }

        if mask.width == 0 || mask.height == 0 {
            return;
        }

        // Clip rect intersected with the buffer bounds.
        let dst_w_i = self.width as i32;
        let dst_h_i = self.height as i32;
        let (cx1, cy1, cx2, cy2) = match clip {
            Some((x1, y1, x2, y2)) => (x1.max(0), y1.max(0), x2.min(dst_w_i), y2.min(dst_h_i)),
            None => (0, 0, dst_w_i, dst_h_i),
        };
        if cx1 >= cx2 || cy1 >= cy2 {
            return;
        }

        // Visible window of the mask, in mask coordinates.  Computed in i64
        // so that extreme offsets cannot overflow; the loops below then only
        // touch pixels that are actually drawn.
        let (ox, oy) = (i64::from(dst_x), i64::from(dst_y));
        let mx_lo = (i64::from(cx1) - ox).max(0);
        let mx_hi = (i64::from(cx2) - ox).min(mask.width as i64);
        let my_lo = (i64::from(cy1) - oy).max(0);
        let my_hi = (i64::from(cy2) - oy).min(mask.height as i64);
        if mx_lo >= mx_hi || my_lo >= my_hi {
            return;
        }

        let mask_w = mask.width as usize;
        let dst_w = self.width as usize;
        for my in my_lo..my_hi {
            // `dy` lies in `cy1..cy2`: non-negative and within i32 range.
            let dy = oy + my;
            let dst_row = dy as usize * dst_w;
            let mask_row = my as usize * mask_w;
            for mx in mx_lo..mx_hi {
                // `dx` lies in `cx1..cx2`: non-negative and within i32 range.
                let dx = ox + mx;
                let mi = (mask_row + mx as usize) * 3;

                // The callback runs before the coverage test so that it sees
                // every in-clip pixel, exactly once, in scan order.
                let src = color_at(dx as i32, dy as i32);
                let sa = src.a.clamp(0.0, 1.0);
                let sr = src.r.clamp(0.0, 1.0);
                let sg = src.g.clamp(0.0, 1.0);
                let sb = src.b.clamp(0.0, 1.0);

                // Per-channel effective alpha = src colour alpha × mask coverage.
                let ea_r = sa * (mask.data[mi] as f32 / 255.0);
                let ea_g = sa * (mask.data[mi + 1] as f32 / 255.0);
                let ea_b = sa * (mask.data[mi + 2] as f32 / 255.0);
                if ea_r == 0.0 && ea_g == 0.0 && ea_b == 0.0 {
                    continue;
                }

                let di = (dst_row + dx as usize) * 3;
                self.color[di] = blend(sr * ea_r, ea_r, self.color[di]);
                self.color[di + 1] = blend(sg * ea_g, ea_g, self.color[di + 1]);
                self.color[di + 2] = blend(sb * ea_b, ea_b, self.color[di + 2]);
                self.alpha[di] = blend(ea_r, ea_r, self.alpha[di]);
                self.alpha[di + 1] = blend(ea_g, ea_g, self.alpha[di + 1]);
                self.alpha[di + 2] = blend(ea_b, ea_b, self.alpha[di + 2]);
            }
        }
    }

    /// Composite `src` onto this buffer at offset `(dst_x, dst_y)` via
    /// **per-channel premultiplied src-over**.  Each of the three subpixel
    /// columns applies `src.ch_alpha` as its own Porter-Duff weight:
    ///
    /// ```text
    /// buf.color_c := src.color_c + buf.color_c * (1 - src.alpha_c)
    /// buf.alpha_c := src.alpha_c + buf.alpha_c * (1 - src.alpha_c)
    /// ```
    ///
    /// Parameters:
    /// - `src`: the buffer to draw; its colour plane is taken as already
    ///   premultiplied by its alpha plane.
    /// - `(dst_x, dst_y)`: where `src`'s bottom-left pixel lands in this
    ///   buffer's Y-up pixel grid.  Any `i32` value is accepted — parts of
    ///   `src` that fall outside this buffer are simply not drawn.
    /// - `clip`: optional half-open `(x1, y1, x2, y2)` rect in this buffer's
    ///   integer pixel coords; writes outside it are suppressed.
    ///
    /// Untouched source pixels (alpha zero on every channel) don't
    /// change the buffer at all — exactly the semantic that makes a
    /// popped layer leave unpainted areas alone, no seed trick needed.
    pub fn composite_buffer(
        &mut self,
        src: &LcdBuffer,
        dst_x: i32,
        dst_y: i32,
        clip: Option<(i32, i32, i32, i32)>,
    ) {
        /// One channel of premultiplied src-over, quantised back to a byte:
        /// `src_term + dst * (1 - src_a)`.
        #[inline]
        fn blend(src_term: f32, src_a: f32, dst: u8) -> u8 {
            let d = dst as f32 / 255.0;
            ((src_term + d * (1.0 - src_a)) * 255.0 + 0.5).clamp(0.0, 255.0) as u8
        }

        if src.width == 0 || src.height == 0 {
            return;
        }

        // Clip rect intersected with the buffer bounds.
        let dst_w_i = self.width as i32;
        let dst_h_i = self.height as i32;
        let (cx1, cy1, cx2, cy2) = match clip {
            Some((x1, y1, x2, y2)) => (x1.max(0), y1.max(0), x2.min(dst_w_i), y2.min(dst_h_i)),
            None => (0, 0, dst_w_i, dst_h_i),
        };
        if cx1 >= cx2 || cy1 >= cy2 {
            return;
        }

        // Visible window of the source, in source coordinates.  Computed in
        // i64 so that extreme offsets cannot overflow; the loops below then
        // only touch pixels that are actually drawn.
        let (ox, oy) = (i64::from(dst_x), i64::from(dst_y));
        let sx_lo = (i64::from(cx1) - ox).max(0);
        let sx_hi = (i64::from(cx2) - ox).min(i64::from(src.width));
        let sy_lo = (i64::from(cy1) - oy).max(0);
        let sy_hi = (i64::from(cy2) - oy).min(i64::from(src.height));
        if sx_lo >= sx_hi || sy_lo >= sy_hi {
            return;
        }

        let src_w = src.width as usize;
        let dst_w = self.width as usize;
        for sy in sy_lo..sy_hi {
            // `oy + sy` lies in `cy1..cy2`, hence is non-negative.
            let dst_row = (oy + sy) as usize * dst_w;
            let src_row = sy as usize * src_w;
            for sx in sx_lo..sx_hi {
                let si = (src_row + sx as usize) * 3;

                let sa_r = src.alpha[si] as f32 / 255.0;
                let sa_g = src.alpha[si + 1] as f32 / 255.0;
                let sa_b = src.alpha[si + 2] as f32 / 255.0;
                if sa_r == 0.0 && sa_g == 0.0 && sa_b == 0.0 {
                    // Nothing was painted here in `src`.
                    continue;
                }

                let sc_r = src.color[si] as f32 / 255.0;
                let sc_g = src.color[si + 1] as f32 / 255.0;
                let sc_b = src.color[si + 2] as f32 / 255.0;

                // `ox + sx` lies in `cx1..cx2`, hence is non-negative.
                let di = (dst_row + (ox + sx) as usize) * 3;
                // src is already premultiplied, so `sc + bc*(1-sa)` is the
                // plain Porter-Duff expression — no additional modulation.
                self.color[di] = blend(sc_r, sa_r, self.color[di]);
                self.color[di + 1] = blend(sc_g, sa_g, self.color[di + 1]);
                self.color[di + 2] = blend(sc_b, sa_b, self.color[di + 2]);
                self.alpha[di] = blend(sa_r, sa_r, self.alpha[di]);
                self.alpha[di + 1] = blend(sa_g, sa_g, self.alpha[di + 1]);
                self.alpha[di + 2] = blend(sa_b, sa_b, self.alpha[di + 2]);
            }
        }
    }
}

// ── helpers ───────────────────────────────────────────────────────────────

/// Y-flip a 3-byte/pixel plane (Y-up row 0 = bottom → top-row-first).
///
/// Returns a new vector of the same length as `src` in which row `y` of the
/// input becomes row `height - 1 - y` of the output.  Only the first
/// `height` rows of `width * 3` bytes are copied; any bytes of `src` beyond
/// that stay zero in the result.  With `width == 0` or `height == 0` the
/// result is all zeros.
///
/// # Panics
///
/// Panics if `src` holds fewer than `width * 3 * height` bytes.
fn flip_plane(src: &[u8], width: u32, height: u32) -> Vec<u8> {
    let mut out = vec![0u8; src.len()];
    let rows = height as usize;
    // Widen before multiplying: `width * 3` evaluated in `u32` overflows for
    // `width > u32::MAX / 3`, which is reachable for zero-height buffers.
    let row_bytes = (width as usize).saturating_mul(3);
    if rows == 0 || row_bytes == 0 {
        return out;
    }

    // A saturated length simply fails the range check below, the same way
    // any other too-short input does.
    let used = row_bytes.saturating_mul(rows);
    let src_rows = src[..used].chunks_exact(row_bytes);
    let dst_rows = out[..used].chunks_exact_mut(row_bytes).rev();
    for (dst_row, src_row) in dst_rows.zip(src_rows) {
        dst_row.copy_from_slice(src_row);
    }
    out
}

mod mask;
#[cfg(test)]
mod tests;

pub use mask::{
    composite_lcd_mask, identity_xform, rasterize_lcd_mask, rasterize_lcd_mask_multi,
    rasterize_text_lcd_cached, rect_to_pixel_clip, CachedLcdText, LcdMask, LcdMaskBuilder,
};