vello_common 0.0.8

Core data structures and utilities shared across the Vello rendering, including geometry processing and tiling logic.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
// Copyright 2025 the Vello Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

//! Rendering strips.

use crate::flatten::Line;
use crate::peniko::Fill;
use crate::tile::{Tile, Tiles};
use crate::util::f32_to_u8;
use alloc::vec::Vec;
use fearless_simd::*;

/// A strip.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Strip {
    /// The x coordinate of the strip, in user coordinates.
    pub x: u16,
    /// The y coordinate of the strip, in user coordinates.
    pub y: u16,
    /// Packed alpha index and fill gap flag.
    ///
    /// Bit layout (u32):
    /// - bit 31: `fill_gap` (See `Strip::fill_gap()`).
    /// - bits 0..=30: `alpha_idx` (See `Strip::alpha_idx()`).
    packed_alpha_idx_fill_gap: u32,
}

impl Strip {
    /// The bit mask for `fill_gap` packed into `packed_alpha_idx_fill_gap`.
    const FILL_GAP_MASK: u32 = 1 << 31;

    /// Creates a new strip.
    pub fn new(x: u16, y: u16, alpha_idx: u32, fill_gap: bool) -> Self {
        // Ensure `alpha_idx` does not collide with the fill flag bit.
        assert!(
            alpha_idx & Self::FILL_GAP_MASK == 0,
            "`alpha_idx` too large"
        );
        let fill_gap = u32::from(fill_gap) << 31;
        Self {
            x,
            y,
            packed_alpha_idx_fill_gap: alpha_idx | fill_gap,
        }
    }

    /// Return whether the strip is a sentinel strip.
    pub fn is_sentinel(&self) -> bool {
        self.x == u16::MAX
    }

    /// Return the y coordinate of the strip, in strip units.
    pub fn strip_y(&self) -> u16 {
        self.y / Tile::HEIGHT
    }

    /// Returns the alpha index.
    #[inline(always)]
    pub fn alpha_idx(&self) -> u32 {
        self.packed_alpha_idx_fill_gap & !Self::FILL_GAP_MASK
    }

    /// Sets the alpha index.
    ///
    /// Note that the largest value that can be stored in the alpha index is `u32::MAX << 1`, as the
    /// highest bit is reserved for `fill_gap`.
    #[inline(always)]
    pub fn set_alpha_idx(&mut self, alpha_idx: u32) {
        // Ensure `alpha_idx` does not collide with the fill flag bit.
        assert!(
            alpha_idx & Self::FILL_GAP_MASK == 0,
            "`alpha_idx` too large"
        );
        let fill_gap = self.packed_alpha_idx_fill_gap & Self::FILL_GAP_MASK;
        self.packed_alpha_idx_fill_gap = alpha_idx | fill_gap;
    }

    /// Returns whether the gap that lies between this strip and the previous in the same row should be filled.
    #[inline(always)]
    pub fn fill_gap(&self) -> bool {
        (self.packed_alpha_idx_fill_gap & Self::FILL_GAP_MASK) != 0
    }

    /// Sets whether the gap that lies between this strip and the previous in the same row should be filled.
    #[inline(always)]
    pub fn set_fill_gap(&mut self, fill: bool) {
        let fill = u32::from(fill) << 31;
        self.packed_alpha_idx_fill_gap =
            (self.packed_alpha_idx_fill_gap & !Self::FILL_GAP_MASK) | fill;
    }

    /// When early culling is active, geometry fully to the left of the viewport creates no tiles.
    /// However, if that geometry has a non-zero winding (e.g. a large shape surrounding the
    /// viewport), then we must output strips for those fills.
    ///
    /// We reconstruct this "background" fill using `row_windings` (the winding at x=0) to emit solid
    /// strips for:
    ///      1. All rows vertically above the first visible tile.
    ///      2. 'Captive' rows between two tile-containing rows.
    ///      3. All rows vertically below the last visible tile.
    #[inline(always)]
    fn emit_culled_background<F>(
        start: u16,
        end: u16,
        strips: &mut Vec<Self>,
        alphas: &mut Vec<u8>,
        windings: &crate::tile::CulledWindings,
        mut should_fill: F,
    ) where
        F: FnMut(i32) -> bool,
    {
        windings.for_active_rows_in_range(start as usize, end as usize, |row| {
            if should_fill(windings.coarse[row] as i32) {
                let y_pos = row as u16 * Tile::HEIGHT;
                strips.push(Self::new(0, y_pos, alphas.len() as u32, false));
                alphas.extend([255_u8; Tile::HEIGHT as usize * Tile::WIDTH as usize]);
                // TODO: Clamp to the scene width instead of u16::MAX; in the future there might
                // not be clamping on the x.
                strips.push(Self::new(u16::MAX, y_pos, alphas.len() as u32, true));
            }
        });
    }
}

/// Render the tiles stored in `tiles` into the strip and alpha buffer.
pub fn render(
    level: Level,
    tiles: &Tiles,
    strip_buf: &mut Vec<Strip>,
    alpha_buf: &mut Vec<u8>,
    fill_rule: Fill,
    aliasing_threshold: Option<u8>,
    lines: &[Line],
) {
    dispatch!(level, simd => render_impl(simd,
                                         tiles,
                                         strip_buf,
                                         alpha_buf,
                                         fill_rule,
                                         aliasing_threshold,
                                         lines));
}

#[inline(always)]
fn render_impl<S: Simd>(
    s: S,
    tiles: &Tiles,
    strip_buf: &mut Vec<Strip>,
    alpha_buf: &mut Vec<u8>,
    fill_rule: Fill,
    aliasing_threshold: Option<u8>,
    lines: &[Line],
) {
    let row_windings = &tiles.windings.coarse;
    let has_culled_tiles = tiles.has_culled_tiles();

    // If no tiles were culled and the tile buffer is empty, we can simply exit. If tiles were
    // culled, the tile buffer may be empty but there may be winding produced by culled geometry
    // left of the viewport that must be checked for filling.
    if !has_culled_tiles && tiles.is_empty() {
        return;
    }

    let should_fill = |winding: i32| match fill_rule {
        Fill::NonZero => winding != 0,
        Fill::EvenOdd => winding % 2 != 0,
    };

    // Helper to handle "captive strips". When a row has tiles, but the first tile
    // is not at the left edge of the viewport (x != 0), we must emit a solid strip
    // from x=0 to that tile if the coarse winding dictates a fill.
    let emit_captive_strip =
        |y: u16, is_left_viewport: bool, strips: &mut Vec<Strip>, alphas: &mut Vec<u8>| {
            let coarse_wd = tiles.windings.coarse[y as usize] as i32;

            if should_fill(coarse_wd) && !is_left_viewport {
                strips.push(Strip::new(0, y * Tile::HEIGHT, alphas.len() as u32, false));
                alphas.extend([255_u8; Tile::HEIGHT as usize * Tile::WIDTH as usize]);
            }

            let mut acc = f32x4::splat(s, coarse_wd as f32);
            if is_left_viewport {
                let fine_winding: f32x4<_> = tiles.windings.partial[y as usize].simd_into(s);
                acc += fine_winding;
            }

            (coarse_wd, acc)
        };

    // The accumulated tile winding delta. A line that crosses the top edge of a tile
    // increments the delta if the line is directed upwards, and decrements it if goes
    // downwards. Horizontal lines leave it unchanged.
    let mut winding_delta: i32 = 0;

    // The previous tile visited.
    let mut prev_tile = if has_culled_tiles && tiles.is_empty() {
        Tile::SENTINEL
    } else {
        *tiles.get(0)
    };

    // The accumulated (fractional) winding of the tile-sized location we're currently at.
    // Note multiple tiles can be at the same location.
    // Note that we are also implicitly assuming here that the tile height exactly fits into a
    // SIMD vector (i.e. 128 bits).
    let mut location_winding = [f32x4::splat(s, 0.0); Tile::WIDTH as usize];
    // The accumulated (fractional) windings at this location's right edge. When we move to the
    // next location, this is splatted to that location's starting winding.
    let mut accumulated_winding = f32x4::splat(s, 0.0);

    let left_viewport = prev_tile.x == 0;
    if has_culled_tiles {
        let row_max = prev_tile.y.min(row_windings.len() as u16);
        Strip::emit_culled_background(
            0,
            row_max,
            strip_buf,
            alpha_buf,
            &tiles.windings,
            should_fill,
        );
        if tiles.is_empty() {
            return;
        }
        let (wd, acc) = emit_captive_strip(prev_tile.y, left_viewport, strip_buf, alpha_buf);
        winding_delta = wd;
        accumulated_winding = acc;
        location_winding = [accumulated_winding; Tile::WIDTH as usize];
    }

    // The strip we're building.
    let mut strip = Strip::new(
        prev_tile.x * Tile::WIDTH,
        prev_tile.y * Tile::HEIGHT,
        alpha_buf.len() as u32,
        should_fill(winding_delta) && !left_viewport,
    );

    for (tile_idx, tile) in tiles.iter().copied().chain([Tile::SENTINEL]).enumerate() {
        let line = lines[tile.line_idx() as usize];
        let tile_left_x = f32::from(tile.x) * f32::from(Tile::WIDTH);
        let tile_top_y = f32::from(tile.y) * f32::from(Tile::HEIGHT);
        let p0_x = line.p0.x - tile_left_x;
        let p0_y = line.p0.y - tile_top_y;
        let p1_x = line.p1.x - tile_left_x;
        let p1_y = line.p1.y - tile_top_y;

        // Push out the winding as an alpha mask when we move to the next location (i.e., a tile
        // without the same location).
        if !prev_tile.same_loc(&tile) {
            match fill_rule {
                Fill::NonZero => {
                    let p1 = f32x4::splat(s, 0.5);
                    let p2 = f32x4::splat(s, 255.0);

                    #[expect(clippy::needless_range_loop, reason = "dimension clarity")]
                    for x in 0..Tile::WIDTH as usize {
                        let area = location_winding[x];
                        let coverage = area.abs();
                        let mulled = coverage.mul_add(p2, p1);
                        // Note that we are not storing the location winding here but the actual
                        // alpha value as f32, so we reuse the variable as a temporary storage.
                        // Also note that we need the `min` here because the winding can be > 1
                        // and thus the calculated alpha value need to be clamped to 255.
                        location_winding[x] = mulled.min(p2);
                    }
                }
                Fill::EvenOdd => {
                    let p1 = f32x4::splat(s, 0.5);
                    let p2 = f32x4::splat(s, -2.0);
                    let p3 = f32x4::splat(s, 255.0);

                    #[expect(clippy::needless_range_loop, reason = "dimension clarity")]
                    for x in 0..Tile::WIDTH as usize {
                        let area = location_winding[x];
                        let im1 = area.mul_add(p1, p1).floor();
                        let coverage = p2.mul_add(im1, area).abs();
                        let mulled = p3.mul_add(coverage, p1);
                        // TODO: It is possible that, unlike for `NonZero`, we don't need the `min`
                        // here.
                        location_winding[x] = mulled.min(p3);
                    }
                }
            };

            let p1 = s.combine_f32x4(location_winding[0], location_winding[1]);
            let p2 = s.combine_f32x4(location_winding[2], location_winding[3]);

            let mut u8_vals = f32_to_u8(s.combine_f32x8(p1, p2));

            if let Some(aliasing_threshold) = aliasing_threshold {
                u8_vals = s.select_u8x16(
                    u8_vals.simd_ge(u8x16::splat(s, aliasing_threshold)),
                    u8x16::splat(s, 255),
                    u8x16::splat(s, 0),
                );
            }

            alpha_buf.extend_from_slice(u8_vals.as_slice());

            #[expect(clippy::needless_range_loop, reason = "dimension clarity")]
            for x in 0..Tile::WIDTH as usize {
                location_winding[x] = accumulated_winding;
            }
        }

        // Push out the strip if we're moving to a next strip.
        if !prev_tile.same_loc(&tile) && !prev_tile.prev_loc(&tile) {
            debug_assert_eq!(
                (prev_tile.x as u32 + 1) * Tile::WIDTH as u32 - strip.x as u32,
                ((alpha_buf.len() - strip.alpha_idx() as usize) / usize::from(Tile::HEIGHT)) as u32,
                "The number of columns written to the alpha buffer should equal the number of columns spanned by this strip."
            );
            strip_buf.push(strip);

            let is_sentinel = tile_idx == tiles.len() as usize;
            let left_viewport = tile.x == 0;
            if !prev_tile.same_row(&tile) {
                // Emit a final strip in the row if there is non-zero winding for the sparse fill,
                // or unconditionally if we've reached the sentinel tile to end the path (the
                // `alpha_idx` field is used for width calculations).
                if winding_delta != 0 || is_sentinel {
                    strip_buf.push(Strip::new(
                        u16::MAX,
                        prev_tile.y * Tile::HEIGHT,
                        alpha_buf.len() as u32,
                        should_fill(winding_delta),
                    ));
                }

                // Logic identical to the start (see above): fill any vertical gaps (empty rows)
                // between the previous and current tile using the row windings.
                if has_culled_tiles && !is_sentinel {
                    Strip::emit_culled_background(
                        prev_tile.y + 1,
                        tile.y,
                        strip_buf,
                        alpha_buf,
                        &tiles.windings,
                        should_fill,
                    );

                    let (wd, acc) = emit_captive_strip(tile.y, left_viewport, strip_buf, alpha_buf);
                    winding_delta = wd;
                    accumulated_winding = acc;
                } else {
                    winding_delta = 0;
                    accumulated_winding = f32x4::splat(s, 0.0);
                };

                #[expect(clippy::needless_range_loop, reason = "dimension clarity")]
                for x in 0..Tile::WIDTH as usize {
                    location_winding[x] = accumulated_winding;
                }
            } else {
                // Note: this fill is mathematically not necessary. It provides a way to reduce
                // accumulation of float rounding errors.
                accumulated_winding = f32x4::splat(s, winding_delta as f32);
            }

            if is_sentinel {
                break;
            }

            strip = Strip::new(
                tile.x * Tile::WIDTH,
                tile.y * Tile::HEIGHT,
                alpha_buf.len() as u32,
                should_fill(winding_delta) && !left_viewport,
            );
        }
        prev_tile = tile;

        // TODO: horizontal geometry has no impact on winding. This branch will be removed when
        // horizontal geometry is culled at the tile-generation stage.
        if p0_y == p1_y {
            continue;
        }

        // Lines moving upwards (in a y-down coordinate system) add to winding; lines moving
        // downwards subtract from winding.
        let sign = (p0_y - p1_y).signum();

        // Calculate winding / pixel area coverage.
        //
        // Conceptually, horizontal rays are shot from left to right. Every time the ray crosses a
        // line that is directed upwards (decreasing `y`), the winding is incremented. Every time
        // the ray crosses a line moving downwards (increasing `y`), the winding is decremented.
        // The fractional area coverage of a pixel is the integral of the winding within it.
        //
        // Practically, to calculate this, each pixel is considered individually, and we determine
        // whether the line moves through this pixel. The line's y-delta within this pixel is
        // accumulated and added to the area coverage of pixels to the right. Within the pixel
        // itself, the area to the right of the line segment forms a trapezoid (or a triangle in
        // the degenerate case). The area of this trapezoid is added to the pixel's area coverage.
        //
        // For example, consider the following pixel square, with a line indicated by asterisks
        // starting inside the pixel and crossing its bottom edge. The area covered is the
        // trapezoid on the bottom-right enclosed by the line and the pixel square. The area is
        // positive if the line moves down, and negative otherwise.
        //
        //  __________________
        //  |                |
        //  |         *------|
        //  |        *       |
        //  |       *        |
        //  |      *         |
        //  |     *          |
        //  |    *           |
        //  |___*____________|
        //     *
        //    *

        let (line_top_y, line_top_x, line_bottom_y, line_bottom_x) = if p0_y < p1_y {
            (p0_y, p0_x, p1_y, p1_x)
        } else {
            (p1_y, p1_x, p0_y, p0_x)
        };

        let y_slope = (line_bottom_y - line_top_y) / (line_bottom_x - line_top_x);
        let x_slope = 1. / y_slope;

        winding_delta += sign as i32 * i32::from(tile.winding());

        let line_top_y = f32x4::splat(s, line_top_y);
        let line_bottom_y = f32x4::splat(s, line_bottom_y);

        // See the explanation of this term on the `line_px_left_yx` and `line_px_right_yx`
        // variables below.
        let line_px_base_yx = line_top_y.mul_add(-x_slope, line_top_x);

        let px_top_y = f32x4::simd_from(s, [0., 1., 2., 3.]);
        let px_bottom_y = 1. + px_top_y;

        let ymin = line_top_y.max(px_top_y);
        let ymax = line_bottom_y.min(px_bottom_y);

        let mut acc = f32x4::splat(s, 0.0);

        for x_idx in 0..Tile::WIDTH {
            let x_idx_s = f32x4::splat(s, x_idx as f32);
            let px_left_x = x_idx_s;
            let px_right_x = 1.0 + x_idx_s;

            // The y-coordinate of the intersections between the line and the pixel's left and
            // right edges respectively.
            //
            // There is some subtlety going on here: `y_slope` will usually be finite, but will
            // be `inf` for purely vertical lines (`p0_x == p1_x`).
            //
            // In the case of `inf`, the resulting slope calculation will be `-inf` or `inf`
            // depending on whether the pixel edge is left or right of the line, respectively
            // (from the viewport's coordinate system perspective). The `min` and `max`
            // y-clamping logic generalizes nicely, as a pixel edge to the left of the line is
            // clamped to `ymin`, and a pixel edge to the right is clamped to `ymax`.
            //
            // In the special case where a vertical line and pixel edge are at the exact same
            // x-position (collinear), the line belongs to the pixel on whose _left_ edge it is
            // situated. The resulting slope calculation for the edge the line is situated on
            // will be NaN, as `0 * inf` results in NaN. This is true for both the left and
            // right edge.
            //
            // We know `ymin` and `ymax` are finite. We require the `max` operation to pick `ymin`
            // if its first operand is NaN. On x86, that maps to the semantics of `_mm_max_ps`,
            // which `f32x4::max` emits: that instruction takes element-wise
            // `if first > second { first } else { second }`. For AArch64, we do require the
            // `f32x4::max_precise` semantics (as `vmax_f32` returns NaN if either operand is NaN);
            // however, for AArch64 the precise version should be comparatively less expensive than
            // on x86. For `min`, we then know both operands are finite, so we can unambiguously
            // use the relaxed version. If this ever breaks, tests should fail loudly, because NaNs
            // happen a lot here!
            trait F32x4MaxExt {
                fn max_if_first_nan_take_second(self, rhs: Self) -> Self;
            }
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            impl<S: Simd> F32x4MaxExt for f32x4<S> {
                #[inline(always)]
                fn max_if_first_nan_take_second(self, rhs: Self) -> Self {
                    self.max(rhs)
                }
            }
            #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
            impl<S: Simd> F32x4MaxExt for f32x4<S> {
                #[inline(always)]
                fn max_if_first_nan_take_second(self, rhs: Self) -> Self {
                    self.max_precise(rhs)
                }
            }
            let line_px_left_y = (px_left_x - line_top_x)
                .mul_add(y_slope, line_top_y)
                .max_if_first_nan_take_second(ymin)
                .min(ymax);
            let line_px_right_y = (px_right_x - line_top_x)
                .mul_add(y_slope, line_top_y)
                .max_if_first_nan_take_second(ymin)
                .min(ymax);

            // For each pixel we calculate the x-coordinates of the left- and rightmost points on
            // the line segment within that pixel. We do this based on the y-offsets of those two
            // points from the top of the line. This can be calculated as, e.g.,
            // `(line_px_left_y - line_top_y) * x_slope + line_top_x`.
            //
            // Rather than calculating that y-offset twice for each pixel within the loop through
            // subtracting from the points' y-coordinates, we get rid of that subtraction by baking
            // it into the algebraic "base" x-coordinate `line_px_base_yx` calculated above the
            // loop. When adding that term to `y * x_slope` it gives the x-coordinate of the point
            // along the line.
            //
            // Note `x_slope` is always finite, as horizontal geometry is elided.
            let line_px_left_yx = line_px_left_y.mul_add(x_slope, line_px_base_yx);
            let line_px_right_yx = line_px_right_y.mul_add(x_slope, line_px_base_yx);
            let h = (line_px_right_y - line_px_left_y).abs();

            // The trapezoidal area enclosed between the line and the right edge of the pixel
            // square. More straightforwardly written as follows, but the `madd` is faster.
            // 0.5 * h * (2. * px_right_x - line_px_right_yx - line_px_left_yx).
            let area = h * (line_px_right_yx + line_px_left_yx).mul_add(-0.5, px_right_x);
            location_winding[x_idx as usize] += area.mul_add(sign, acc);
            acc = h.mul_add(sign, acc);
        }

        accumulated_winding += acc;
    }

    if has_culled_tiles {
        Strip::emit_culled_background(
            (prev_tile.y + 1).min(row_windings.len() as u16),
            row_windings.len() as u16,
            strip_buf,
            alpha_buf,
            &tiles.windings,
            should_fill,
        );
    }
}