meshopt_rs/
overdraw.rs

//! Overdraw analysis and optimization

use crate::quantize::quantize_unorm;
use crate::util::zero_inverse;
use crate::vertex::{calc_pos_extents, Position};
use crate::Vector3;

const VIEWPORT: usize = 256;

#[derive(Default)]
pub struct OverdrawStatistics {
    pub pixels_covered: u32,
    pub pixels_shaded: u32,
    /// Shaded pixels / covered pixels; best case is 1.0
    pub overdraw: f32,
}

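/// Per-pixel depth values and overdraw counters for the software rasterizer.
///
/// The innermost dimension of 2 keeps separate layers for front-facing and
/// back-facing triangles; back faces are rasterized with reversed depth.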
struct OverdrawBuffer {
    z: [[[f32; 2]; VIEWPORT]; VIEWPORT],
    overdraw: [[[u32; 2]; VIEWPORT]; VIEWPORT],
}

impl Default for OverdrawBuffer {
    fn default() -> Self {
        Self {
            z: [[[0.0; 2]; VIEWPORT]; VIEWPORT],
            overdraw: [[[0; 2]; VIEWPORT]; VIEWPORT],
        }
    }
}

fn compute_depth_gradients(v1: Vector3, v2: Vector3, v3: Vector3) -> (f32, f32, f32) {
    // z2 = z1 + dzdx * (x2 - x1) + dzdy * (y2 - y1)
    // z3 = z1 + dzdx * (x3 - x1) + dzdy * (y3 - y1)
    // (x2-x1 y2-y1)(dzdx) = (z2-z1)
    // (x3-x1 y3-y1)(dzdy)   (z3-z1)
    // we'll solve it with Cramer's rule
    let det = (v2.x - v1.x) * (v3.y - v1.y) - (v2.y - v1.y) * (v3.x - v1.x);
    let invdet = zero_inverse(det);

    let dzdx = ((v2.z - v1.z) * (v3.y - v1.y) - (v2.y - v1.y) * (v3.z - v1.z)) * invdet;
    let dzdy = ((v2.x - v1.x) * (v3.z - v1.z) - (v2.z - v1.z) * (v3.x - v1.x)) * invdet;

    (det, dzdx, dzdy)
}

// half-space fixed point triangle rasterizer
fn rasterize(buffer: &mut OverdrawBuffer, mut v1: Vector3, mut v2: Vector3, mut v3: Vector3) {
    // compute depth gradients
    let (det, mut dzx, mut dzy) = compute_depth_gradients(v1, v2, v3);
    let sign = det > 0.0;

    // flip backfacing triangles to simplify rasterization logic
    if sign {
        // flipping v2 & v3 preserves depth gradients since they're based on v1
        std::mem::swap(&mut v2, &mut v3);

        // flip depth since we rasterize backfacing triangles to second buffer with reverse Z; only v1z is used below
        v1.z = VIEWPORT as f32 - v1.z;
        dzx = -dzx;
        dzy = -dzy;
    }

    // coordinates, 28.4 fixed point
    let x1 = (16.0 * v1.x + 0.5) as i32;
    let x2 = (16.0 * v2.x + 0.5) as i32;
    let x3 = (16.0 * v3.x + 0.5) as i32;

    let y1 = (16.0 * v1.y + 0.5) as i32;
    let y2 = (16.0 * v2.y + 0.5) as i32;
    let y3 = (16.0 * v3.y + 0.5) as i32;

    // bounding rectangle, clipped against viewport
    // since we rasterize pixels with covered centers, min >0.5 should round up
    // as for max, due to top-left filling convention we will never rasterize right/bottom edges
    // so max >= 0.5 should round down
    let minx = ((x1.min(x2.min(x3)) + 7) >> 4).max(0);
    let maxx = ((x1.max(x2.max(x3)) + 7) >> 4).min(VIEWPORT as i32);
    let miny = ((y1.min(y2.min(y3)) + 7) >> 4).max(0);
    let maxy = ((y1.max(y2.max(y3)) + 7) >> 4).min(VIEWPORT as i32);

    // deltas, 28.4 fixed point
    let dx12 = x1 - x2;
    let dx23 = x2 - x3;
    let dx31 = x3 - x1;

    let dy12 = y1 - y2;
    let dy23 = y2 - y3;
    let dy31 = y3 - y1;

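    // top-left fill rule: "top" and "left" edges own the pixels that lie exactly on them,
    // so an edge shared by two adjacent triangles is rasterized exactly once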
    // fill convention correction
    let tl1 = (dy12 < 0 || (dy12 == 0 && dx12 > 0)) as i32;
    let tl2 = (dy23 < 0 || (dy23 == 0 && dx23 > 0)) as i32;
    let tl3 = (dy31 < 0 || (dy31 == 0 && dx31 > 0)) as i32;

    // half edge equations, 24.8 fixed point
    // note that we offset minx/miny by half pixel since we want to rasterize pixels with covered centers
    let fx = (minx << 4) + 8;
    let fy = (miny << 4) + 8;
    let mut cy1 = dx12 * (fy - y1) - dy12 * (fx - x1) + tl1 - 1;
    let mut cy2 = dx23 * (fy - y2) - dy23 * (fx - x2) + tl2 - 1;
    let mut cy3 = dx31 * (fy - y3) - dy31 * (fx - x3) + tl3 - 1;
    let mut zy = v1.z + (dzx * (fx - x1) as f32 + dzy * (fy - y1) as f32) * (1.0 / 16.0);

    for y in miny..maxy {
        let y = y as usize;

        let mut cx1 = cy1;
        let mut cx2 = cy2;
        let mut cx3 = cy3;
        let mut zx = zy;

        for x in minx..maxx {
            let x = x as usize;
            let sign = sign as usize;

            // check if all CXn are non-negative
            if (cx1 | cx2 | cx3) >= 0 {
                if zx >= buffer.z[y][x][sign] {
                    buffer.z[y][x][sign] = zx;
                    buffer.overdraw[y][x][sign] += 1;
                }
            }

            // signed left shift of negative values is UB in C/C++; in Rust it is well-defined,
            // but we keep the unsigned-signed casts to mirror the original C++ code
            cx1 -= ((dy12 as u32) << 4) as i32;
            cx2 -= ((dy23 as u32) << 4) as i32;
            cx3 -= ((dy31 as u32) << 4) as i32;
            zx += dzx;
        }

        // see the note on signed left shifts above
        cy1 += ((dx12 as u32) << 4) as i32;
        cy2 += ((dx23 as u32) << 4) as i32;
        cy3 += ((dx31 as u32) << 4) as i32;
        zy += dzy;
    }
}

/// Returns overdraw statistics using a software rasterizer.
///
/// Results may not match actual GPU performance.
pub fn analyze_overdraw<Vertex>(indices: &[u32], vertices: &[Vertex]) -> OverdrawStatistics
where
    Vertex: Position,
{
    assert!(indices.len() % 3 == 0);

    let mut result = OverdrawStatistics::default();

    let (minv, extent) = calc_pos_extents(vertices);

    let scale = VIEWPORT as f32 / extent;

    let mut triangles = vec![0.0; indices.len() * 3];

    for (i, index) in indices.iter().enumerate() {
        let index = *index as usize;

        let v = vertices[index].pos();

        triangles[i * 3 + 0] = (v[0] - minv[0]) * scale;
        triangles[i * 3 + 1] = (v[1] - minv[1]) * scale;
        triangles[i * 3 + 2] = (v[2] - minv[2]) * scale;
    }

    let mut buffer: Box<OverdrawBuffer> = Box::default();

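    // rasterize the mesh from three orthographic views (down the X, Y and Z axes);
    // each match arm below permutes the vertex coordinates so that the projection axis becomes depth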
    for axis in 0..3 {
        *buffer = OverdrawBuffer::default();

        for vn in triangles.chunks_exact(9) {
            let vn0 = &vn[0..3];
            let vn1 = &vn[3..6];
            let vn2 = &vn[6..9];

            match axis {
                0 => rasterize(
                    &mut buffer,
                    Vector3::new(vn0[2], vn0[1], vn0[0]),
                    Vector3::new(vn1[2], vn1[1], vn1[0]),
                    Vector3::new(vn2[2], vn2[1], vn2[0]),
                ),
                1 => rasterize(
                    &mut buffer,
                    Vector3::new(vn0[0], vn0[2], vn0[1]),
                    Vector3::new(vn1[0], vn1[2], vn1[1]),
                    Vector3::new(vn2[0], vn2[2], vn2[1]),
                ),
                2 => rasterize(
                    &mut buffer,
                    Vector3::new(vn0[1], vn0[0], vn0[2]),
                    Vector3::new(vn1[1], vn1[0], vn1[2]),
                    Vector3::new(vn2[1], vn2[0], vn2[2]),
                ),
                _ => unreachable!(),
            }
        }

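        // accumulate covered/shaded pixel counts over both facing layers of this view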
        for y in 0..VIEWPORT {
            for x in 0..VIEWPORT {
                for s in 0..2 {
                    let overdraw = buffer.overdraw[y][x][s];

                    result.pixels_covered += (overdraw > 0) as u32;
                    result.pixels_shaded += overdraw;
                }
            }
        }
    }

    result.overdraw = if result.pixels_covered > 0 {
        result.pixels_shaded as f32 / result.pixels_covered as f32
    } else {
        0.0
    };

    result
}

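/// Computes the sort key for each cluster: the dot product between the vector from the mesh
/// centroid to the (area-weighted) cluster centroid and the cluster's average normal.
///
/// Outward-facing clusters get larger values and are drawn first by the overdraw optimizer.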
fn calculate_sort_data<Vertex>(sort_data: &mut [f32], indices: &[u32], vertices: &[Vertex], clusters: &[u32])
where
    Vertex: Position,
{
    let mut mesh_centroid = [0.0; 3];

    for index in indices {
        let p = vertices[*index as usize].pos();

        mesh_centroid[0] += p[0];
        mesh_centroid[1] += p[1];
        mesh_centroid[2] += p[2];
    }

    mesh_centroid[0] /= indices.len() as f32;
    mesh_centroid[1] /= indices.len() as f32;
    mesh_centroid[2] /= indices.len() as f32;

    for (cluster_idx, cluster_begin) in clusters.iter().enumerate() {
        let cluster_end = if let Some(begin) = clusters.get(cluster_idx + 1) {
            (*begin) as usize * 3
        } else {
            indices.len()
        };
        let cluster = (*cluster_begin as usize) * 3..cluster_end;
        assert!(!cluster.is_empty());

        let mut cluster_area = 0.0;
        let mut cluster_centroid = [0.0; 3];
        let mut cluster_normal = [0.0f32; 3];

        for i in indices[cluster].chunks_exact(3) {
            let p0 = vertices[i[0] as usize].pos();
            let p1 = vertices[i[1] as usize].pos();
            let p2 = vertices[i[2] as usize].pos();

            let p10 = [p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]];
            let p20 = [p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]];

            let normalx = p10[1] * p20[2] - p10[2] * p20[1];
            let normaly = p10[2] * p20[0] - p10[0] * p20[2];
            let normalz = p10[0] * p20[1] - p10[1] * p20[0];

            let area = (normalx * normalx + normaly * normaly + normalz * normalz).sqrt();

            cluster_centroid[0] += (p0[0] + p1[0] + p2[0]) * (area / 3.0);
            cluster_centroid[1] += (p0[1] + p1[1] + p2[1]) * (area / 3.0);
            cluster_centroid[2] += (p0[2] + p1[2] + p2[2]) * (area / 3.0);
            cluster_normal[0] += normalx;
            cluster_normal[1] += normaly;
            cluster_normal[2] += normalz;
            cluster_area += area;
        }

        let inv_cluster_area = zero_inverse(cluster_area);

        cluster_centroid[0] *= inv_cluster_area;
        cluster_centroid[1] *= inv_cluster_area;
        cluster_centroid[2] *= inv_cluster_area;

        let cluster_normal_length = (cluster_normal[0] * cluster_normal[0]
            + cluster_normal[1] * cluster_normal[1]
            + cluster_normal[2] * cluster_normal[2])
            .sqrt();
        let inv_cluster_normal_length = zero_inverse(cluster_normal_length);

        cluster_normal[0] *= inv_cluster_normal_length;
        cluster_normal[1] *= inv_cluster_normal_length;
        cluster_normal[2] *= inv_cluster_normal_length;

        let centroid_vector = [
            cluster_centroid[0] - mesh_centroid[0],
            cluster_centroid[1] - mesh_centroid[1],
            cluster_centroid[2] - mesh_centroid[2],
        ];

        sort_data[cluster_idx] = centroid_vector[0] * cluster_normal[0]
            + centroid_vector[1] * cluster_normal[1]
            + centroid_vector[2] * cluster_normal[2];
    }
}

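/// Sorts clusters by their sort data using a counting sort over 11-bit quantized keys.
///
/// `sort_order` receives cluster indices ordered from the highest to the lowest sort value;
/// `sort_keys` is scratch space for the quantized keys.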
fn calculate_sort_order_radix(sort_order: &mut [u32], sort_data: &[f32], sort_keys: &mut [u16]) {
    // compute sort data bounds and renormalize, using fixed point snorm
    let mut sort_data_max: f32 = 0.001;

    for data in sort_data {
        sort_data_max = sort_data_max.max(data.abs());
    }

    const SORT_BITS: i32 = 11;

    for (data, key) in sort_data.iter().zip(sort_keys.iter_mut()) {
        // note that we flip distribution since high dot product should come first
        let sort_key = 0.5 - 0.5 * (data / sort_data_max);

        *key = (quantize_unorm(sort_key, SORT_BITS) & ((1 << SORT_BITS) - 1)) as u16;
    }

    // fill histogram for counting sort
    let mut histogram = [0; 1 << SORT_BITS];

    for key in sort_keys.iter() {
        histogram[*key as usize] += 1;
    }

    // compute offsets based on histogram data
    let mut histogram_sum = 0;

    for i in 0..histogram.len() {
        let count = histogram[i];
        histogram[i] = histogram_sum;
        histogram_sum += count;
    }

    assert_eq!(histogram_sum, sort_keys.len());

    // compute sort order based on offsets
    for i in 0..sort_keys.len() {
        let idx = &mut histogram[sort_keys[i] as usize];
        sort_order[*idx as usize] = i as u32;
        *idx += 1;
    }
}

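/// Simulates a FIFO vertex cache via per-vertex timestamps: a vertex hits if it was referenced
/// within the last `cache_size` timestamps. Returns the number of cache misses for the triangle
/// `(a, b, c)`, updating the timestamps as misses are inserted.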
fn update_cache(a: u32, b: u32, c: u32, cache_size: u32, cache_timestamps: &mut [u32], timestamp: &mut u32) -> u32 {
    let mut cache_misses = 0;

    // if vertex is not in cache, put it in cache
    if *timestamp - cache_timestamps[a as usize] > cache_size {
        cache_timestamps[a as usize] = *timestamp;
        *timestamp += 1;
        cache_misses += 1;
    }

    if *timestamp - cache_timestamps[b as usize] > cache_size {
        cache_timestamps[b as usize] = *timestamp;
        *timestamp += 1;
        cache_misses += 1;
    }

    if *timestamp - cache_timestamps[c as usize] > cache_size {
        cache_timestamps[c as usize] = *timestamp;
        *timestamp += 1;
        cache_misses += 1;
    }

    cache_misses
}

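/// Splits the index buffer into clusters at "hard" boundaries: a new cluster starts whenever a
/// triangle misses the simulated vertex cache on all three vertices. Writes cluster start indices
/// (in triangles) to `destination` and returns the cluster count.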
fn generate_hard_boundaries(
    destination: &mut [u32],
    indices: &[u32],
    cache_size: u32,
    cache_timestamps: &mut [u32],
) -> usize {
    let mut timestamp = cache_size + 1;

    let face_count = indices.len() / 3;

    let mut result = 0;

    for i in 0..face_count {
        let m = update_cache(
            indices[i * 3 + 0],
            indices[i * 3 + 1],
            indices[i * 3 + 2],
            cache_size,
            cache_timestamps,
            &mut timestamp,
        );

        // when all three vertices are not in the cache it's usually relatively safe to assume that this is a new patch in the mesh
        // that is disjoint from previous vertices; sometimes it might come back to reference existing vertices but that frequently
        // suggests an inefficiency in the vertex cache optimization algorithm
        // usually the first triangle has 3 misses unless it's degenerate - thus we make sure the first cluster always starts with 0
        if i == 0 || m == 3 {
            destination[result] = i as u32;
            result += 1;
        }
    }

    assert!(result <= indices.len() / 3);

    result
}

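/// Refines hard cluster boundaries into "soft" ones: each hard cluster is measured for its
/// average cache miss ratio (ACMR), then split whenever the running ACMR drops to `threshold`
/// times that value. Writes cluster start indices to `destination` and returns the cluster count.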
fn generate_soft_boundaries(
    destination: &mut [u32],
    indices: &[u32],
    clusters: &[u32],
    cache_size: u32,
    threshold: f32,
    cache_timestamps: &mut [u32],
) -> usize {
    let mut timestamp = 0;

    let mut result = 0;

    for (cluster_idx, start) in clusters.iter().enumerate() {
        let end = if let Some(begin) = clusters.get(cluster_idx + 1) {
            *begin as usize
        } else {
            indices.len() / 3
        };
        let cluster = (*start as usize)..end;
        assert!(!cluster.is_empty());

        // reset cache
        timestamp += cache_size + 1;

        // measure cluster ACMR
        let mut cluster_misses = 0;

        for i in cluster.clone() {
            let m = update_cache(
                indices[i * 3 + 0],
                indices[i * 3 + 1],
                indices[i * 3 + 2],
                cache_size,
                cache_timestamps,
                &mut timestamp,
            );

            cluster_misses += m;
        }

        let cluster_threshold = threshold * (cluster_misses as f32 / (end - *start as usize) as f32);

        // first cluster always starts from the hard cluster boundary
        destination[result] = *start;
        result += 1;

        // reset cache
        timestamp += cache_size + 1;

        let mut running_misses = 0;
        let mut running_faces = 0;

        for i in cluster {
            let m = update_cache(
                indices[i * 3 + 0],
                indices[i * 3 + 1],
                indices[i * 3 + 2],
                cache_size,
                cache_timestamps,
                &mut timestamp,
            );

            running_misses += m;
            running_faces += 1;

            if running_misses as f32 / running_faces as f32 <= cluster_threshold {
                // we have reached the target ACMR with the current triangle so we need to start a new cluster on the next one
                // note that this may mean that we add `end` to destination for the last triangle, which will imply that the last
                // cluster is empty; however, the boundary removal after the loop will clean it up
                destination[result] = i as u32 + 1;
                result += 1;

                // reset cache
                timestamp += cache_size + 1;

                running_misses = 0;
                running_faces = 0;
            }
        }

        // each time we reach the target ACMR we flush the cluster
        // this means that the last cluster is by definition not very good - there are frequent cases where we are left with a few triangles
        // in the last cluster, producing a very bad ACMR and significantly penalizing the overall results
        // thus we remove the last cluster boundary, merging the last complete cluster with the last incomplete one
        // there are sometimes cases when the last cluster is actually good enough - in which case the code above would have added `end`
        // to the cluster boundary array, which we need to remove anyway - this code will do that automatically
        if destination[result - 1] != *start {
            result -= 1;
        }
    }

    assert!(result >= clusters.len());
    assert!(result <= indices.len() / 3);

    result
}

/// Reorders indices to reduce the number of GPU vertex shader invocations and the pixel overdraw.
///
/// If the index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
///
/// # Arguments
///
/// * `destination`: must contain enough space for the resulting index buffer (`indices.len()` elements)
/// * `indices`: must contain index data that is the result of [optimize_vertex_cache](crate::vertex::cache::optimize_vertex_cache) (**not** the original mesh indices!)
/// * `threshold`: indicates how much the overdraw optimizer can degrade vertex cache efficiency (1.05 = up to 5%) to reduce overdraw more efficiently
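///
/// # Example
///
/// ```ignore
/// // Illustrative sketch (hypothetical buffers): `vertices` implements `Position`, and
/// // `indices` has already been reordered by `optimize_vertex_cache`.
/// let mut destination = vec![0u32; indices.len()];
/// optimize_overdraw(&mut destination, &indices, &vertices, 1.05);
/// ```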
pub fn optimize_overdraw<Vertex>(destination: &mut [u32], indices: &[u32], vertices: &[Vertex], threshold: f32)
where
    Vertex: Position,
{
    assert_eq!(indices.len() % 3, 0);

    // guard for empty meshes
    if indices.is_empty() || vertices.is_empty() {
        return;
    }

    const CACHE_SIZE: u32 = 16;

    let mut cache_timestamps = vec![0u32; vertices.len()];

    // generate hard boundaries from full-triangle cache misses
    let mut hard_clusters = vec![0u32; indices.len() / 3];
    let hard_cluster_count = generate_hard_boundaries(&mut hard_clusters, indices, CACHE_SIZE, &mut cache_timestamps);

    // generate soft boundaries
    cache_timestamps.fill(0);
    let mut soft_clusters = vec![0u32; indices.len() / 3 + 1];
    let soft_cluster_count = generate_soft_boundaries(
        &mut soft_clusters,
        indices,
        &hard_clusters[0..hard_cluster_count],
        CACHE_SIZE,
        threshold,
        &mut cache_timestamps,
    );

    let clusters = &soft_clusters[0..soft_cluster_count];

    // fill sort data
    let mut sort_data = vec![0.0; clusters.len()];
    calculate_sort_data(&mut sort_data, indices, vertices, clusters);

    // sort clusters using sort data
    let mut sort_keys = vec![0u16; clusters.len()];
    let mut sort_order = vec![0u32; clusters.len()];
    calculate_sort_order_radix(&mut sort_order, &sort_data, &mut sort_keys);

    // fill output buffer
    let mut offset = 0;

    for cluster in &sort_order {
        let cluster = *cluster as usize;

        let cluster_begin = (clusters[cluster] * 3) as usize;
        let cluster_end = if let Some(begin) = clusters.get(cluster + 1) {
            (begin * 3) as usize
        } else {
            indices.len()
        };
        assert!(cluster_begin < cluster_end);

        let cluster_size = cluster_end - cluster_begin;

        destination[offset..offset + cluster_size].copy_from_slice(&indices[cluster_begin..cluster_end]);

        offset += cluster_size;
    }

    assert_eq!(offset, indices.len());
}

#[cfg(test)]
mod test {
    use super::*;

    struct DummyVertex;

    impl Position for DummyVertex {
        fn pos(&self) -> [f32; 3] {
            [0.0; 3]
        }
    }

    #[test]
    fn test_empty() {
        optimize_overdraw(&mut [], &[], &[DummyVertex], 1.0);
    }
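
    // Illustrative sanity check (not part of the original test suite): a single triangle can
    // never shade the same pixel twice, so the analyzer should report an overdraw factor of 1.0.
    struct TriVertex([f32; 3]);

    impl Position for TriVertex {
        fn pos(&self) -> [f32; 3] {
            self.0
        }
    }

    #[test]
    fn test_analyze_single_triangle() {
        let vertices = [
            TriVertex([0.0, 0.0, 0.0]),
            TriVertex([1.0, 0.0, 0.0]),
            TriVertex([0.0, 1.0, 0.0]),
        ];
        let indices = [0u32, 1, 2];

        let stats = analyze_overdraw(&indices, &vertices);

        assert!(stats.pixels_covered > 0);
        assert!((stats.overdraw - 1.0).abs() < 1e-6);
    }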
}