//! Vertex fetch analysis and optimization

use crate::INVALID_INDEX;

use super::Position;

#[derive(Default)]
pub struct VertexFetchStatistics {
    /// Total number of bytes fetched from the vertex buffer, counted at cache line granularity
    pub bytes_fetched: u32,
    /// Fetched bytes divided by the number of bytes occupied by the vertices that are actually referenced
    ///
    /// Best case is 1.0 (each byte is fetched exactly once)
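    ///
    /// For example, fetching 24 KiB worth of cache lines for a mesh whose referenced vertices
    /// occupy only 16 KiB yields an overfetch of 24 / 16 = 1.5.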
    pub overfetch: f32,
}

/// Returns cache hit statistics using a simplified direct-mapped cache model.
///
/// Results may not match actual GPU performance.
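///
/// # Example
///
/// A minimal sketch of a call (the index data and vertex layout below are illustrative
/// assumptions; the block is marked `ignore` since the full module path is not shown):
///
/// ```ignore
/// // Two triangles sharing an edge, 4 vertices of 12 bytes each (e.g. three `f32` coordinates).
/// let indices = [0u32, 1, 2, 2, 1, 3];
/// let stats = analyze_vertex_fetch(&indices, 4, 12);
/// // All four vertices fit into a single 64-byte cache line, so one full line is fetched
/// // while only 48 bytes of vertex data are actually used.
/// assert!(stats.overfetch >= 1.0);
/// ```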
pub fn analyze_vertex_fetch(indices: &[u32], vertex_count: usize, vertex_size: usize) -> VertexFetchStatistics {
    assert!(indices.len() % 3 == 0);
    assert!(vertex_size > 0 && vertex_size <= 256);

    let mut result = VertexFetchStatistics::default();

    let mut vertex_visited = vec![false; vertex_count];

    const CACHE_LINE: usize = 64;
    const CACHE_SIZE: usize = 128 * 1024;

    // simple direct-mapped cache; on typical mesh data this behaves close to a 4-way cache, and this model is a gross approximation anyway
    let mut cache = [0usize; CACHE_SIZE / CACHE_LINE];

    for index in indices {
        let index = *index as usize;

        assert!(index < vertex_count);

        vertex_visited[index] = true;

        let start_address = index * vertex_size;
        let end_address = start_address + vertex_size;

        let start_tag = start_address / CACHE_LINE;
        let end_tag = (end_address + CACHE_LINE - 1) / CACHE_LINE;

        assert!(start_tag < end_tag);

        for tag in start_tag..end_tag {
            let line = tag % (CACHE_SIZE / CACHE_LINE);

            // we store +1 since cache is filled with 0 by default
            result.bytes_fetched += (cache[line] != tag + 1) as u32 * CACHE_LINE as u32;
            cache[line] = tag + 1;
        }
    }

    let unique_vertex_count: usize = vertex_visited.iter().map(|v| *v as usize).sum();

    result.overfetch = if unique_vertex_count == 0 {
        0.0
    } else {
        result.bytes_fetched as f32 / (unique_vertex_count * vertex_size) as f32
    };

    result
}

/// Generates a vertex remap table to reduce the amount of GPU memory fetches during vertex processing.
///
/// Returns the number of unique vertices, which is the same as the input vertex count unless some vertices are unused.
/// The resulting remap table should be used to reorder vertex/index buffers using [remap_vertex_buffer](crate::index::generator::remap_vertex_buffer)/[remap_index_buffer](crate::index::generator::remap_index_buffer).
///
/// # Arguments
///
/// * `destination`: must contain exactly one element per vertex; its length is used as the vertex count
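///
/// # Example
///
/// A minimal sketch (module path omitted; the index data is an illustrative assumption):
///
/// ```ignore
/// let indices = [2u32, 1, 0];
/// let mut remap = vec![0u32; 3]; // one entry per vertex
/// let unique = optimize_vertex_fetch_remap(&mut remap, &indices);
/// assert_eq!(unique, 3);
/// // Vertex 2 is referenced first, so it is assigned slot 0; vertex 0 ends up last.
/// assert_eq!(remap, [2, 1, 0]);
/// ```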
pub fn optimize_vertex_fetch_remap(destination: &mut [u32], indices: &[u32]) -> usize {
    assert!(indices.len() % 3 == 0);

    destination.fill(INVALID_INDEX);

    let mut next_vertex = 0;

    for index in indices {
        let index = *index as usize;

        if destination[index] == INVALID_INDEX {
            destination[index] = next_vertex as u32;
            next_vertex += 1;
        }
    }

    assert!(next_vertex <= destination.len());

    next_vertex
}

/// Reorders vertices and changes indices to reduce the amount of GPU memory fetches during vertex processing.
///
/// Returns the number of unique vertices, which is the same as the input vertex count unless some vertices are unused.
/// This function works for a single vertex stream; for multiple vertex streams, use [optimize_vertex_fetch_remap] + [remap_vertex_buffer](crate::index::generator::remap_vertex_buffer) for each stream.
///
/// # Arguments
///
/// * `destination`: must contain enough space for the resulting vertex buffer (`vertices.len()` elements)
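///
/// # Example
///
/// A minimal sketch; `MyVertex` and its `Position` implementation are assumptions made for
/// illustration and are not defined by this module:
///
/// ```ignore
/// let vertices = [MyVertex::new(0.0), MyVertex::new(1.0), MyVertex::new(2.0)];
/// let mut indices = [2u32, 1, 0];
/// let mut reordered = vec![MyVertex::default(); vertices.len()];
/// let unique = optimize_vertex_fetch(&mut reordered, &mut indices, &vertices);
/// assert_eq!(unique, 3);
/// // The first-referenced vertex (old index 2) now sits at slot 0, and the indices were rewritten in place.
/// assert_eq!(indices, [0, 1, 2]);
/// ```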
pub fn optimize_vertex_fetch<Vertex>(destination: &mut [Vertex], indices: &mut [u32], vertices: &[Vertex]) -> usize
where
    Vertex: Position + Copy,
{
    assert!(indices.len() % 3 == 0);

    // build vertex remap table
    let mut vertex_remap = vec![INVALID_INDEX; vertices.len()];

    let mut next_vertex = 0;

    for index in indices.iter_mut() {
        let idx = *index as usize;

        let remap = &mut vertex_remap[idx];

        if *remap == INVALID_INDEX {
            // vertex was not added to destination VB
            // add vertex
            destination[next_vertex] = vertices[idx];

            *remap = next_vertex as u32;
            next_vertex += 1;
        }

        // modify indices in place
        *index = *remap;
    }

    assert!(next_vertex <= vertices.len());

    next_vertex
}