// astrelis-render 0.2.4 — Astrelis Core Rendering Module
//! GPU query and profiling support.
//!
//! This module provides wrappers for GPU queries (timestamps, occlusion) and
//! a high-level profiler for measuring GPU execution times.
//!
//! # Features Required
//!
//! - `TIMESTAMP_QUERY` - Required for timestamp queries and GPU profiling
//!
//! # Example
//!
//! ```ignore
//! use astrelis_render::{GpuProfiler, GraphicsContext};
//!
//! // Create profiler (requires TIMESTAMP_QUERY feature)
//! let mut profiler = GpuProfiler::new(context.clone(), 256);
//!
//! // In render loop:
//! profiler.begin_frame();
//!
//! {
//!     let region = profiler.begin_region(&mut encoder, "Shadow Pass");
//!     // ... render shadow pass ...
//!     profiler.end_region(&mut encoder, region);
//! }
//!
//! profiler.resolve(&mut encoder);
//!
//! // Later, read results
//! for (label, duration_ms) in profiler.read_results() {
//!     println!("{}: {:.2}ms", label, duration_ms);
//! }
//! ```

use std::sync::Arc;

use crate::capability::{GpuRequirements, RenderCapability};
use crate::context::GraphicsContext;
use crate::features::GpuFeatures;

// =============================================================================
// Query Types
// =============================================================================

/// Types of GPU queries.
///
/// Converted to the corresponding `wgpu::QueryType` via [`QueryType::to_wgpu`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum QueryType {
    /// Timestamp query for measuring GPU execution time.
    /// Requires `TIMESTAMP_QUERY` feature.
    Timestamp,
    /// Occlusion query for counting visible fragments.
    Occlusion,
}

impl QueryType {
    /// Map this query type onto the matching `wgpu::QueryType`.
    pub fn to_wgpu(self) -> wgpu::QueryType {
        match self {
            Self::Timestamp => wgpu::QueryType::Timestamp,
            Self::Occlusion => wgpu::QueryType::Occlusion,
        }
    }
}

// =============================================================================
// QuerySet
// =============================================================================

/// A wrapper around wgpu::QuerySet with metadata.
///
/// Keeps the query type and slot count alongside the raw handle so callers
/// can reason about valid query ranges without asking wgpu.
pub struct QuerySet {
    /// Underlying wgpu query set handle.
    query_set: wgpu::QuerySet,
    /// Kind of queries stored in this set (timestamp or occlusion).
    query_type: QueryType,
    /// Number of query slots allocated in the set.
    count: u32,
}

impl QuerySet {
    /// Create a new query set.
    ///
    /// # Arguments
    ///
    /// * `device` - The wgpu device used to allocate the set
    /// * `label` - Optional debug label
    /// * `query_type` - Type of queries in this set
    /// * `count` - Number of queries in the set
    pub fn new(
        device: &wgpu::Device,
        label: Option<&str>,
        query_type: QueryType,
        count: u32,
    ) -> Self {
        let descriptor = wgpu::QuerySetDescriptor {
            label,
            ty: query_type.to_wgpu(),
            count,
        };

        Self {
            query_set: device.create_query_set(&descriptor),
            query_type,
            count,
        }
    }

    /// Borrow the underlying wgpu query set.
    #[inline]
    pub fn query_set(&self) -> &wgpu::QuerySet {
        &self.query_set
    }

    /// The kind of queries stored in this set.
    #[inline]
    pub fn query_type(&self) -> QueryType {
        self.query_type
    }

    /// Number of query slots in this set.
    #[inline]
    pub fn count(&self) -> u32 {
        self.count
    }
}

// =============================================================================
// QueryResultBuffer
// =============================================================================

/// Buffer for storing and reading query results.
///
/// Two buffers are used: queries are resolved into `resolve_buffer` on the
/// GPU, then copied into `read_buffer` for CPU readback.
// NOTE(review): the split presumably exists because wgpu does not allow
// MAP_READ to be combined with QUERY_RESOLVE usage — confirm against wgpu docs.
pub struct QueryResultBuffer {
    /// GPU-side destination for query resolution (QUERY_RESOLVE | COPY_SRC).
    resolve_buffer: wgpu::Buffer,
    /// CPU-mappable staging buffer (COPY_DST | MAP_READ).
    read_buffer: wgpu::Buffer,
    /// Number of u64 results each buffer can hold.
    count: u32,
}

impl QueryResultBuffer {
    /// Create a new query result buffer.
    ///
    /// # Arguments
    ///
    /// * `device` - The wgpu device
    /// * `label` - Optional debug label
    /// * `count` - Number of query results to store
    pub fn new(device: &wgpu::Device, label: Option<&str>, count: u32) -> Self {
        // Each query result occupies a single u64.
        let size = u64::from(count) * std::mem::size_of::<u64>() as u64;

        let resolve_label = label.map(|l| format!("{} Resolve", l));
        let resolve_buffer = device.create_buffer(&wgpu::BufferDescriptor {
            label: resolve_label.as_deref(),
            size,
            usage: wgpu::BufferUsages::QUERY_RESOLVE | wgpu::BufferUsages::COPY_SRC,
            mapped_at_creation: false,
        });

        let read_label = label.map(|l| format!("{} Read", l));
        let read_buffer = device.create_buffer(&wgpu::BufferDescriptor {
            label: read_label.as_deref(),
            size,
            usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
            mapped_at_creation: false,
        });

        Self {
            resolve_buffer,
            read_buffer,
            count,
        }
    }

    /// Buffer that queries are resolved into on the GPU.
    #[inline]
    pub fn resolve_buffer(&self) -> &wgpu::Buffer {
        &self.resolve_buffer
    }

    /// Mappable buffer used for CPU readback.
    #[inline]
    pub fn read_buffer(&self) -> &wgpu::Buffer {
        &self.read_buffer
    }

    /// Number of results this buffer can hold.
    #[inline]
    pub fn count(&self) -> u32 {
        self.count
    }

    /// Resolve queries from a query set into this buffer.
    ///
    /// `destination_offset` is expressed in query slots, not bytes.
    // NOTE(review): wgpu requires the byte destination offset of
    // `resolve_query_set` to be 256-aligned; nonzero slot offsets may
    // violate that — confirm against wgpu's QUERY_RESOLVE_BUFFER_ALIGNMENT.
    pub fn resolve(
        &self,
        encoder: &mut wgpu::CommandEncoder,
        query_set: &QuerySet,
        query_range: std::ops::Range<u32>,
        destination_offset: u32,
    ) {
        let byte_offset = u64::from(destination_offset) * std::mem::size_of::<u64>() as u64;
        encoder.resolve_query_set(
            query_set.query_set(),
            query_range,
            &self.resolve_buffer,
            byte_offset,
        );
    }

    /// Copy resolved results into the CPU-mappable buffer.
    pub fn copy_to_readable(&self, encoder: &mut wgpu::CommandEncoder) {
        let byte_len = u64::from(self.count) * std::mem::size_of::<u64>() as u64;
        encoder.copy_buffer_to_buffer(&self.resolve_buffer, 0, &self.read_buffer, 0, byte_len);
    }

    /// Map the read buffer for CPU access.
    ///
    /// Returns a future that completes when the buffer is mapped.
    pub fn map_async(
        &self,
    ) -> impl std::future::Future<Output = Result<(), wgpu::BufferAsyncError>> {
        let (tx, rx) = std::sync::mpsc::channel();

        self.read_buffer
            .slice(..)
            .map_async(wgpu::MapMode::Read, move |result| {
                let _ = tx.send(result);
            });

        // A disconnected channel means the callback was dropped without
        // firing; surface that as a buffer-async error.
        async move {
            match rx.recv() {
                Ok(result) => result,
                Err(_) => Err(wgpu::BufferAsyncError),
            }
        }
    }

    /// Read the query results (must be mapped first).
    ///
    /// Returns the raw u64 timestamps/occlusion counts and unmaps the buffer.
    pub fn read_results(&self) -> Vec<u64> {
        let results = {
            let view = self.read_buffer.slice(..).get_mapped_range();
            bytemuck::cast_slice::<u8, u64>(&view).to_vec()
        };
        self.read_buffer.unmap();
        results
    }
}

// =============================================================================
// ProfileRegion
// =============================================================================

/// A handle to a profiling region.
///
/// Created by `GpuProfiler::begin_region` and consumed by `GpuProfiler::end_region`.
#[derive(Debug)]
pub struct ProfileRegion {
    /// Human-readable label supplied to `begin_region`.
    label: String,
    /// Index of the timestamp query written when the region started.
    start_query: u32,
}

// =============================================================================
// GpuProfiler
// =============================================================================

/// High-level GPU profiler for measuring execution times.
///
/// This profiler uses timestamp queries to measure GPU execution time
/// for different regions of your rendering code.
///
/// # Requirements
///
/// - Device must support `TIMESTAMP_QUERY` feature
/// - Must call `begin_frame()` at the start of each frame
/// - Must call `resolve()` before submitting commands
///
/// # Example
///
/// ```ignore
/// let mut profiler = GpuProfiler::new(context.clone(), 256);
///
/// // Each frame:
/// profiler.begin_frame();
///
/// let region = profiler.begin_region(&mut encoder, "My Pass");
/// // ... do rendering ...
/// profiler.end_region(&mut encoder, region);
///
/// profiler.resolve(&mut encoder);
///
/// // Read results (may be from previous frame)
/// for (label, duration_ms) in profiler.read_results() {
///     println!("{}: {:.2}ms", label, duration_ms);
/// }
/// ```
pub struct GpuProfiler {
    context: Arc<GraphicsContext>,
    query_set: QuerySet,
    result_buffer: QueryResultBuffer,
    /// Current query index for the frame
    current_query: u32,
    /// Maximum queries per frame
    max_queries: u32,
    /// Regions from the current frame (label, start_query, end_query)
    regions: Vec<(String, u32, u32)>,
    /// Cached results from the previous frame
    cached_results: Vec<(String, f64)>,
    /// Timestamp period in nanoseconds per tick
    timestamp_period: f32,
}

/// Declares the GPU features [`GpuProfiler`] needs (`TIMESTAMP_QUERY`).
impl RenderCapability for GpuProfiler {
    fn requirements() -> GpuRequirements {
        GpuRequirements::new().require_features(GpuFeatures::TIMESTAMP_QUERY)
    }

    fn name() -> &'static str {
        "GpuProfiler"
    }
}

impl GpuProfiler {
    /// Create a new GPU profiler.
    ///
    /// # Arguments
    ///
    /// * `context` - Graphics context (must support TIMESTAMP_QUERY)
    /// * `max_queries` - Maximum number of timestamp queries per frame
    ///
    /// # Panics
    ///
    /// Panics if the device doesn't support timestamp queries.
    pub fn new(context: Arc<GraphicsContext>, max_queries: u32) -> Self {
        // Nanoseconds per timestamp tick; used to convert raw ticks to time.
        let timestamp_period = context.queue().get_timestamp_period();

        let query_set = QuerySet::new(
            context.device(),
            Some("GPU Profiler Queries"),
            QueryType::Timestamp,
            max_queries,
        );

        let result_buffer =
            QueryResultBuffer::new(context.device(), Some("GPU Profiler Results"), max_queries);

        Self {
            context,
            query_set,
            result_buffer,
            current_query: 0,
            max_queries,
            regions: Vec::new(),
            cached_results: Vec::new(),
            timestamp_period,
        }
    }

    /// Begin a new frame.
    ///
    /// Call this at the start of each frame before recording any regions.
    pub fn begin_frame(&mut self) {
        self.current_query = 0;
        self.regions.clear();
    }

    /// Begin a profiling region.
    ///
    /// # Arguments
    ///
    /// * `encoder` - Command encoder to write the timestamp
    /// * `label` - Human-readable label for this region
    ///
    /// # Returns
    ///
    /// A `ProfileRegion` handle that must be passed to `end_region`, or
    /// `None` if the per-frame query budget is exhausted.
    pub fn begin_region(
        &mut self,
        encoder: &mut wgpu::CommandEncoder,
        label: &str,
    ) -> Option<ProfileRegion> {
        if self.current_query >= self.max_queries {
            // Out of query slots this frame; the region is silently skipped.
            return None;
        }

        let start_query = self.current_query;
        encoder.write_timestamp(self.query_set.query_set(), start_query);
        self.current_query += 1;

        Some(ProfileRegion {
            label: label.to_string(),
            start_query,
        })
    }

    /// End a profiling region.
    ///
    /// # Arguments
    ///
    /// * `encoder` - Command encoder to write the timestamp
    /// * `region` - The region handle from `begin_region`
    pub fn end_region(&mut self, encoder: &mut wgpu::CommandEncoder, region: ProfileRegion) {
        if self.current_query >= self.max_queries {
            // No slot left for the end timestamp; the region is dropped and
            // its start timestamp goes unused.
            return;
        }

        let end_query = self.current_query;
        encoder.write_timestamp(self.query_set.query_set(), end_query);
        self.current_query += 1;

        self.regions
            .push((region.label, region.start_query, end_query));
    }

    /// Resolve all queries from this frame.
    ///
    /// Call this after all regions have been recorded, before submitting commands.
    pub fn resolve(&self, encoder: &mut wgpu::CommandEncoder) {
        if self.current_query == 0 {
            return;
        }

        self.result_buffer
            .resolve(encoder, &self.query_set, 0..self.current_query, 0);
        self.result_buffer.copy_to_readable(encoder);
    }

    /// Convert raw timestamp ticks into `(label, duration_ms)` pairs.
    ///
    /// Shared by `read_results` and `try_read_results`. Missing timestamps
    /// (indices past the resolved range) are treated as 0.
    fn compute_durations(
        regions: &[(String, u32, u32)],
        timestamps: &[u64],
        timestamp_period: f32,
    ) -> Vec<(String, f64)> {
        regions
            .iter()
            .map(|(label, start, end)| {
                let start_ts = timestamps.get(*start as usize).copied().unwrap_or(0);
                let end_ts = timestamps.get(*end as usize).copied().unwrap_or(0);

                // Ticks -> nanoseconds -> milliseconds.
                let duration_ns =
                    (end_ts.saturating_sub(start_ts)) as f64 * timestamp_period as f64;
                (label.clone(), duration_ns / 1_000_000.0)
            })
            .collect()
    }

    /// Read profiling results synchronously.
    ///
    /// This blocks until the results are available from the GPU.
    /// For non-blocking reads, consider using double-buffering or
    /// reading results from the previous frame.
    ///
    /// # Returns
    ///
    /// A slice of (label, duration_ms) pairs for each completed region.
    pub fn read_results(&mut self) -> &[(String, f64)] {
        if self.regions.is_empty() {
            return &self.cached_results;
        }

        let device = self.context.device();

        // Map the buffer
        let slice = self.result_buffer.read_buffer().slice(..);
        let (tx, rx) = std::sync::mpsc::channel();

        slice.map_async(wgpu::MapMode::Read, move |result| {
            let _ = tx.send(result);
        });

        // Wait for the buffer to be mapped (blocking)
        let _ = device.poll(wgpu::PollType::Wait {
            submission_index: None,
            timeout: None,
        });

        // BUGFIX: previously this checked only `rx.recv().is_ok()`, which is
        // true even when the callback delivered Err(BufferAsyncError) — the
        // code would then call get_mapped_range() on an unmapped buffer and
        // panic. Require the inner mapping result to be Ok as well (matching
        // try_read_results).
        if let Ok(Ok(())) = rx.recv() {
            let data = slice.get_mapped_range();
            let timestamps: &[u64] = bytemuck::cast_slice(&data);

            let results =
                Self::compute_durations(&self.regions, timestamps, self.timestamp_period);

            drop(data);
            self.result_buffer.read_buffer().unmap();
            self.cached_results = results;
        }

        &self.cached_results
    }

    /// Try to read profiling results without blocking.
    ///
    /// Returns the existing cached results if the new data is not yet
    /// available. This is useful when you want to display results from
    /// the previous frame.
    ///
    /// # Returns
    ///
    /// A slice of the cached results (freshly updated if new data was read).
    pub fn try_read_results(&mut self) -> &[(String, f64)] {
        if self.regions.is_empty() {
            return &self.cached_results;
        }

        let device = self.context.device();

        // Try to map the buffer
        let slice = self.result_buffer.read_buffer().slice(..);
        let (tx, rx) = std::sync::mpsc::channel();

        slice.map_async(wgpu::MapMode::Read, move |result| {
            let _ = tx.send(result);
        });

        // Non-blocking poll
        let _ = device.poll(wgpu::PollType::Poll);

        // Only proceed if the mapping both completed and succeeded.
        if let Ok(Ok(())) = rx.try_recv() {
            let data = slice.get_mapped_range();
            let timestamps: &[u64] = bytemuck::cast_slice(&data);

            let results =
                Self::compute_durations(&self.regions, timestamps, self.timestamp_period);

            drop(data);
            self.result_buffer.read_buffer().unmap();
            self.cached_results = results;
        }

        &self.cached_results
    }

    /// Get the number of queries used this frame.
    #[inline]
    pub fn queries_used(&self) -> u32 {
        self.current_query
    }

    /// Get the maximum queries per frame.
    #[inline]
    pub fn max_queries(&self) -> u32 {
        self.max_queries
    }

    /// Get the timestamp period in nanoseconds per tick.
    #[inline]
    pub fn timestamp_period(&self) -> f32 {
        self.timestamp_period
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_query_type_conversion() {
        // Conversion to the wgpu enum must not panic for any variant.
        for ty in [QueryType::Timestamp, QueryType::Occlusion] {
            let _ = ty.to_wgpu();
        }
    }

    #[test]
    fn test_profile_region_debug() {
        // ProfileRegion must implement Debug for diagnostics.
        let region = ProfileRegion {
            label: String::from("Test"),
            start_query: 0,
        };
        let rendered = format!("{:?}", region);
        assert!(!rendered.is_empty());
    }
}
}