wgpu_profiler/
profiler.rs

use std::{
    collections::HashMap,
    sync::{
        atomic::{AtomicU32, Ordering},
        Arc,
    },
};

use parking_lot::{Mutex, RwLock};

use crate::{
    CreationError, EndFrameError, GpuProfilerQuery, GpuProfilerSettings, GpuTimerQueryResult,
    ManualOwningScope, OwningScope, ProfilerCommandRecorder, Scope, SettingsError,
};
/// Profiler instance.
///
/// You can have an arbitrary number of independent profiler instances per application/adapter.
/// Manages all the necessary [`wgpu::QuerySet`] and [`wgpu::Buffer`] behind the scenes.
///
/// Any query creation method may allocate a new [`wgpu::QuerySet`] and [`wgpu::Buffer`] internally if necessary.
///
/// [`GpuProfiler`] is associated with a single [`wgpu::Device`] upon creation.
/// All referenced wgpu objects passed in subsequent calls must originate from that device.
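///
/// A minimal per-frame flow as a sketch (assuming `device` and `queue` exist and the
/// relevant timestamp features were requested at device creation):
///
/// ```ignore
/// let mut profiler = GpuProfiler::new(&device, GpuProfilerSettings::default())?;
/// // Every frame:
/// let mut encoder = device.create_command_encoder(&Default::default());
/// {
///     let _scope = profiler.scope("frame", &mut encoder);
///     // ... record profiled work ...
/// }
/// profiler.resolve_queries(&mut encoder);
/// queue.submit([encoder.finish()]);
/// profiler.end_frame()?;
/// ```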
pub struct GpuProfiler {
    device: wgpu::Device,

    unused_pools: Vec<QueryPool>,

    active_frame: ActiveFrame,
    pending_frames: Vec<PendingFrame>,

    num_open_queries: AtomicU32,
    next_query_handle: AtomicU32,

    size_for_new_query_pools: u32,

    settings: GpuProfilerSettings,

    #[cfg(feature = "tracy")]
    tracy_context: Option<tracy_client::GpuContext>,
}

// Public interface
impl GpuProfiler {
    /// Combination of all timer query features [`GpuProfiler`] can leverage.
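    ///
    /// As a sketch, one might request the supported subset of these features at device
    /// creation (the exact [`wgpu::DeviceDescriptor`] fields depend on your wgpu version):
    ///
    /// ```ignore
    /// let features = adapter.features() & GpuProfiler::ALL_WGPU_TIMER_FEATURES;
    /// let (device, queue) = adapter
    ///     .request_device(
    ///         &wgpu::DeviceDescriptor {
    ///             required_features: features,
    ///             ..Default::default()
    ///         },
    ///         None,
    ///     )
    ///     .await?;
    /// ```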
    pub const ALL_WGPU_TIMER_FEATURES: wgpu::Features = wgpu::Features::TIMESTAMP_QUERY
        .union(wgpu::Features::TIMESTAMP_QUERY_INSIDE_ENCODERS)
        .union(wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES);

    /// Combination of all timer query features [`GpuProfiler`] can leverage.
    #[deprecated(since = "0.9.0", note = "Use ALL_WGPU_TIMER_FEATURES instead")]
    pub const REQUIRED_WGPU_FEATURES: wgpu::Features = GpuProfiler::ALL_WGPU_TIMER_FEATURES;

    /// Creates a new Profiler object.
    ///
    /// There is nothing preventing the use of several independent profiler objects.
    pub fn new(
        device: &wgpu::Device,
        settings: GpuProfilerSettings,
    ) -> Result<Self, CreationError> {
        settings.validate()?;

        let (closed_scope_sender, closed_scope_receiver) = std::sync::mpsc::channel();

        Ok(GpuProfiler {
            device: device.clone(),

            unused_pools: Vec::new(),

            pending_frames: Vec::with_capacity(settings.max_num_pending_frames),
            active_frame: ActiveFrame {
                query_pools: RwLock::new(PendingFramePools::default()),
                closed_query_sender: closed_scope_sender,
                closed_query_receiver: Mutex::new(closed_scope_receiver),
            },

            num_open_queries: AtomicU32::new(0),
            next_query_handle: AtomicU32::new(0),

            size_for_new_query_pools: QueryPool::MIN_CAPACITY,

            settings,

            #[cfg(feature = "tracy")]
            tracy_context: None,
        })
    }

    /// Creates a new profiler and connects to a running Tracy client.
    #[cfg(feature = "tracy")]
    pub fn new_with_tracy_client(
        settings: GpuProfilerSettings,
        backend: wgpu::Backend,
        device: &wgpu::Device,
        queue: &wgpu::Queue,
    ) -> Result<Self, CreationError> {
        let mut profiler = Self::new(device, settings)?;
        profiler.tracy_context = Some(crate::tracy::create_tracy_gpu_client(
            backend, device, queue,
        )?);
        Ok(profiler)
    }

    /// Returns currently active settings.
    pub fn settings(&self) -> &GpuProfilerSettings {
        &self.settings
    }

    /// Changes the settings of an existing profiler.
    ///
    /// If timer scopes are disabled by setting [`GpuProfilerSettings::enable_timer_queries`] to false,
    /// any timer queries that are in flight will still be processed,
    /// but unused query sets and buffers will be deallocated during [`Self::process_finished_frame`].
    /// Similarly, any opened debugging scope will still be closed if debug groups are disabled by setting
    /// [`GpuProfilerSettings::enable_debug_groups`] to false.
    pub fn change_settings(&mut self, settings: GpuProfilerSettings) -> Result<(), SettingsError> {
        settings.validate()?;
        if !settings.enable_timer_queries {
            self.unused_pools.clear();
        }
        self.settings = settings;

        Ok(())
    }

    /// Starts a new auto-closing profiler scope.
    ///
    /// To nest scopes inside this scope, call [`Scope::scope`] on the returned scope.
    ///
    /// If a [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`]
    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_ENCODERS`], no GPU timer will
    /// be queried and the scope will not show up in the final results.
    /// If a [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`]
    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no scope will be opened.
    ///
    /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass.
    ///
    /// Scope is automatically closed on drop.
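    ///
    /// A sketch of nesting (assuming `profiler` and `encoder` exist):
    ///
    /// ```ignore
    /// let mut scope = profiler.scope("frame", &mut encoder);
    /// {
    ///     let _child = scope.scope("prepare");
    ///     // ... commands recorded on the child scope ...
    /// } // `_child` closes here, `scope` on its own drop.
    /// ```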
    #[must_use]
    #[track_caller]
    #[inline]
    pub fn scope<'a, Recorder: ProfilerCommandRecorder>(
        &'a self,
        label: impl Into<String>,
        encoder_or_pass: &'a mut Recorder,
    ) -> Scope<'a, Recorder> {
        let scope = self.begin_query(label, encoder_or_pass);
        Scope {
            profiler: self,
            recorder: encoder_or_pass,
            scope: Some(scope),
        }
    }

    /// Starts a new auto-closing profiler scope that takes ownership of the passed encoder or rendering/compute pass.
    ///
    /// To nest scopes inside this scope, call [`OwningScope::scope`] on the returned scope.
    ///
    /// If a [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`]
    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_ENCODERS`], no GPU timer will be queried
    /// and the scope will not show up in the final results.
    /// If a [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`]
    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no scope will be opened.
    ///
    /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass.
    ///
    /// Scope is automatically closed on drop.
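    ///
    /// A sketch for an owned compute pass (assuming the returned scope derefs to the
    /// wrapped pass, and that the pass-timestamp feature is available):
    ///
    /// ```ignore
    /// let mut pass = profiler.owning_scope(
    ///     "culling",
    ///     encoder.begin_compute_pass(&wgpu::ComputePassDescriptor::default()),
    /// );
    /// pass.set_pipeline(&pipeline);
    /// pass.dispatch_workgroups(64, 1, 1);
    /// // Dropping `pass` closes both the profiling query and the compute pass.
    /// ```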
    #[must_use]
    #[track_caller]
    #[inline]
    pub fn owning_scope<Recorder: ProfilerCommandRecorder>(
        &'_ self,
        label: impl Into<String>,
        mut encoder_or_pass: Recorder,
    ) -> OwningScope<'_, Recorder> {
        let scope = self.begin_query(label, &mut encoder_or_pass);
        OwningScope {
            profiler: self,
            recorder: encoder_or_pass,
            scope: Some(scope),
        }
    }

    /// Starts a new **manually closed** profiler scope that takes ownership of the passed encoder or rendering/compute pass.
    ///
    /// Does NOT call [`GpuProfiler::end_query()`] on drop.
    /// This construct is just for completeness in cases where working with scopes is preferred, but one can't rely on the drop call happening in the right place.
    /// This is useful when the owned value needs to be recovered after the end of the scope.
    /// In particular, to submit a [`wgpu::CommandEncoder`] to a queue, ownership of the encoder is necessary.
    ///
    /// To nest scopes inside this scope, call [`ManualOwningScope::scope`] on the returned scope.
    ///
    /// If a [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`]
    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_ENCODERS`], no GPU timer will be queried and the scope will
    /// not show up in the final results.
    /// If a [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`]
    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no scope will be opened.
    ///
    /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass.
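    ///
    /// A sketch of recovering the encoder for submission (assuming the scope's
    /// `end_query` method hands back the wrapped recorder):
    ///
    /// ```ignore
    /// let mut scope = profiler.manual_owning_scope("frame", encoder);
    /// // ... record work through `scope` ...
    /// let mut encoder = scope.end_query(); // Recover ownership of the encoder.
    /// queue.submit([encoder.finish()]);
    /// ```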
    #[must_use]
    #[track_caller]
    #[inline]
    pub fn manual_owning_scope<Recorder: ProfilerCommandRecorder>(
        &self,
        label: impl Into<String>,
        mut encoder_or_pass: Recorder,
    ) -> ManualOwningScope<'_, Recorder> {
        let scope = self.begin_query(label, &mut encoder_or_pass);
        ManualOwningScope {
            profiler: self,
            recorder: encoder_or_pass,
            scope: Some(scope),
        }
    }

    /// Starts a new profiler query on the given encoder or rendering/compute pass (if enabled).
    ///
    /// The returned query *must* be closed by calling [`GpuProfiler::end_query`] with the same encoder/pass,
    /// even if timer queries are disabled.
    /// To do this automatically, use [`GpuProfiler::scope`]/[`GpuProfiler::owning_scope`] instead.
    ///
    /// If a [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`]
    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_ENCODERS`], no GPU timer will be queried and the scope will
    /// not show up in the final results.
    /// If a [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`]
    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no timer queries will be allocated.
    ///
    /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass.
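    ///
    /// A sketch of the manual begin/end pairing (assuming `profiler` and `encoder` exist):
    ///
    /// ```ignore
    /// let query = profiler.begin_query("my work", &mut encoder);
    /// // ... record commands on `encoder` ...
    /// profiler.end_query(&mut encoder, query);
    /// ```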
    #[track_caller]
    #[must_use]
    pub fn begin_query<Recorder: ProfilerCommandRecorder>(
        &self,
        label: impl Into<String>,
        encoder_or_pass: &mut Recorder,
    ) -> GpuProfilerQuery {
        let is_for_pass_timestamp_writes = false;
        let mut query =
            self.begin_query_internal(label.into(), is_for_pass_timestamp_writes, encoder_or_pass);
        if let Some(timer_query) = &mut query.timer_query_pair {
            encoder_or_pass
                .write_timestamp(&timer_query.pool.query_set, timer_query.start_query_idx);
            timer_query.usage_state = QueryPairUsageState::OnlyStartWritten;
        };

        if self.settings.enable_debug_groups {
            encoder_or_pass.push_debug_group(&query.label);
            query.has_debug_group = true;
        }
        query
    }

    /// Starts a new profiler query to be used for render/compute pass timestamp writes.
    ///
    /// The returned query *must* be closed by calling [`GpuProfiler::end_query`], even if timer queries are disabled.
    /// To do this automatically, use [`Scope::scoped_render_pass`]/[`Scope::scoped_compute_pass`] instead.
    ///
    /// Call [`GpuProfilerQuery::render_pass_timestamp_writes`] or [`GpuProfilerQuery::compute_pass_timestamp_writes`]
    /// to acquire the corresponding [`wgpu::RenderPassTimestampWrites`]/[`wgpu::ComputePassTimestampWrites`] object.
    ///
    /// If the [`wgpu::Device`] does not support [`wgpu::Features::TIMESTAMP_QUERY`], no GPU timer will be reserved.
    ///
    /// Unlike [`GpuProfiler::begin_query`], this will not create a debug scope,
    /// so that the same encoder/pass does not have to be passed to [`GpuProfiler::end_query`].
    /// (This relaxes resource tracking requirements a bit and makes it easier to implement the automatic scopes.)
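    ///
    /// A sketch for a compute pass (assuming the timestamp-writes getter yields the
    /// `Option` the pass descriptor expects):
    ///
    /// ```ignore
    /// let query = profiler.begin_pass_query("culling", &mut encoder);
    /// {
    ///     let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
    ///         label: Some("culling"),
    ///         timestamp_writes: query.compute_pass_timestamp_writes(),
    ///     });
    ///     // ... dispatch ...
    /// }
    /// profiler.end_query(&mut encoder, query);
    /// ```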
    pub fn begin_pass_query(
        &self,
        label: impl Into<String>,
        encoder: &mut wgpu::CommandEncoder,
    ) -> GpuProfilerQuery {
        let is_for_pass_timestamp_writes = true;
        let mut query =
            self.begin_query_internal(label.into(), is_for_pass_timestamp_writes, encoder);
        if let Some(timer_query) = &mut query.timer_query_pair {
            timer_query.usage_state = QueryPairUsageState::ReservedForPassTimestampWrites;
        }
        query
    }

    /// Ends the passed query.
    ///
    /// If the passed query was opened with [`GpuProfiler::begin_query`], the passed encoder or pass must be the same
    /// as when the query was opened.
    pub fn end_query<Recorder: ProfilerCommandRecorder>(
        &self,
        encoder_or_pass: &mut Recorder,
        mut query: GpuProfilerQuery,
    ) {
        if let Some(timer_query) = &mut query.timer_query_pair {
            match timer_query.usage_state {
                QueryPairUsageState::Reserved => {
                    unreachable!("Query pair has been reserved but isn't used for anything!")
                }
                QueryPairUsageState::ReservedForPassTimestampWrites => {
                    // No need to do a timestamp write, this is handled by wgpu.
                }
                QueryPairUsageState::OnlyStartWritten => {
                    encoder_or_pass.write_timestamp(
                        &timer_query.pool.query_set,
                        timer_query.start_query_idx + 1,
                    );
                    timer_query.usage_state = QueryPairUsageState::BothStartAndEndWritten;
                }
                QueryPairUsageState::BothStartAndEndWritten => {
                    unreachable!("Query pair has already been used!")
                }
            }
        }

        #[cfg(feature = "tracy")]
        if let Some(ref mut tracy_scope) = query.tracy_scope {
            tracy_scope.end_zone();
        }

        if query.has_debug_group {
            encoder_or_pass.pop_debug_group();
        }

        let send_result = self.active_frame.closed_query_sender.send(query);

        // The only way we can fail sending the query is if the receiver has been dropped.
        // Since it sits on `active_frame` as well, there's no way for this to happen!
        debug_assert!(send_result.is_ok());

        // Count down open queries even if this one wasn't backed by a timer query;
        // this keeps the behavior consistent when support for some query types is missing.
        self.num_open_queries.fetch_sub(1, Ordering::Release);
    }

    /// Puts query resolve commands in the encoder for all unresolved, pending queries of the active profiler frame.
    ///
    /// Note that you do *not* need to do this for every encoder; it is sufficient to do this once per frame as long
    /// as you submit the corresponding command buffer after all others that may have opened queries in the same frame.
    /// (It does not matter whether the passed encoder itself has previously opened queries or not.)
    /// If you were to make this part of a command buffer that is enqueued before any other that has
    /// opened queries in the same profiling frame, no failure will occur but some timing results may be invalid.
    ///
    /// It is advised to call this only once at the end of a profiling frame, but it is safe to do so several times.
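    ///
    /// A typical end-of-frame sequence as a sketch (assuming `profiler`, `queue`, and `encoder` exist):
    ///
    /// ```ignore
    /// profiler.resolve_queries(&mut encoder);
    /// queue.submit([encoder.finish()]);
    /// profiler.end_frame().expect("no open or unresolved queries at frame end");
    /// ```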
    ///
    /// Implementation note:
    /// This method could be made `&self`, taking the internal lock on the query pools.
    /// However, the intended use is to call this once at the end of a frame, so we instead
    /// encourage this explicit sync point and avoid the lock.
    pub fn resolve_queries(&mut self, encoder: &mut wgpu::CommandEncoder) {
        let query_pools = self.active_frame.query_pools.get_mut();

        for query_pool in query_pools.used_pools.iter_mut() {
            // We sync with the last update of num_used_queries (which has Release semantics),
            // mostly to be on the safe side: it happened inside a lock, which gives it release
            // semantics anyway, but if we didn't acquire here we might miss prior effects of
            // the query begin on the other side.
            let num_used_queries = query_pool.num_used_queries.load(Ordering::Acquire);
            let num_resolved_queries = query_pool.num_resolved_queries.load(Ordering::Acquire);

            if num_resolved_queries == num_used_queries {
                continue;
            }

            debug_assert!(query_pool.capacity >= num_used_queries);
            debug_assert!(num_resolved_queries < num_used_queries);

            // Resolve into offset 0 of the resolve buffer - this way we don't have to worry about
            // the offset restrictions on resolve buffers (`wgpu::QUERY_RESOLVE_BUFFER_ALIGNMENT`),
            // and we copy the results out anyway.
            encoder.resolve_query_set(
                &query_pool.query_set,
                num_resolved_queries..num_used_queries,
                &query_pool.resolve_buffer,
                0,
            );
            // Copy the newly resolved queries into the read buffer, making sure
            // that we don't overwrite any of the results that are already there.
            let destination_offset = (num_resolved_queries * wgpu::QUERY_SIZE) as u64;
            let copy_size = ((num_used_queries - num_resolved_queries) * wgpu::QUERY_SIZE) as u64;
            encoder.copy_buffer_to_buffer(
                &query_pool.resolve_buffer,
                0,
                &query_pool.read_buffer,
                destination_offset,
                copy_size,
            );

            query_pool
                .num_resolved_queries
                .store(num_used_queries, Ordering::Release);
        }
    }

    /// Marks the end of a frame.
    ///
    /// Needs to be called **after** submitting any encoder used in the current profiler frame.
    ///
    /// Fails if there are still open queries or unresolved queries.
    pub fn end_frame(&mut self) -> Result<(), EndFrameError> {
        let num_open_queries = self.num_open_queries.load(Ordering::Acquire);
        if num_open_queries != 0 {
            return Err(EndFrameError::UnclosedQueries(num_open_queries));
        }

        let query_pools = self.active_frame.query_pools.get_mut();

        let mut new_pending_frame = PendingFrame {
            query_pools: std::mem::take(&mut query_pools.used_pools),
            closed_query_by_parent_handle: HashMap::new(),
            mapped_buffers: Arc::new(AtomicU32::new(0)),
        };

        for query in self.active_frame.closed_query_receiver.get_mut().try_iter() {
            new_pending_frame
                .closed_query_by_parent_handle
                .entry(query.parent_handle)
                .or_default()
                .push(query);
        }

        // All loads of pool.num_used_queries are Relaxed since we assume that we already
        // acquired the state during `resolve_queries` and that no further, otherwise
        // unobserved, modifications happened since then.

        let num_unresolved_queries = new_pending_frame
            .query_pools
            .iter()
            .map(|pool| {
                pool.num_used_queries.load(Ordering::Relaxed)
                    - pool.num_resolved_queries.load(Ordering::Relaxed)
            })
            .sum();
        if num_unresolved_queries != 0 {
            return Err(EndFrameError::UnresolvedQueries(num_unresolved_queries));
        }

        // Next time we create a new query pool, we want it to be at least big enough to hold all queries of this frame.
        self.size_for_new_query_pools = self
            .size_for_new_query_pools
            .max(
                new_pending_frame
                    .query_pools
                    .iter()
                    .map(|pool| pool.num_used_queries.load(Ordering::Relaxed))
                    .sum(),
            )
            .min(QUERY_SET_MAX_QUERIES);

        // Make sure we don't overflow.
        if self.pending_frames.len() == self.settings.max_num_pending_frames {
            // Drop the most recent (!) pending frame.
            // Dropping the oldest frame could get us into an endless cycle where we're never able to complete
            // any pending frames, as the ones closest to completion would be evicted first.
            if let Some(dropped_frame) = self.pending_frames.pop() {
                // Drop the queries first since they still hold references to the query pools that we want to reuse.
                drop(dropped_frame.closed_query_by_parent_handle);

                // Mark the frame as dropped. We'll give back the query pools once the mapping is done.
                // Any previously issued `map_async` calls that haven't finished yet will invoke their callbacks with a mapping abort.
                self.reset_and_cache_unused_query_pools(dropped_frame.query_pools);
            }
        }

        // Map all buffers.
        for pool in new_pending_frame.query_pools.iter_mut() {
            let mapped_buffers = new_pending_frame.mapped_buffers.clone();
            pool.read_buffer
                .slice(0..(pool.num_used_queries.load(Ordering::Relaxed) * wgpu::QUERY_SIZE) as u64)
                .map_async(wgpu::MapMode::Read, move |mapping_result| {
                    // Mapping should not fail unless it was cancelled due to the frame being dropped.
                    match mapping_result {
                        Err(_) => {
                            // We only want to ignore the error iff the mapping has been aborted by us
                            // (due to a dropped frame, see above).
                            // In any other case we should panic, as this would imply something went seriously sideways.
                            //
                            // As of writing, this is not yet possible in wgpu, see https://github.com/gfx-rs/wgpu/pull/2939
                        }
                        Ok(()) => {
                            mapped_buffers.fetch_add(1, std::sync::atomic::Ordering::Release);
                        }
                    }
                });
        }

        // Enqueue the frame.
        self.pending_frames.push(new_pending_frame);
        assert!(self.pending_frames.len() <= self.settings.max_num_pending_frames);

        Ok(())
    }

    /// Checks if all timer queries for the oldest pending finished frame are done and returns that snapshot, if any.
    ///
    /// `timestamp_period`:
    ///    The timestamp period of the device. Pass the result of [`wgpu::Queue::get_timestamp_period()`].
    ///    Note that some implementations (Chrome as of writing) may only converge to the actual timestamp period
    ///    while the application is running, so caching this value is usually not recommended.
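    ///
    /// A polling sketch, once per frame (assuming `profiler` and `queue` exist, and that
    /// each result's `time` is an optional range in seconds):
    ///
    /// ```ignore
    /// if let Some(results) = profiler.process_finished_frame(queue.get_timestamp_period()) {
    ///     for result in &results {
    ///         if let Some(time) = &result.time {
    ///             println!("{}: {:.3}ms", result.label, (time.end - time.start) * 1000.0);
    ///         }
    ///     }
    /// }
    /// ```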
    pub fn process_finished_frame(
        &mut self,
        timestamp_period: f32,
    ) -> Option<Vec<GpuTimerQueryResult>> {
        let frame = self.pending_frames.first_mut()?;

        // We only process if all mappings succeeded.
        if frame
            .mapped_buffers
            .load(std::sync::atomic::Ordering::Acquire)
            != frame.query_pools.len() as u32
        {
            return None;
        }

        let PendingFrame {
            query_pools,
            mut closed_query_by_parent_handle,
            mapped_buffers: _,
        } = self.pending_frames.remove(0);

        let results = {
            let timestamp_to_sec = timestamp_period as f64 / 1000.0 / 1000.0 / 1000.0;

            Self::process_timings_recursive(
                timestamp_to_sec,
                &mut closed_query_by_parent_handle,
                ROOT_QUERY_HANDLE,
            )
        };

        // Ensure that closed queries no longer hold references to the query pools.
        // `process_timings_recursive` should have handled this already.
        debug_assert!(closed_query_by_parent_handle.is_empty());
        drop(closed_query_by_parent_handle); // But just in case, we make sure to drop it here even if the above debug assertion fails.

        self.reset_and_cache_unused_query_pools(query_pools);

        Some(results)
    }
}

// --------------------------------------------------------------------------------
// Internals
// --------------------------------------------------------------------------------

const QUERY_SET_MAX_QUERIES: u32 = wgpu::QUERY_SET_MAX_QUERIES;

/// Returns true if a timestamp query is supported.
fn timestamp_query_support<Recorder: ProfilerCommandRecorder>(
    is_for_pass_timestamp_writes: bool,
    encoder_or_pass: &mut Recorder,
    features: wgpu::Features,
) -> bool {
    let required_feature = if is_for_pass_timestamp_writes {
        wgpu::Features::TIMESTAMP_QUERY
    } else if encoder_or_pass.is_pass() {
        wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES
    } else {
        wgpu::Features::TIMESTAMP_QUERY_INSIDE_ENCODERS
    };
    features.contains(required_feature)
}

impl GpuProfiler {
    fn next_scope_tree_handle(&self) -> GpuTimerQueryTreeHandle {
        // Relaxed is fine; we just want a number that nobody else uses this frame.
        let mut handle = self.next_query_handle.fetch_add(1, Ordering::Relaxed);

        // We don't ever expect to run out of handles during a single frame, but who knows how long the app runs.
        while handle == ROOT_QUERY_HANDLE {
            handle = self.next_query_handle.fetch_add(1, Ordering::Relaxed);
        }

        handle
    }

    fn reset_and_cache_unused_query_pools(&mut self, mut discarded_pools: Vec<Arc<QueryPool>>) {
        let capacity_threshold = self.size_for_new_query_pools / 2;
        for pool in discarded_pools.drain(..) {
            // If the pool is truly unused now, its ref count should be 1!
            // If we use it anywhere else, we have an implementation bug.
            let mut pool = Arc::into_inner(pool).expect("Pool still in use");
            pool.reset();

            // If a pool was less than half the size of the max frame, we don't keep it.
            // This way we're going to need fewer pools in upcoming frames and thus have less overhead in the long run.
            // If timer queries were disabled, we also don't keep any pools.
            if self.settings.enable_timer_queries && pool.capacity >= capacity_threshold {
                self.active_frame
                    .query_pools
                    .get_mut()
                    .unused_pools
                    .push(pool);
            }
        }
    }

    fn try_reserve_query_pair(pool: &Arc<QueryPool>) -> Option<ReservedTimerQueryPair> {
        let mut num_used_queries = pool.num_used_queries.load(Ordering::Relaxed);

        loop {
            if pool.capacity < num_used_queries + 2 {
                // This pool is out of capacity, we failed the operation.
                return None;
            }

            match pool.num_used_queries.compare_exchange_weak(
                num_used_queries,
                num_used_queries + 2,
                // Write to num_used_queries with release semantics to be on the safe side.
                // (It doesn't look like there are other side effects that we need to publish.)
                Ordering::Release,
                // No barrier for the failure case.
                // The only thing we have to acquire is the pool's capacity, which is constant and
                // was definitely acquired by the RwLock prior to this call.
                Ordering::Relaxed,
            ) {
                Ok(_) => {
                    // We successfully acquired two queries!
                    return Some(ReservedTimerQueryPair {
                        pool: pool.clone(),
                        start_query_idx: num_used_queries,
                        usage_state: QueryPairUsageState::Reserved,
                    });
                }
                Err(updated) => {
                    // Someone else acquired queries in the meantime, try again.
                    num_used_queries = updated;
                }
            }
        }
    }

    // Reserves two query objects.
    // Our query pools always have an even number of queries, so both queries of a pair land in the same pool.
    fn reserve_query_pair(&self) -> ReservedTimerQueryPair {
        // First, try to allocate from the current top pool.
        // Requires taking a read lock on the current query pool.
        {
            let query_pools = self.active_frame.query_pools.read();
            if let Some(pair) = query_pools
                .used_pools
                .last()
                .and_then(Self::try_reserve_query_pair)
            {
                return pair;
            }
        }
        // If this didn't work, we may need to add a new pool.
        // Requires taking a write lock on the current query pool.
        {
            let mut query_pools = self.active_frame.query_pools.write();

            // It could be that by now, another thread has already added a new pool!
            // This is a bit unfortunate because it means we unnecessarily took a write lock, but it seems hard to get around this.
            if let Some(pair) = query_pools
                .used_pools
                .last()
                .and_then(Self::try_reserve_query_pair)
            {
                return pair;
            }

            // Now we know for certain that the last pool is exhausted, so add a new one!
            let new_pool = if let Some(reused_pool) = query_pools.unused_pools.pop() {
                // First check if there's an unused pool we can take.
                Arc::new(reused_pool)
            } else {
                // If we can't, create a new pool that is as big as all previous pools combined.
                Arc::new(QueryPool::new(
                    query_pools
                        .used_pools
                        .iter()
                        .map(|pool| pool.capacity)
                        .sum::<u32>()
                        .max(self.size_for_new_query_pools)
                        .min(QUERY_SET_MAX_QUERIES),
                    &self.device,
                ))
            };

            let pair = Self::try_reserve_query_pair(&new_pool)
                .expect("Freshly added pool doesn't have enough capacity");
            query_pools.used_pools.push(new_pool);

            pair
        }
    }

    #[track_caller]
    #[must_use]
    fn begin_query_internal<Recorder: ProfilerCommandRecorder>(
        &self,
        label: String,
        is_for_pass_timestamp_writes: bool,
        encoder_or_pass: &mut Recorder,
    ) -> GpuProfilerQuery {
        // Give opening/closing queries acquire/release semantics:
        // This way, we won't get any nasty surprises when observing zero open queries.
        self.num_open_queries.fetch_add(1, Ordering::Acquire);

        let query = if self.settings.enable_timer_queries
            && timestamp_query_support(
                is_for_pass_timestamp_writes,
                encoder_or_pass,
                self.device.features(),
            ) {
            Some(self.reserve_query_pair())
        } else {
            None
        };

        let _tracy_scope = if self.settings.enable_timer_queries {
            #[cfg(feature = "tracy")]
            {
                let location = std::panic::Location::caller();
                self.tracy_context.as_ref().and_then(|c| {
                    c.span_alloc(&label, "", location.file(), location.line())
                        .ok()
                })
            }
            #[cfg(not(feature = "tracy"))]
            Option::<()>::None
        } else {
            None
        };

        let pid = if cfg!(target_arch = "wasm32") {
            0
        } else {
            std::process::id()
        };

        GpuProfilerQuery {
            label,
            pid,
            tid: std::thread::current().id(),
            timer_query_pair: query,
            handle: self.next_scope_tree_handle(),
            parent_handle: ROOT_QUERY_HANDLE,
            has_debug_group: false,
            #[cfg(feature = "tracy")]
            tracy_scope: _tracy_scope,
        }
    }

    fn process_timings_recursive(
        timestamp_to_sec: f64,
        closed_scope_by_parent_handle: &mut HashMap<GpuTimerQueryTreeHandle, Vec<GpuProfilerQuery>>,
        parent_handle: GpuTimerQueryTreeHandle,
    ) -> Vec<GpuTimerQueryResult> {
        let Some(queries_with_same_parent) = closed_scope_by_parent_handle.remove(&parent_handle)
        else {
            return Vec::new();
        };

        queries_with_same_parent
            .into_iter()
            .map(|mut scope| {
                // Note that inactive queries may still have nested queries; it's therefore important that we process all of them.
                // In particular, this happens if only `wgpu::Features::TIMESTAMP_QUERY` is enabled and `timestamp_writes`
                // on passes are nested inside inactive encoder timer queries.
                let time_raw = scope.timer_query_pair.take().map(|query| {
                    // Read the timestamps from the buffer.
                    // By design, the timestamps for start/end are consecutive.
                    let offset = (query.start_query_idx * wgpu::QUERY_SIZE) as u64;
                    let buffer_slice = &query
                        .pool
                        .read_buffer
                        .slice(offset..(offset + (wgpu::QUERY_SIZE * 2) as u64))
                        .get_mapped_range();
                    let start_raw = u64::from_le_bytes(
                        buffer_slice[0..wgpu::QUERY_SIZE as usize]
                            .try_into()
                            .unwrap(),
                    );
                    let end_raw = u64::from_le_bytes(
                        buffer_slice[wgpu::QUERY_SIZE as usize..(wgpu::QUERY_SIZE as usize) * 2]
                            .try_into()
                            .unwrap(),
                    );

                    start_raw..end_raw
                });

                let time = time_raw.as_ref().map(|time_raw| {
                    (time_raw.start as f64 * timestamp_to_sec)
                        ..(time_raw.end as f64 * timestamp_to_sec)
                });

                #[cfg(feature = "tracy")]
                if let (Some(tracy_scope), Some(time_raw)) = (&scope.tracy_scope, &time_raw) {
                    tracy_scope.upload_timestamp_start(time_raw.start as i64);
                }

                let nested_queries = Self::process_timings_recursive(
                    timestamp_to_sec,
                    closed_scope_by_parent_handle,
                    scope.handle,
                );

                #[cfg(feature = "tracy")]
                if let (Some(tracy_scope), Some(time_raw)) = (&scope.tracy_scope, time_raw) {
                    tracy_scope.upload_timestamp_end(time_raw.end as i64);
                }

                GpuTimerQueryResult {
                    label: std::mem::take(&mut scope.label),
                    time,
                    nested_queries,
                    pid: scope.pid,
                    tid: scope.tid,
                }
            })
            .collect::<Vec<_>>()
    }
}

#[derive(PartialEq, Eq)]
pub enum QueryPairUsageState {
    /// Transitional state used upon creation.
    Reserved,

    /// Don't do manual timestamp writes; wgpu is expected to do them for us.
    ReservedForPassTimestampWrites,

    /// The start query has been used, the end query is still available.
    OnlyStartWritten,

    /// Both start & end query have been used.
    BothStartAndEndWritten,
}

pub struct ReservedTimerQueryPair {
    /// [`QueryPool`] on which both start & end queries of the scope are done.
    ///
    /// By putting an Arc here instead of an index into a vec, we don't need
    /// to take any locks upon closing a profiling scope.
    pub pool: Arc<QueryPool>,

    /// Query index at which the scope begins.
    /// The query after this one is reserved for the end of the scope.
    pub start_query_idx: u32,

    /// Current use of the query pair.
    pub usage_state: QueryPairUsageState,
}

/// A pool of queries, consisting of a single query set & buffers for query results.
#[derive(Debug)]
pub struct QueryPool {
    pub query_set: wgpu::QuerySet,

    resolve_buffer: wgpu::Buffer,
    read_buffer: wgpu::Buffer,

    capacity: u32,
    num_used_queries: AtomicU32,
    num_resolved_queries: AtomicU32,
}

impl QueryPool {
    const MIN_CAPACITY: u32 = 32;

    fn new(capacity: u32, device: &wgpu::Device) -> Self {
        QueryPool {
            query_set: device.create_query_set(&wgpu::QuerySetDescriptor {
                label: Some("GpuProfiler - Query Set"),
                ty: wgpu::QueryType::Timestamp,
                count: capacity,
            }),

            resolve_buffer: device.create_buffer(&wgpu::BufferDescriptor {
                label: Some("GpuProfiler - Query Resolve Buffer"),
                size: (wgpu::QUERY_SIZE * capacity) as u64,
                usage: wgpu::BufferUsages::QUERY_RESOLVE | wgpu::BufferUsages::COPY_SRC,
                mapped_at_creation: false,
            }),

            read_buffer: device.create_buffer(&wgpu::BufferDescriptor {
                label: Some("GpuProfiler - Query Read Buffer"),
                size: (wgpu::QUERY_SIZE * capacity) as u64,
                usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
                mapped_at_creation: false,
            }),

            capacity,
            num_used_queries: AtomicU32::new(0),
            num_resolved_queries: AtomicU32::new(0),
        }
    }

    fn reset(&mut self) {
        self.num_used_queries = AtomicU32::new(0);
        self.num_resolved_queries = AtomicU32::new(0);
        self.read_buffer.unmap();
    }
}

#[derive(Default)]
struct PendingFramePools {
    /// List of all pools used in this frame.
    /// The last pool is the one that new profiling queries will try to allocate timer queries from.
    used_pools: Vec<Arc<QueryPool>>,

    /// List of unused pools recycled from previous frames.
    unused_pools: Vec<QueryPool>,
}

/// Internal handle for building a tree of profiling queries.
pub type GpuTimerQueryTreeHandle = u32;

/// Handle for the root scope.
pub const ROOT_QUERY_HANDLE: GpuTimerQueryTreeHandle = u32::MAX;

struct ActiveFrame {
    query_pools: RwLock<PendingFramePools>,

    /// Closed queries get sent to this channel.
    ///
    /// Note that a channel is still overkill for what we want here:
    /// We're in a multi-producer situation, *but* the single consumer is known to be only
    /// active in a mut context, i.e. while we're consuming we know that we're not producing.
    /// We have to wrap the receiver in a Mutex because the channel is not Sync, but we never actually lock it
    /// since we only ever access it in a `mut` context.
    closed_query_sender: std::sync::mpsc::Sender<GpuProfilerQuery>,
    closed_query_receiver: Mutex<std::sync::mpsc::Receiver<GpuProfilerQuery>>,
}

struct PendingFrame {
    query_pools: Vec<Arc<QueryPool>>,
    closed_query_by_parent_handle: HashMap<GpuTimerQueryTreeHandle, Vec<GpuProfilerQuery>>,

    /// Keeps track of the number of buffers in the query pool that have been mapped successfully.
    mapped_buffers: std::sync::Arc<std::sync::atomic::AtomicU32>,
}