wgpu_profiler/profiler.rs
use std::{
    collections::HashMap,
    sync::{
        atomic::{AtomicU32, Ordering},
        Arc,
    },
};

use parking_lot::{Mutex, RwLock};

use crate::{
    CreationError, EndFrameError, GpuProfilerQuery, GpuProfilerSettings, GpuTimerQueryResult,
    ManualOwningScope, OwningScope, ProfilerCommandRecorder, Scope, SettingsError,
};

/// Profiler instance.
///
/// You can have an arbitrary number of independent profiler instances per application/adapter.
/// Manages all the necessary [`wgpu::QuerySet`] and [`wgpu::Buffer`] behind the scenes.
///
/// Any query creation method may allocate a new [`wgpu::QuerySet`] and [`wgpu::Buffer`] internally if necessary.
///
/// [`GpuProfiler`] is associated with a single [`wgpu::Device`] upon creation.
/// All wgpu objects passed in subsequent calls must originate from that device.
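///
/// A typical per-frame flow looks roughly like this (a sketch only; it assumes `profiler`,
/// `device`, and `queue` exist in the surrounding code and is not a verbatim doctest):
///
/// ```ignore
/// let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor::default());
/// {
///     let _scope = profiler.scope("my work", &mut encoder); // Open a profiling scope.
///     // ... record profiled commands ...
/// }
/// profiler.resolve_queries(&mut encoder);                   // Queue up query resolves.
/// queue.submit(Some(encoder.finish()));
/// profiler.end_frame().unwrap();                            // Close the profiler frame.
///
/// // Usually a few frames later, the results for that frame become available:
/// if let Some(results) = profiler.process_finished_frame(queue.get_timestamp_period()) {
///     // ... inspect `results` (a tree of `GpuTimerQueryResult`) ...
/// }
/// ```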
pub struct GpuProfiler {
    device: wgpu::Device,

    unused_pools: Vec<QueryPool>,

    active_frame: ActiveFrame,
    pending_frames: Vec<PendingFrame>,

    num_open_queries: AtomicU32,
    next_query_handle: AtomicU32,

    size_for_new_query_pools: u32,

    settings: GpuProfilerSettings,

    #[cfg(feature = "tracy")]
    tracy_context: Option<tracy_client::GpuContext>,
}

// Public interface
impl GpuProfiler {
    /// Combination of all timer query features [`GpuProfiler`] can leverage.
    pub const ALL_WGPU_TIMER_FEATURES: wgpu::Features = wgpu::Features::TIMESTAMP_QUERY
        .union(wgpu::Features::TIMESTAMP_QUERY_INSIDE_ENCODERS)
        .union(wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES);

    /// Combination of all timer query features [`GpuProfiler`] can leverage.
    #[deprecated(since = "0.9.0", note = "Use ALL_WGPU_TIMER_FEATURES instead")]
    pub const REQUIRED_WGPU_FEATURES: wgpu::Features = GpuProfiler::ALL_WGPU_TIMER_FEATURES;

    /// Creates a new Profiler object.
    ///
    /// There is nothing preventing the use of several independent profiler objects.
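    ///
    /// A minimal setup sketch (it assumes a `wgpu::Device` named `device` already exists and that
    /// the default settings are acceptable; not a verbatim doctest):
    ///
    /// ```ignore
    /// use wgpu_profiler::{GpuProfiler, GpuProfilerSettings};
    ///
    /// // Create the profiler for this device; settings are validated up front.
    /// let profiler = GpuProfiler::new(&device, GpuProfilerSettings::default())
    ///     .expect("settings should validate");
    /// ```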
    pub fn new(
        device: &wgpu::Device,
        settings: GpuProfilerSettings,
    ) -> Result<Self, CreationError> {
        settings.validate()?;

        let (closed_scope_sender, closed_scope_receiver) = std::sync::mpsc::channel();

        Ok(GpuProfiler {
            device: device.clone(),

            unused_pools: Vec::new(),

            pending_frames: Vec::with_capacity(settings.max_num_pending_frames),
            active_frame: ActiveFrame {
                query_pools: RwLock::new(PendingFramePools::default()),
                closed_query_sender: closed_scope_sender,
                closed_query_receiver: Mutex::new(closed_scope_receiver),
            },

            num_open_queries: AtomicU32::new(0),
            next_query_handle: AtomicU32::new(0),

            size_for_new_query_pools: QueryPool::MIN_CAPACITY,

            settings,

            #[cfg(feature = "tracy")]
            tracy_context: None,
        })
    }

    /// Creates a new profiler and connects to a running Tracy client.
    #[cfg(feature = "tracy")]
    pub fn new_with_tracy_client(
        settings: GpuProfilerSettings,
        backend: wgpu::Backend,
        device: &wgpu::Device,
        queue: &wgpu::Queue,
    ) -> Result<Self, CreationError> {
        let mut profiler = Self::new(device, settings)?;
        profiler.tracy_context = Some(crate::tracy::create_tracy_gpu_client(
            backend, device, queue,
        )?);
        Ok(profiler)
    }

    /// Returns the currently active settings.
    pub fn settings(&self) -> &GpuProfilerSettings {
        &self.settings
    }

    /// Changes the settings of an existing profiler.
    ///
    /// If timer scopes are disabled by setting [`GpuProfilerSettings::enable_timer_queries`] to false,
    /// any timer queries that are in flight will still be processed,
    /// but unused query sets and buffers will be deallocated during [`Self::process_finished_frame`].
    /// Similarly, any open debug scope will still be closed if debug groups are disabled by setting
    /// [`GpuProfilerSettings::enable_debug_groups`] to false.
    pub fn change_settings(&mut self, settings: GpuProfilerSettings) -> Result<(), SettingsError> {
        settings.validate()?;
        if !settings.enable_timer_queries {
            self.unused_pools.clear();
        }
        self.settings = settings;

        Ok(())
    }

    /// Starts a new auto-closing profiler scope.
    ///
    /// To nest scopes inside this scope, call [`Scope::scope`] on the returned scope.
    ///
    /// If a [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`]
    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_ENCODERS`], no gpu timer will
    /// be queried and the scope will not show up in the final results.
    /// If a [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`]
    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no scope will be opened.
    ///
    /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass.
    ///
    /// The scope is automatically closed on drop.
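    ///
    /// A usage sketch (it assumes `profiler` and `device` exist in the surrounding code and that
    /// some work is recorded while the scope lives; not a verbatim doctest):
    ///
    /// ```ignore
    /// let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor::default());
    /// {
    ///     // Opens a timer query (and a debug group, if enabled) on the encoder.
    ///     let mut scope = profiler.scope("frame work", &mut encoder);
    ///     // ... record commands; the scope wraps the encoder for the duration of this block ...
    /// } // The scope is closed here on drop.
    /// ```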
    #[must_use]
    #[track_caller]
    #[inline]
    pub fn scope<'a, Recorder: ProfilerCommandRecorder>(
        &'a self,
        label: impl Into<String>,
        encoder_or_pass: &'a mut Recorder,
    ) -> Scope<'a, Recorder> {
        let scope = self.begin_query(label, encoder_or_pass);
        Scope {
            profiler: self,
            recorder: encoder_or_pass,
            scope: Some(scope),
        }
    }

    /// Starts a new auto-closing profiler scope that takes ownership of the passed encoder or rendering/compute pass.
    ///
    /// To nest scopes inside this scope, call [`OwningScope::scope`] on the returned scope.
    ///
    /// If a [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`]
    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_ENCODERS`], no gpu timer will be queried
    /// and the scope will not show up in the final results.
    /// If a [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`]
    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no scope will be opened.
    ///
    /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass.
    ///
    /// The scope is automatically closed on drop.
    #[must_use]
    #[track_caller]
    #[inline]
    pub fn owning_scope<Recorder: ProfilerCommandRecorder>(
        &'_ self,
        label: impl Into<String>,
        mut encoder_or_pass: Recorder,
    ) -> OwningScope<'_, Recorder> {
        let scope = self.begin_query(label, &mut encoder_or_pass);
        OwningScope {
            profiler: self,
            recorder: encoder_or_pass,
            scope: Some(scope),
        }
    }

    /// Starts a new **manually closed** profiler scope that takes ownership of the passed encoder or rendering/compute pass.
    ///
    /// Does NOT call [`GpuProfiler::end_query()`] on drop.
    /// This construct is just for completeness in cases where working with scopes is preferred, but one can't rely on the Drop call happening in the right place.
    /// This is useful when the owned value needs to be recovered after the end of the scope.
    /// In particular, to submit a [`wgpu::CommandEncoder`] to a queue, ownership of the encoder is necessary.
    ///
    /// To nest scopes inside this scope, call [`ManualOwningScope::scope`] on the returned scope.
    ///
    /// If a [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`]
    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_ENCODERS`], no gpu timer will be queried and the scope will
    /// not show up in the final results.
    /// If a [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`]
    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no scope will be opened.
    ///
    /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass.
    #[must_use]
    #[track_caller]
    #[inline]
    pub fn manual_owning_scope<Recorder: ProfilerCommandRecorder>(
        &self,
        label: impl Into<String>,
        mut encoder_or_pass: Recorder,
    ) -> ManualOwningScope<'_, Recorder> {
        let scope = self.begin_query(label, &mut encoder_or_pass);
        ManualOwningScope {
            profiler: self,
            recorder: encoder_or_pass,
            scope: Some(scope),
        }
    }

    /// Starts a new profiler query on the given encoder or rendering/compute pass (if enabled).
    ///
    /// The returned query *must* be closed by calling [`GpuProfiler::end_query`] with the same encoder/pass,
    /// even if timer queries are disabled.
    /// To do this automatically, use [`GpuProfiler::scope`]/[`GpuProfiler::owning_scope`] instead.
    ///
    /// If a [`wgpu::CommandEncoder`] is passed but the [`wgpu::Device`]
    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_ENCODERS`], no gpu timer will be queried and the scope will
    /// not show up in the final results.
    /// If a [`wgpu::ComputePass`] or [`wgpu::RenderPass`] is passed but the [`wgpu::Device`]
    /// does not support [`wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES`], no timer queries will be allocated.
    ///
    /// If [`GpuProfilerSettings::enable_debug_groups`] is true, a debug group will be pushed on the encoder or pass.
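    ///
    /// A manual pairing sketch (it assumes `profiler` and `encoder` exist in the surrounding code;
    /// not a verbatim doctest):
    ///
    /// ```ignore
    /// let query = profiler.begin_query("copy pass", &mut encoder);
    /// // ... record the commands that should be attributed to this query ...
    /// profiler.end_query(&mut encoder, query); // Must be the same encoder/pass as above.
    /// ```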
    #[track_caller]
    #[must_use]
    pub fn begin_query<Recorder: ProfilerCommandRecorder>(
        &self,
        label: impl Into<String>,
        encoder_or_pass: &mut Recorder,
    ) -> GpuProfilerQuery {
        let is_for_pass_timestamp_writes = false;
        let mut query =
            self.begin_query_internal(label.into(), is_for_pass_timestamp_writes, encoder_or_pass);
        if let Some(timer_query) = &mut query.timer_query_pair {
            encoder_or_pass
                .write_timestamp(&timer_query.pool.query_set, timer_query.start_query_idx);
            timer_query.usage_state = QueryPairUsageState::OnlyStartWritten;
        };

        if self.settings.enable_debug_groups {
            encoder_or_pass.push_debug_group(&query.label);
            query.has_debug_group = true;
        }
        query
    }

    /// Starts a new profiler query to be used for render/compute pass timestamp writes.
    ///
    /// The returned query *must* be closed by calling [`GpuProfiler::end_query`], even if timer queries are disabled.
    /// To do this automatically, use [`Scope::scoped_render_pass`]/[`Scope::scoped_compute_pass`] instead.
    ///
    /// Call [`GpuProfilerQuery::render_pass_timestamp_writes`] or [`GpuProfilerQuery::compute_pass_timestamp_writes`]
    /// to acquire the corresponding [`wgpu::RenderPassTimestampWrites`]/[`wgpu::ComputePassTimestampWrites`] object.
    ///
    /// If the [`wgpu::Device`] does not support [`wgpu::Features::TIMESTAMP_QUERY`], no gpu timer will be reserved.
    ///
    /// Unlike [`GpuProfiler::begin_query`], this will not create a debug scope,
    /// so that the same encoder/pass does not have to be passed to [`GpuProfiler::end_query`].
    /// (This relaxes resource tracking requirements a bit, which makes the automatic scopes easier to implement.)
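    ///
    /// A pass-timestamp sketch (it assumes `profiler` and `encoder` exist in the surrounding code
    /// and that the wgpu version in use accepts `timestamp_writes` in the pass descriptor; not a
    /// verbatim doctest):
    ///
    /// ```ignore
    /// let query = profiler.begin_pass_query("compute pass", &mut encoder);
    /// {
    ///     let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
    ///         label: Some("compute pass"),
    ///         timestamp_writes: Some(query.compute_pass_timestamp_writes()),
    ///     });
    ///     // ... dispatch work ...
    /// }
    /// profiler.end_query(&mut encoder, query);
    /// ```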
    pub fn begin_pass_query(
        &self,
        label: impl Into<String>,
        encoder: &mut wgpu::CommandEncoder,
    ) -> GpuProfilerQuery {
        let is_for_pass_timestamp_writes = true;
        let mut query =
            self.begin_query_internal(label.into(), is_for_pass_timestamp_writes, encoder);
        if let Some(timer_query) = &mut query.timer_query_pair {
            timer_query.usage_state = QueryPairUsageState::ReservedForPassTimestampWrites;
        }
        query
    }

    /// Ends the passed query.
    ///
    /// If the passed query was opened with [`GpuProfiler::begin_query`], the passed encoder or pass must be the same
    /// as when the query was opened.
    pub fn end_query<Recorder: ProfilerCommandRecorder>(
        &self,
        encoder_or_pass: &mut Recorder,
        mut query: GpuProfilerQuery,
    ) {
        if let Some(timer_query) = &mut query.timer_query_pair {
            match timer_query.usage_state {
                QueryPairUsageState::Reserved => {
                    unreachable!("Query pair has been reserved but isn't used for anything!")
                }
                QueryPairUsageState::ReservedForPassTimestampWrites => {
                    // No need to do a timestamp write, this is handled by wgpu.
                }
                QueryPairUsageState::OnlyStartWritten => {
                    encoder_or_pass.write_timestamp(
                        &timer_query.pool.query_set,
                        timer_query.start_query_idx + 1,
                    );
                    timer_query.usage_state = QueryPairUsageState::BothStartAndEndWritten;
                }
                QueryPairUsageState::BothStartAndEndWritten => {
                    unreachable!("Query pair has already been used!")
                }
            }
        }

        #[cfg(feature = "tracy")]
        if let Some(ref mut tracy_scope) = query.tracy_scope {
            tracy_scope.end_zone();
        }

        if query.has_debug_group {
            encoder_or_pass.pop_debug_group();
        }

        let send_result = self.active_frame.closed_query_sender.send(query);

        // The only way we can fail sending the query is if the receiver has been dropped.
        // Since it sits on `active_frame` as well, there's no way for this to happen!
        debug_assert!(send_result.is_ok());

        // Count down open queries even if this one wasn't backed by a gpu timer; this keeps
        // behavior consistent when support for some query types is missing.
        self.num_open_queries.fetch_sub(1, Ordering::Release);
    }

    /// Puts query resolve commands in the encoder for all unresolved, pending queries of the active profiler frame.
    ///
    /// Note that you do *not* need to do this for every encoder; it is sufficient to do this once per frame as long
    /// as you submit the corresponding command buffer after all others that may have opened queries in the same frame.
    /// (It does not matter whether the passed encoder itself has previously opened queries or not.)
    /// If you were to make this part of a command buffer that is enqueued before any other that has
    /// opened queries in the same profiling frame, no failure will occur but some timing results may be invalid.
    ///
    /// It is advised to call this only once at the end of a profiling frame, but it is safe to do so several times.
    ///
    /// Implementation note:
    /// This method could be made `&self`, taking the internal lock on the query pools.
    /// However, the intended use is to call this once at the end of a frame, so we instead
    /// encourage this explicit sync point and avoid the lock.
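    ///
    /// An end-of-frame sketch (it assumes `profiler`, `queue`, and an `encoder` that is submitted
    /// last exist in the surrounding code; not a verbatim doctest):
    ///
    /// ```ignore
    /// profiler.resolve_queries(&mut encoder);   // Record resolve + copy commands.
    /// queue.submit(Some(encoder.finish()));     // Submit after all profiled work.
    /// profiler.end_frame().expect("all queries closed and resolved");
    /// ```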
    pub fn resolve_queries(&mut self, encoder: &mut wgpu::CommandEncoder) {
        let query_pools = self.active_frame.query_pools.get_mut();

        for query_pool in query_pools.used_pools.iter_mut() {
            // We sync with the last update of num_used_queries (which has Release semantics)
            // mostly to be on the safe side - it happened inside a lock, which gives it release semantics anyways,
            // but the concern is that if we don't acquire here, we may miss prior side effects of the query begin.
            let num_used_queries = query_pool.num_used_queries.load(Ordering::Acquire);
            let num_resolved_queries = query_pool.num_resolved_queries.load(Ordering::Acquire);

            if num_resolved_queries == num_used_queries {
                continue;
            }

            debug_assert!(query_pool.capacity >= num_used_queries);
            debug_assert!(num_resolved_queries < num_used_queries);

            // Resolve into offset 0 of the resolve buffer - this way we don't have to worry about
            // the offset restrictions on resolve buffers (`wgpu::QUERY_RESOLVE_BUFFER_ALIGNMENT`),
            // since we copy the results out anyways.
            encoder.resolve_query_set(
                &query_pool.query_set,
                num_resolved_queries..num_used_queries,
                &query_pool.resolve_buffer,
                0,
            );
            // Copy the newly resolved queries into the read buffer, making sure
            // that we don't overwrite any of the results that are already there.
            let destination_offset = (num_resolved_queries * wgpu::QUERY_SIZE) as u64;
            let copy_size = ((num_used_queries - num_resolved_queries) * wgpu::QUERY_SIZE) as u64;
            encoder.copy_buffer_to_buffer(
                &query_pool.resolve_buffer,
                0,
                &query_pool.read_buffer,
                destination_offset,
                copy_size,
            );

            query_pool
                .num_resolved_queries
                .store(num_used_queries, Ordering::Release);
        }
    }

    /// Marks the end of a frame.
    ///
    /// Needs to be called **after** submitting any encoder used in the current profiler frame.
    ///
    /// Fails if there are still open queries or unresolved queries.
    pub fn end_frame(&mut self) -> Result<(), EndFrameError> {
        let num_open_queries = self.num_open_queries.load(Ordering::Acquire);
        if num_open_queries != 0 {
            return Err(EndFrameError::UnclosedQueries(num_open_queries));
        }

        let query_pools = self.active_frame.query_pools.get_mut();

        let mut new_pending_frame = PendingFrame {
            query_pools: std::mem::take(&mut query_pools.used_pools),
            closed_query_by_parent_handle: HashMap::new(),
            mapped_buffers: Arc::new(AtomicU32::new(0)),
        };

        for query in self.active_frame.closed_query_receiver.get_mut().try_iter() {
            new_pending_frame
                .closed_query_by_parent_handle
                .entry(query.parent_handle)
                .or_default()
                .push(query);
        }

        // All loads of pool.num_used_queries are Relaxed since we assume
        // that we already acquired the state during `resolve_queries` and no further,
        // otherwise unobserved, modifications happened since then.

        let num_unresolved_queries = new_pending_frame
            .query_pools
            .iter()
            .map(|pool| {
                pool.num_used_queries.load(Ordering::Relaxed)
                    - pool.num_resolved_queries.load(Ordering::Relaxed)
            })
            .sum();
        if num_unresolved_queries != 0 {
            return Err(EndFrameError::UnresolvedQueries(num_unresolved_queries));
        }

        // Next time we create a new query pool, we want it to be at least big enough to hold all queries of this frame.
        self.size_for_new_query_pools = self
            .size_for_new_query_pools
            .max(
                new_pending_frame
                    .query_pools
                    .iter()
                    .map(|pool| pool.num_used_queries.load(Ordering::Relaxed))
                    .sum(),
            )
            .min(QUERY_SET_MAX_QUERIES);

        // Make sure we don't overflow.
        if self.pending_frames.len() == self.settings.max_num_pending_frames {
            // Drop the previous (!) frame.
            // Dropping the oldest frame could get us into an endless cycle where we're never able to complete
            // any pending frames, as the ones closest to completion would be evicted.
            if let Some(dropped_frame) = self.pending_frames.pop() {
                // Drop the queries first since they still hold references to the query pools that we want to reuse.
                drop(dropped_frame.closed_query_by_parent_handle);

                // Mark the frame as dropped. We'll give back the query pools once the mapping is done.
                // Any previously issued map_async calls that haven't finished yet will invoke their callbacks with a mapping abort.
                self.reset_and_cache_unused_query_pools(dropped_frame.query_pools);
            }
        }

        // Map all buffers.
        for pool in new_pending_frame.query_pools.iter_mut() {
            let mapped_buffers = new_pending_frame.mapped_buffers.clone();
            pool.read_buffer
                .slice(0..(pool.num_used_queries.load(Ordering::Relaxed) * wgpu::QUERY_SIZE) as u64)
                .map_async(wgpu::MapMode::Read, move |mapping_result| {
                    // Mapping should not fail unless it was cancelled due to the frame being dropped.
                    match mapping_result {
                        Err(_) => {
                            // We only want to ignore the error iff the mapping has been aborted by us
                            // (due to a dropped frame, see above).
                            // In any other case we should panic, as this would imply something went seriously sideways.
                            //
                            // As of writing, this is not yet possible in wgpu, see https://github.com/gfx-rs/wgpu/pull/2939
                        }
                        Ok(()) => {
                            mapped_buffers.fetch_add(1, std::sync::atomic::Ordering::Release);
                        }
                    }
                });
        }

        // Enqueue the frame.
        self.pending_frames.push(new_pending_frame);
        assert!(self.pending_frames.len() <= self.settings.max_num_pending_frames);

        Ok(())
    }

    /// Checks if all timer queries for the oldest pending finished frame are done and returns that snapshot if any.
    ///
    /// `timestamp_period`:
    /// The timestamp period of the device. Pass the result of [`wgpu::Queue::get_timestamp_period()`].
    /// Note that some implementations (Chrome as of writing) may converge to a timestamp period while the application is running,
    /// so caching this value is usually not recommended.
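    ///
    /// A consumption sketch (it assumes `profiler` and `queue` exist in the surrounding code and
    /// that the frame's buffers have finished mapping; not a verbatim doctest):
    ///
    /// ```ignore
    /// if let Some(results) = profiler.process_finished_frame(queue.get_timestamp_period()) {
    ///     for result in &results {
    ///         // `time` is `None` for scopes that weren't backed by a gpu timer.
    ///         if let Some(time) = &result.time {
    ///             println!("{}: {:.3}ms", result.label, (time.end - time.start) * 1000.0);
    ///         }
    ///     }
    /// }
    /// ```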
    pub fn process_finished_frame(
        &mut self,
        timestamp_period: f32,
    ) -> Option<Vec<GpuTimerQueryResult>> {
        let frame = self.pending_frames.first_mut()?;

        // We only process the frame if all mappings succeeded.
        if frame
            .mapped_buffers
            .load(std::sync::atomic::Ordering::Acquire)
            != frame.query_pools.len() as u32
        {
            return None;
        }

        let PendingFrame {
            query_pools,
            mut closed_query_by_parent_handle,
            mapped_buffers: _,
        } = self.pending_frames.remove(0);

        let results = {
            let timestamp_to_sec = timestamp_period as f64 / 1000.0 / 1000.0 / 1000.0;

            Self::process_timings_recursive(
                timestamp_to_sec,
                &mut closed_query_by_parent_handle,
                ROOT_QUERY_HANDLE,
            )
        };

        // Ensure that closed queries no longer hold references to the query pools.
        // `process_timings_recursive` should have handled this already.
        debug_assert!(closed_query_by_parent_handle.is_empty());
        drop(closed_query_by_parent_handle); // But just in case, we make sure to drop it here even if the above debug assertion fails.

        self.reset_and_cache_unused_query_pools(query_pools);

        Some(results)
    }
}

// --------------------------------------------------------------------------------
// Internals
// --------------------------------------------------------------------------------

const QUERY_SET_MAX_QUERIES: u32 = wgpu::QUERY_SET_MAX_QUERIES;

/// Returns true if a timestamp query is supported.
fn timestamp_query_support<Recorder: ProfilerCommandRecorder>(
    is_for_pass_timestamp_writes: bool,
    encoder_or_pass: &mut Recorder,
    features: wgpu::Features,
) -> bool {
    let required_feature = if is_for_pass_timestamp_writes {
        wgpu::Features::TIMESTAMP_QUERY
    } else if encoder_or_pass.is_pass() {
        wgpu::Features::TIMESTAMP_QUERY_INSIDE_PASSES
    } else {
        wgpu::Features::TIMESTAMP_QUERY_INSIDE_ENCODERS
    };
    features.contains(required_feature)
}

impl GpuProfiler {
    fn next_scope_tree_handle(&self) -> GpuTimerQueryTreeHandle {
        // Relaxed is fine, we just want a number that nobody else uses this frame.
        let mut handle = self.next_query_handle.fetch_add(1, Ordering::Relaxed);

        // We don't ever expect to run out of handles during a single frame, but who knows how long the app runs.
        while handle == ROOT_QUERY_HANDLE {
            handle = self.next_query_handle.fetch_add(1, Ordering::Relaxed);
        }

        handle
    }

    fn reset_and_cache_unused_query_pools(&mut self, mut discarded_pools: Vec<Arc<QueryPool>>) {
        let capacity_threshold = self.size_for_new_query_pools / 2;
        for pool in discarded_pools.drain(..) {
            // If the pool is truly unused now, its ref count should be 1!
            // If we use it anywhere else, we have an implementation bug.
            let mut pool = Arc::into_inner(pool).expect("Pool still in use");
            pool.reset();

            // If a pool is less than half the size of the biggest frame so far, we don't keep it.
            // This way we'll need fewer pools in upcoming frames and thus have less overhead in the long run.
            // If timer queries were disabled, we also don't keep any pools.
            if self.settings.enable_timer_queries && pool.capacity >= capacity_threshold {
                self.active_frame
                    .query_pools
                    .get_mut()
                    .unused_pools
                    .push(pool);
            }
        }
    }

    fn try_reserve_query_pair(pool: &Arc<QueryPool>) -> Option<ReservedTimerQueryPair> {
        let mut num_used_queries = pool.num_used_queries.load(Ordering::Relaxed);

        loop {
            if pool.capacity < num_used_queries + 2 {
                // This pool is out of capacity, we failed the operation.
                return None;
            }

            match pool.num_used_queries.compare_exchange_weak(
                num_used_queries,
                num_used_queries + 2,
                // Write to num_used_queries with release semantics to be on the safe side.
                // (It doesn't look like there's other side effects that we need to publish.)
                Ordering::Release,
                // No barrier for the failure case.
                // The only thing we have to acquire is the pool's capacity which is constant and
                // was definitely acquired by the RWLock prior to this call.
                Ordering::Relaxed,
            ) {
                Ok(_) => {
                    // We successfully acquired two queries!
                    return Some(ReservedTimerQueryPair {
                        pool: pool.clone(),
                        start_query_idx: num_used_queries,
                        usage_state: QueryPairUsageState::Reserved,
                    });
                }
                Err(updated) => {
                    // Someone else acquired queries in the meantime, try again.
                    num_used_queries = updated;
                }
            }
        }
    }

    // Reserves two query objects.
    // Our query pools always have an even number of queries, so the end query always lands in the same pool as the start query.
    fn reserve_query_pair(&self) -> ReservedTimerQueryPair {
        // First, try to allocate from the current top pool.
        // Requires taking a read lock on the current query pool.
        {
            let query_pools = self.active_frame.query_pools.read();
            if let Some(pair) = query_pools
                .used_pools
                .last()
                .and_then(Self::try_reserve_query_pair)
            {
                return pair;
            }
        }
        // If this didn't work, we may need to add a new pool.
        // Requires taking a write lock on the current query pool.
        {
            let mut query_pools = self.active_frame.query_pools.write();

            // It could be that by now, another thread has already added a new pool!
            // This is a bit unfortunate because it means we unnecessarily took a write lock, but it seems hard to get around this.
            if let Some(pair) = query_pools
                .used_pools
                .last()
                .and_then(Self::try_reserve_query_pair)
            {
                return pair;
            }

            // Now we know for certain that the last pool is exhausted, so add a new one!
            let new_pool = if let Some(reused_pool) = query_pools.unused_pools.pop() {
                // First check if there's an unused pool we can take.
                Arc::new(reused_pool)
            } else {
                // If we can't, create a new pool that is as big as all previous pools combined.
                Arc::new(QueryPool::new(
                    query_pools
                        .used_pools
                        .iter()
                        .map(|pool| pool.capacity)
                        .sum::<u32>()
                        .max(self.size_for_new_query_pools)
                        .min(QUERY_SET_MAX_QUERIES),
                    &self.device,
                ))
            };

            let pair = Self::try_reserve_query_pair(&new_pool)
                .expect("Freshly reserved pool doesn't have enough capacity");
            query_pools.used_pools.push(new_pool);

            pair
        }
    }

    #[track_caller]
    #[must_use]
    fn begin_query_internal<Recorder: ProfilerCommandRecorder>(
        &self,
        label: String,
        is_for_pass_timestamp_writes: bool,
        encoder_or_pass: &mut Recorder,
    ) -> GpuProfilerQuery {
        // Give opening/closing queries acquire/release semantics:
        // This way, we won't get any nasty surprises when observing zero open queries.
        self.num_open_queries.fetch_add(1, Ordering::Acquire);

        let query = if self.settings.enable_timer_queries
            && timestamp_query_support(
                is_for_pass_timestamp_writes,
                encoder_or_pass,
                self.device.features(),
            ) {
            Some(self.reserve_query_pair())
        } else {
            None
        };

        let _tracy_scope = if self.settings.enable_timer_queries {
            #[cfg(feature = "tracy")]
            {
                let location = std::panic::Location::caller();
                self.tracy_context.as_ref().and_then(|c| {
                    c.span_alloc(&label, "", location.file(), location.line())
                        .ok()
                })
            }
            #[cfg(not(feature = "tracy"))]
            Option::<()>::None
        } else {
            None
        };

        let pid = if cfg!(target_arch = "wasm32") {
            0
        } else {
            std::process::id()
        };

        GpuProfilerQuery {
            label,
            pid,
            tid: std::thread::current().id(),
            timer_query_pair: query,
            handle: self.next_scope_tree_handle(),
            parent_handle: ROOT_QUERY_HANDLE,
            has_debug_group: false,
            #[cfg(feature = "tracy")]
            tracy_scope: _tracy_scope,
        }
    }

    fn process_timings_recursive(
        timestamp_to_sec: f64,
        closed_scope_by_parent_handle: &mut HashMap<GpuTimerQueryTreeHandle, Vec<GpuProfilerQuery>>,
        parent_handle: GpuTimerQueryTreeHandle,
    ) -> Vec<GpuTimerQueryResult> {
        let Some(queries_with_same_parent) = closed_scope_by_parent_handle.remove(&parent_handle)
        else {
            return Vec::new();
        };

        queries_with_same_parent
            .into_iter()
            .map(|mut scope| {
                // Note that inactive queries may still have nested queries; it's therefore important that we process all of them.
                // In particular, this happens if only `wgpu::Features::TIMESTAMP_QUERY` is enabled and `timestamp_writes`
                // on passes are nested inside inactive encoder timer queries.
                let time_raw = scope.timer_query_pair.take().map(|query| {
                    // Read the timestamps from the buffer.
                    // By design, the timestamps for start/end are consecutive.
                    let offset = (query.start_query_idx * wgpu::QUERY_SIZE) as u64;
                    let buffer_slice = &query
                        .pool
                        .read_buffer
                        .slice(offset..(offset + (wgpu::QUERY_SIZE * 2) as u64))
                        .get_mapped_range();
                    let start_raw = u64::from_le_bytes(
                        buffer_slice[0..wgpu::QUERY_SIZE as usize]
                            .try_into()
                            .unwrap(),
                    );
                    let end_raw = u64::from_le_bytes(
                        buffer_slice[wgpu::QUERY_SIZE as usize..(wgpu::QUERY_SIZE as usize) * 2]
                            .try_into()
                            .unwrap(),
                    );

                    start_raw..end_raw
                });

                let time = time_raw.as_ref().map(|time_raw| {
                    (time_raw.start as f64 * timestamp_to_sec)
                        ..(time_raw.end as f64 * timestamp_to_sec)
                });

                #[cfg(feature = "tracy")]
                if let (Some(tracy_scope), Some(time_raw)) = (&scope.tracy_scope, &time_raw) {
                    tracy_scope.upload_timestamp_start(time_raw.start as i64);
                }

                let nested_queries = Self::process_timings_recursive(
                    timestamp_to_sec,
                    closed_scope_by_parent_handle,
                    scope.handle,
                );

                #[cfg(feature = "tracy")]
                if let (Some(tracy_scope), Some(time_raw)) = (&scope.tracy_scope, time_raw) {
                    tracy_scope.upload_timestamp_end(time_raw.end as i64);
                }

                GpuTimerQueryResult {
                    label: std::mem::take(&mut scope.label),
                    time,
                    nested_queries,
                    pid: scope.pid,
                    tid: scope.tid,
                }
            })
            .collect::<Vec<_>>()
    }
}

#[derive(PartialEq, Eq)]
pub enum QueryPairUsageState {
    /// Transitional state used upon creation.
    Reserved,

    /// Don't do manual timestamp writes; wgpu is expected to do them for us.
    ReservedForPassTimestampWrites,

    /// The start query has been used, the end query is still available.
    OnlyStartWritten,

    /// Both start & end query have been used.
    BothStartAndEndWritten,
}

pub struct ReservedTimerQueryPair {
    /// [`QueryPool`] on which both start & end queries of the scope are done.
    ///
    /// By putting an arc here instead of an index into a vec, we don't need
    /// to take any locks upon closing a profiling scope.
    pub pool: Arc<QueryPool>,

    /// Query index at which the scope begins.
    /// The query after this is reserved for the end of the scope.
    pub start_query_idx: u32,

    /// Current use of the query pair.
    pub usage_state: QueryPairUsageState,
}

/// A pool of queries, consisting of a single query set & buffers for query results.
#[derive(Debug)]
pub struct QueryPool {
    pub query_set: wgpu::QuerySet,

    resolve_buffer: wgpu::Buffer,
    read_buffer: wgpu::Buffer,

    capacity: u32,
    num_used_queries: AtomicU32,
    num_resolved_queries: AtomicU32,
}

impl QueryPool {
    const MIN_CAPACITY: u32 = 32;

    fn new(capacity: u32, device: &wgpu::Device) -> Self {
        QueryPool {
            query_set: device.create_query_set(&wgpu::QuerySetDescriptor {
                label: Some("GpuProfiler - Query Set"),
                ty: wgpu::QueryType::Timestamp,
                count: capacity,
            }),

            resolve_buffer: device.create_buffer(&wgpu::BufferDescriptor {
                label: Some("GpuProfiler - Query Resolve Buffer"),
                size: (wgpu::QUERY_SIZE * capacity) as u64,
                usage: wgpu::BufferUsages::QUERY_RESOLVE | wgpu::BufferUsages::COPY_SRC,
                mapped_at_creation: false,
            }),

            read_buffer: device.create_buffer(&wgpu::BufferDescriptor {
                label: Some("GpuProfiler - Query Read Buffer"),
                size: (wgpu::QUERY_SIZE * capacity) as u64,
                usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
                mapped_at_creation: false,
            }),

            capacity,
            num_used_queries: AtomicU32::new(0),
            num_resolved_queries: AtomicU32::new(0),
        }
    }

    fn reset(&mut self) {
        self.num_used_queries = AtomicU32::new(0);
        self.num_resolved_queries = AtomicU32::new(0);
        self.read_buffer.unmap();
    }
}

#[derive(Default)]
struct PendingFramePools {
    /// List of all pools used in this frame.
    /// The last pool is the one new profiling queries will try to make timer queries into.
    used_pools: Vec<Arc<QueryPool>>,

    /// List of unused pools recycled from previous frames.
    unused_pools: Vec<QueryPool>,
}

/// Internal handle for building a tree of profiling queries.
pub type GpuTimerQueryTreeHandle = u32;

/// Handle for the root scope.
pub const ROOT_QUERY_HANDLE: GpuTimerQueryTreeHandle = u32::MAX;

struct ActiveFrame {
    query_pools: RwLock<PendingFramePools>,

    /// Closed queries get sent to this channel.
    ///
    /// Note that a channel is still overkill for what we want here:
    /// We're in a multi-producer situation, *but* the single consumer is known to be only
    /// active in a mut context, i.e. while we're consuming we know that we're not producing.
    /// We have to wrap it in a Mutex because the channel is not Sync, but we actually never lock it
    /// since we only ever access it in a `mut` context.
    closed_query_sender: std::sync::mpsc::Sender<GpuProfilerQuery>,
    closed_query_receiver: Mutex<std::sync::mpsc::Receiver<GpuProfilerQuery>>,
}

struct PendingFrame {
    query_pools: Vec<Arc<QueryPool>>,
    closed_query_by_parent_handle: HashMap<GpuTimerQueryTreeHandle, Vec<GpuProfilerQuery>>,

    /// Keeps track of the number of buffers in the query pool that have been mapped successfully.
    mapped_buffers: std::sync::Arc<std::sync::atomic::AtomicU32>,
}