prgpu 0.1.8

GPU-accelerated rendering utilities for Adobe Premiere Pro and After Effects plugins
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
//! DX-friendly criterion harness for benchmarking CPU kernels.
//!
//! # Why this exists
//!
//! `prgpu::bench` gives kernel developers a **worry-less** way to wire a
//! `.vekl`-backed CPU kernel into a `criterion` benchmark without touching any
//! After Effects / Premiere plumbing.
//!
//! It provides:
//! - [`PixelFormat`] / [`Resolution`] enums with sensible defaults
//! - [`Scene`] — owns synthetic input / output / auxiliary buffers and knows how
//!   to build a [`Configuration`] for a single-pass dispatch
//! - [`KernelBenchmark`] — fluent builder that runs a full
//!   `(resolutions × formats)` sweep against any [`CpuDispatchFn`]
//! - Re-exports of the `criterion` entry points so a bench file only needs to
//!   import `prgpu::bench::*`
//!
//! # Minimal example
//!
//! ```ignore
//! use prgpu::bench::*;
//!
//! fn bench_my_kernel(c: &mut Criterion) {
//!     KernelBenchmark::new(
//!         "my_kernel",
//!         my_kernel::MY_KERNEL_CPU_DISPATCH,
//!         MyParams::default(),
//!     )
//!     .resolutions(&[Resolution::HD1080, Resolution::UHD4K])
//!     .formats(&[PixelFormat::Bgra8, PixelFormat::Bgra32f])
//!     .run(c);
//! }
//!
//! criterion_group!(benches, bench_my_kernel);
//! criterion_main!(benches);
//! ```
//!
//! # Multi-pass kernels
//!
//! For kernels that need an intermediate buffer (e.g. separable blur), build
//! the [`Configuration`]s yourself with [`Scene::input_ptr`], [`Scene::aux_ptr`],
//! [`Scene::output_ptr`], and drive them via [`Scene::dispatch_with`] inside a
//! criterion `b.iter(...)` closure. See the `vignette/benches/` example.
//!
//! # Statistical rules of thumb (defaults)
//!
//! - `sample_size = 30`  (enough for CV < 3 % on stable workloads)
//! - `measurement_time = 5 s`
//! - `warm_up_time = 1 s`
//! - Throughput is reported as `Throughput::Elements(width * height)` so the
//!   "Mpx/s" figure is directly comparable across resolutions.

#![cfg(feature = "bench")]

use std::ffi::c_void;
use std::time::Duration;

pub use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};

use crate::cpu::render::{render_cpu_direct, CpuDispatchTileFn};
use crate::types::Configuration;

// ---------------------------------------------------------------------------
// PixelFormat
// ---------------------------------------------------------------------------

/// Pixel format of the synthetic buffers used by the benchmark.
///
/// The `layout_id` is fixed to BGRA (`1`) because that is what both AE and
/// Premiere hand to CPU kernels in practice.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PixelFormat {
	/// 8-bit BGRA, `bpp = 4`.
	Bgra8,
	/// 16-bit BGRA, `bpp = 8`. Semantically matches AE's U15 channel range.
	Bgra16,
	/// 32-bit float BGRA, `bpp = 16`. Matches AE's `F32` world type.
	Bgra32f,
}

impl PixelFormat {
	/// Bytes per pixel as seen by the kernel.
	pub const fn bpp(self) -> u32 {
		match self {
			Self::Bgra8 => 4,
			Self::Bgra16 => 8,
			Self::Bgra32f => 16,
		}
	}

	/// Layout tag passed in `FrameParams::pixel_layout`. BGRA (`1`) in all
	/// current cases.
	pub const fn layout_id(self) -> u32 {
		1
	}

	/// Whether the backing format is half-precision (AE's `U15`, mapped to
	/// `is16f` in [`Configuration::cpu`]).
	pub const fn is16f(self) -> bool {
		matches!(self, Self::Bgra16)
	}

	/// Short human-readable tag (used in bench IDs).
	pub const fn label(self) -> &'static str {
		match self {
			Self::Bgra8 => "bgra8",
			Self::Bgra16 => "bgra16",
			Self::Bgra32f => "bgra32f",
		}
	}

	/// All three formats, in the order `[u8, u16, f32]`.
	pub const ALL: &'static [PixelFormat] = &[Self::Bgra8, Self::Bgra16, Self::Bgra32f];
}

// ---------------------------------------------------------------------------
// Resolution
// ---------------------------------------------------------------------------

/// Output resolution of the synthetic scene.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Resolution {
	/// 1280×720
	HD720,
	/// 1920×1080
	HD1080,
	/// 3840×2160
	UHD4K,
	/// Arbitrary (`width`, `height`).
	Custom(u32, u32),
}

impl Resolution {
	pub const fn dims(self) -> (u32, u32) {
		match self {
			Self::HD720 => (1280, 720),
			Self::HD1080 => (1920, 1080),
			Self::UHD4K => (3840, 2160),
			Self::Custom(w, h) => (w, h),
		}
	}

	/// Short human-readable tag (used in bench IDs).
	pub fn label(self) -> String {
		let (w, h) = self.dims();
		format!("{w}x{h}")
	}

	/// Convenience set for quick sweeps: `[HD720, HD1080, UHD4K]`.
	pub const COMMON: &'static [Resolution] = &[Self::HD720, Self::HD1080, Self::UHD4K];
}

// ---------------------------------------------------------------------------
// Scene
// ---------------------------------------------------------------------------

/// Extra bytes appended past the nominal end of every bench buffer, matching
/// the `ALLOC_GUARD_BYTES` semantics in [`crate::cpu::buffer`]. Prevents heap
/// corruption if a kernel off-by-ones into the next allocation.
const ALLOC_GUARD_BYTES: usize = 64;

/// Synthetic frame scene. Owns:
///
/// - an **input** buffer (`width × height × bpp` bytes), deterministically
///   pre-filled
/// - an **output** buffer (same geometry)
/// - zero or more **auxiliary** buffers for multi-pass kernels (e.g. the
///   intermediate of a separable blur)
///
/// All buffers are tightly packed — `pitch_px == width`.
///
/// `Scene` is the canonical "workspace" passed to a benchmarked kernel. It
/// knows how to hand out raw pointers and build [`Configuration`]s for the
/// common single-pass case.
pub struct Scene {
	pub width: u32,
	pub height: u32,
	pub format: PixelFormat,
	/// Pitch in pixels. Tightly packed, equal to `width`.
	pub pitch_px: i32,
	input: Vec<u8>,
	output: Vec<u8>,
	aux: Vec<AuxBuffer>,
}

struct AuxBuffer {
	width: u32,
	height: u32,
	data: Vec<u8>,
}

impl Scene {
	/// Allocate a new scene with the given resolution and pixel format.
	///
	/// The input buffer is pre-filled with a deterministic, non-trivial
	/// pattern to avoid denormal floats and all-zero branches skewing the
	/// measurement.
	pub fn new(resolution: Resolution, format: PixelFormat) -> Self {
		let (width, height) = resolution.dims();
		let bpp = format.bpp() as usize;
		let byte_len = (width as usize) * (height as usize) * bpp;

		let mut input = vec![0u8; byte_len + ALLOC_GUARD_BYTES];
		let mut output = vec![0u8; byte_len + ALLOC_GUARD_BYTES];

		fill_pattern(&mut input[..byte_len], format, width);
		// Pre-touch output pages so the first iteration isn't skewed by
		// page-fault / first-touch costs.
		fill_pattern(&mut output[..byte_len], format, width);

		Self {
			width,
			height,
			format,
			pitch_px: width as i32,
			input,
			output,
			aux: Vec::new(),
		}
	}

	/// Allocate an auxiliary buffer with the given dimensions (tight pitch).
	/// Returns the aux index usable with [`Scene::aux_ptr`] / [`Scene::aux_dims`].
	pub fn alloc_aux(&mut self, width: u32, height: u32) -> usize {
		let bpp = self.format.bpp() as usize;
		let byte_len = (width as usize) * (height as usize) * bpp;
		let mut data = vec![0u8; byte_len + ALLOC_GUARD_BYTES];
		fill_pattern(&mut data[..byte_len], self.format, width);
		let idx = self.aux.len();
		self.aux.push(AuxBuffer { width, height, data });
		idx
	}

	/// Raw pointer to the input buffer (bench-scope lifetime).
	pub fn input_ptr(&self) -> *mut c_void {
		self.input.as_ptr() as *mut c_void
	}

	/// Raw pointer to the output buffer (bench-scope lifetime).
	pub fn output_ptr(&mut self) -> *mut c_void {
		self.output.as_mut_ptr() as *mut c_void
	}

	/// Raw pointer to an auxiliary buffer.
	pub fn aux_ptr(&mut self, idx: usize) -> *mut c_void {
		self.aux[idx].data.as_mut_ptr() as *mut c_void
	}

	/// `(width, height)` of an auxiliary buffer.
	pub fn aux_dims(&self, idx: usize) -> (u32, u32) {
		let a = &self.aux[idx];
		(a.width, a.height)
	}

	/// Build a single-pass `Configuration`: `input → output`, matching sizes,
	/// tight pitch. `time` defaults to 0.0 (bench has no timeline context).
	pub fn simple_config(&mut self) -> Configuration {
		let ptr_in = self.input.as_ptr() as *mut c_void;
		let ptr_out = self.output.as_mut_ptr() as *mut c_void;
		Configuration::cpu(
			ptr_in,
			ptr_out,
			self.pitch_px,
			self.pitch_px,
			self.width,
			self.height,
			self.format.bpp(),
			self.format.layout_id(),
		)
	}

	/// Run a single-pass dispatch (`input → output`) on this scene using
	/// the tile dispatcher (one FFI call per rayon chunk).
	pub fn dispatch_simple<P: Copy + Sync>(&mut self, dispatch_tile_fn: CpuDispatchTileFn, params: &P) {
		let cfg = self.simple_config();
		// SAFETY: pointers in `cfg` come from `self` and live for the call;
		// the dispatch function only reads `input` and writes `output`.
		unsafe { render_cpu_direct("bench", &cfg, dispatch_tile_fn, params) };
	}

	/// Run a kernel with a user-supplied [`Configuration`] (multi-pass scenario).
	///
	/// # Safety
	/// `cfg`'s buffer pointers must remain valid for the duration of the call;
	/// typically obtained from [`Scene::input_ptr`] / [`Scene::output_ptr`] /
	/// [`Scene::aux_ptr`] on this same [`Scene`].
	pub fn dispatch_with<P: Copy + Sync>(&self, cfg: &Configuration, dispatch_tile_fn: CpuDispatchTileFn, params: &P) {
		unsafe { render_cpu_direct("bench", cfg, dispatch_tile_fn, params) };
	}
}

/// Deterministic fill: varies by x/y and format, avoids all-zero / denormal
/// regions. Does not use `rand` on purpose (no extra dep, fully reproducible).
fn fill_pattern(buf: &mut [u8], format: PixelFormat, _width: u32) {
	match format {
		PixelFormat::Bgra8 => {
			for (i, b) in buf.iter_mut().enumerate() {
				// Simple LCG-ish pattern, range [1, 255].
				let v = (i.wrapping_mul(2654435761) >> 24) as u8;
				*b = v.max(1);
			}
		}
		PixelFormat::Bgra16 => {
			let words = unsafe { std::slice::from_raw_parts_mut(buf.as_mut_ptr() as *mut u16, buf.len() / 2) };
			for (i, w) in words.iter_mut().enumerate() {
				let v = (i.wrapping_mul(2654435761) >> 16) as u16;
				*w = v.max(1);
			}
		}
		PixelFormat::Bgra32f => {
			let floats = unsafe { std::slice::from_raw_parts_mut(buf.as_mut_ptr() as *mut f32, buf.len() / 4) };
			for (i, f) in floats.iter_mut().enumerate() {
				// Produce values in [0.05, 0.95] to avoid denormals and saturation.
				let x = ((i.wrapping_mul(2654435761) >> 8) & 0xFFFF) as f32 / 65535.0;
				*f = 0.05 + 0.90 * x;
			}
		}
	}
}

// ---------------------------------------------------------------------------
// KernelBenchmark builder
// ---------------------------------------------------------------------------

/// Fluent builder that wires a CPU kernel into a `criterion` sweep.
///
/// Designed so a kernel developer can get a statistically-meaningful benchmark
/// with ~5 lines. See the module docs for the canonical example.
///
/// Default configuration:
/// - sweep over `[Resolution::HD1080]` × `[PixelFormat::Bgra8]`
/// - `sample_size = 30`, `measurement_time = 5 s`, `warm_up_time = 1 s`
///
/// The throughput is automatically reported as `Throughput::Elements(w*h)`
/// so criterion prints `Mpx/s`.
/// Signature for the per-combo customization hook passed to
/// [`KernelBenchmark::customize`]. A plain `fn` pointer is used (rather than
/// a generic `Fn` trait bound) so that closure argument types are inferred
/// directly from this concrete signature — users can write
/// `|p, res, fmt| { ... }` without annotating the parameters.
pub type CustomizeFn<P> = fn(&mut P, Resolution, PixelFormat);

pub struct KernelBenchmark<P: Copy + Sync + 'static> {
	name: String,
	dispatch_fn: CpuDispatchTileFn,
	user_params: P,
	resolutions: Vec<Resolution>,
	formats: Vec<PixelFormat>,
	customize: Option<CustomizeFn<P>>,
	sample_size: usize,
	measurement_time: Duration,
	warm_up_time: Duration,
}

impl<P: Copy + Sync + 'static> KernelBenchmark<P> {
	/// Create a new benchmark for `dispatch_fn`. `name` becomes the criterion
	/// **group** name; individual benches within the group are identified by
	/// the `(format, resolution)` tuple.
	pub fn new(name: impl Into<String>, dispatch_fn: CpuDispatchTileFn, user_params: P) -> Self {
		Self {
			name: name.into(),
			dispatch_fn,
			user_params,
			resolutions: vec![Resolution::HD1080],
			formats: vec![PixelFormat::Bgra8],
			customize: None,
			sample_size: 30,
			measurement_time: Duration::from_secs(5),
			warm_up_time: Duration::from_secs(1),
		}
	}

	/// Override the resolution sweep.
	pub fn resolutions(mut self, v: &[Resolution]) -> Self {
		self.resolutions = v.to_vec();
		self
	}

	/// Override the pixel-format sweep.
	pub fn formats(mut self, v: &[PixelFormat]) -> Self {
		self.formats = v.to_vec();
		self
	}

	/// Per-combo customization of the user params.
	///
	/// Useful when some parameter depends on the scene — e.g. a blur radius
	/// that must scale with resolution, or a per-format clamp. The hook
	/// receives a mutable reference to a **copy** of the base params, so it
	/// cannot affect other bench points.
	///
	/// Uses a concrete `fn` pointer (see [`CustomizeFn`]) so closures without
	/// captures infer their argument types automatically — you can write
	/// `|p, res, fmt| { ... }` with no type annotations.
	pub fn customize(mut self, f: CustomizeFn<P>) -> Self {
		self.customize = Some(f);
		self
	}

	/// Override criterion's sample size (default: 30).
	pub fn sample_size(mut self, n: usize) -> Self {
		self.sample_size = n;
		self
	}

	/// Override criterion's measurement time (default: 5 s).
	pub fn measurement_time(mut self, d: Duration) -> Self {
		self.measurement_time = d;
		self
	}

	/// Override criterion's warm-up time (default: 1 s).
	pub fn warm_up_time(mut self, d: Duration) -> Self {
		self.warm_up_time = d;
		self
	}

	/// Execute the full sweep. Must be called from a `#[bench]`-free binary
	/// using the `criterion_group!` / `criterion_main!` pair.
	pub fn run(self, c: &mut Criterion) {
		let dispatch_fn = self.dispatch_fn;
		let base_params = self.user_params;
		let customize = self.customize;
		let sample_size = self.sample_size;
		let m_time = self.measurement_time;
		let w_time = self.warm_up_time;

		let mut group = c.benchmark_group(&self.name);
		group.sample_size(sample_size);
		group.measurement_time(m_time);
		group.warm_up_time(w_time);

		for &fmt in &self.formats {
			for &res in &self.resolutions {
				let (w, h) = res.dims();
				let pixels = (w as u64) * (h as u64);
				group.throughput(Throughput::Elements(pixels));

				let mut params = base_params;
				if let Some(f) = customize {
					f(&mut params, res, fmt);
				}

				let id = BenchmarkId::new(fmt.label(), res.label());
				group.bench_function(id, move |b| {
					let mut scene = Scene::new(res, fmt);
					b.iter(|| {
						scene.dispatch_simple(dispatch_fn, &params);
						// Prevent the optimizer from eliminating the write.
						std::hint::black_box(scene.output_ptr());
					});
				});
			}
		}

		group.finish();
	}
}