xlog-cuda 0.9.2

CUDA kernel provider, buffers, and interop for XLOG
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
//! Core [`DeviceMemoryResource`] trait and supporting types.
//!
//! Mirrors RMM's `device_memory_resource` shape so a future optional
//! RMM backend can satisfy the same trait without requiring callers to
//! change. Stream-ordered: every alloc/dealloc names a stream; cross-
//! stream reuse requires explicit event-based synchronization.

use std::fmt;
use std::sync::atomic::{AtomicU64, Ordering};

/// Names one CUDA stream that belongs to the runtime's stream pool.
///
/// A resource uses the id to pick the stream on which it orders its
/// `cuMemAllocAsync` / `cuMemFreeAsync` calls. Ids are handed out by
/// the runtime; callers must not fabricate their own.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct StreamId(pub u32);

impl StreamId {
    /// Fallback pool stream (id `0`) for tests and synchronous code
    /// paths that have no stream context of their own. Production
    /// callers are expected to pass the real stream coming from the
    /// executor or kernel-launch site.
    pub const DEFAULT: StreamId = Self(0);
}

/// Label a caller attaches to an allocation so that it can be
/// recognized in allocation log lines. The payload is a
/// `&'static str`, which keeps the tag `Copy`.
///
/// NOTE(review): earlier wording stated that short-lived strings are
/// interned by the logging resource and long-lived borrows are not
/// retained. That behavior lives outside this type — confirm against
/// the logging resource.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct AllocTag(pub &'static str);

impl AllocTag {
    /// Tag whose text is `"untagged"`; presumably the fallback for
    /// call sites that have no more specific label.
    pub const UNTAGGED: AllocTag = Self("untagged");
}

/// Ever-growing stamp that tells apart successive allocations which
/// happen to land on the same byte address after a drop / reallocate
/// cycle. Logging and debug-guard resources rely on it to detect
/// use-after-free.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Generation(pub u64);

// Shared source of generation numbers; the first value issued is 1.
static GENERATION_COUNTER: AtomicU64 = AtomicU64::new(1);

impl Generation {
    /// Issue the next generation number.
    ///
    /// Every call performs a single atomic `fetch_add`, so the values
    /// increase monotonically and concurrent callers never receive
    /// the same number. `Relaxed` ordering is used: only the counter
    /// value itself is communicated, no other memory is published
    /// through it.
    pub fn next() -> Generation {
        let issued = GENERATION_COUNTER.fetch_add(1, Ordering::Relaxed);
        Generation(issued)
    }
}

/// How one piece of GPU work touches a block. The kind decides which
/// cross-stream dependency edges the resource queues in
/// [`DeviceMemoryResource::prepare_block_use`] and which events it
/// records in [`DeviceMemoryResource::finish_block_use`].
///
///   * [`Access::Read`] — the work only consumes the block's bytes.
///     It has to wait for any earlier write issued on a different
///     stream. Its event joins the block's outstanding-reads list
///     so that later writers, and the eventual deallocate, can
///     wait on it.
///   * [`Access::Write`] — the work unconditionally overwrites the
///     block's bytes. It has to wait for the block's previous write
///     AND for every outstanding read on a different stream. Its
///     event becomes the block's new last-write event, and the
///     outstanding-reads list is emptied at finish time.
///   * [`Access::ReadWrite`] — both at once. The wait set matches
///     `Write`, and its event likewise replaces last-write.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Access {
    Read,
    Write,
    ReadWrite,
}

impl Access {
    /// `true` when work of this kind consumes the block's bytes.
    pub fn reads(self) -> bool {
        match self {
            Access::Read | Access::ReadWrite => true,
            Access::Write => false,
        }
    }

    /// `true` when work of this kind overwrites the block's bytes.
    pub fn writes(self) -> bool {
        match self {
            Access::Write | Access::ReadWrite => true,
            Access::Read => false,
        }
    }
}

/// Small, copyable identity of a [`DeviceBlock`], meant to be stored
/// in structures that must not stay tied to the borrow of the block
/// it was taken from. It keeps exactly the fields needed to check
/// `(ptr, generation)` against the resource's live map and to look
/// up `alloc_stream` for cross-stream waits / dealloc ordering.
///
/// Build one with [`BlockId::from_block`]. It is plain data: no
/// resource handle, no `Drop`, cheap to copy.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct BlockId {
    pub ptr: u64,
    pub generation: Generation,
    pub alloc_stream: StreamId,
    pub device_ordinal: u32,
}

impl BlockId {
    /// Capture the identity of `block`.
    ///
    /// The four identity fields are copied out, so the returned id
    /// does not borrow from `block`. An id can therefore outlive the
    /// allocation it describes; the runtime's generation guard is
    /// what catches such stale ids once the backing allocation has
    /// been recycled.
    pub fn from_block(block: &DeviceBlock) -> Self {
        let ptr = block.ptr;
        let generation = block.generation;
        let alloc_stream = block.alloc_stream;
        let device_ordinal = block.device_ordinal;
        BlockId {
            ptr,
            generation,
            alloc_stream,
            device_ordinal,
        }
    }
}

/// Lifecycle state of an outstanding [`DeviceBlock`] from the runtime's
/// perspective; stored in the block's `state` field. Adaptors flip
/// blocks between these states; bug-detection resources reject
/// operations on blocks in an unexpected state.
///
/// Per the [`DeviceMemoryResource`] contract, `allocate` returns
/// blocks as [`BlockState::Live`], and `deallocate` leaves them
/// [`BlockState::Retired`] (or [`BlockState::Quarantined`] for
/// debug-guard resources).
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum BlockState {
    /// Returned from `allocate`; safe to read/write on `alloc_stream`
    /// or after a synchronization to another stream.
    Live,
    /// Returned from `deallocate` but still pending kernel completion
    /// on its owning stream. Reuse must wait for stream sync.
    Retired,
    /// Held by `DebugGuardResource` for delayed reuse / canary
    /// validation. Not reissued until the quarantine window passes.
    Quarantined,
    /// Memory has been physically freed. Any further use is a bug.
    Freed,
}

/// One outstanding device-memory allocation. Owned by the caller until
/// returned to its originating resource via
/// [`DeviceMemoryResource::deallocate`], which takes the block by
/// value. The type derives only `Debug` (no `Clone` / `Copy`).
///
/// Carries the metadata required for stream-ordered correctness and
/// post-mortem debugging: the raw device pointer, the device ordinal,
/// the stream the allocation is bound to, byte size, alignment,
/// caller tag, generation number, and current state. It holds no
/// handle to the owning resource, so the caller is responsible for
/// handing the block back to the resource that produced it.
#[derive(Debug)]
pub struct DeviceBlock {
    /// Raw device pointer, stored as a `u64` address (opaque to safe
    /// Rust callers).
    pub ptr: u64,
    /// CUDA ordinal of the device this block lives on.
    pub device_ordinal: u32,
    /// Allocation stream. Reads/writes on a different stream require
    /// explicit synchronization (event wait or device sync).
    pub alloc_stream: StreamId,
    /// Size in bytes. May exceed the caller-requested size when the
    /// resource rounds up for alignment or pool granularity.
    pub bytes: usize,
    /// Alignment in bytes (always ≥ caller request).
    /// NOTE(review): `DeviceMemoryResource::allocate` takes no
    /// alignment argument, so "caller request" presumably means an
    /// implicit minimum alignment — confirm.
    pub align: usize,
    /// Caller-supplied tag, surfaced in allocation logs.
    pub tag: AllocTag,
    /// Monotonic generation. Reused addresses get fresh generations.
    pub generation: Generation,
    /// Current state. Adaptors transition this; tests assert on it.
    pub state: BlockState,
}

/// Failure modes reported by resource implementations. Each case the
/// stress tests need to pin down (budget refusal vs CUDA driver
/// failure vs use-after-free etc.) gets its own variant.
#[derive(Debug)]
pub enum ResourceError {
    /// Granting the request would push the resource past its budget.
    /// Both the requested byte count and the budget still remaining
    /// are carried so tests can assert a deterministic refusal.
    OutOfBudget { requested: usize, remaining: usize },
    /// The CUDA driver reported an error; the payload is the wrapped
    /// message.
    Driver(String),
    /// The stream-ordered contract was broken (for example a dealloc
    /// on a stream other than the alloc stream with no sync in
    /// between).
    StreamMisuse(String),
    /// A debug-guard or logging adaptor caught a use-after-free or a
    /// double-free. Hard error in debug builds; surfaced upward.
    UseAfterFree { generation: Generation },
    /// A debug-guard adaptor caught a write that ran past a canary
    /// boundary.
    OutOfBounds { generation: Generation },
}

impl fmt::Display for ResourceError {
    /// Render a one-line, human-readable description of the error.
    /// Generations are printed with their `Debug` form.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            ResourceError::Driver(detail) => write!(f, "CUDA driver error: {}", detail),
            ResourceError::StreamMisuse(detail) => {
                write!(f, "stream-ordered contract violated: {}", detail)
            }
            ResourceError::OutOfBounds { generation: stamp } => {
                write!(f, "out-of-bounds write on generation {:?}", stamp)
            }
            ResourceError::UseAfterFree { generation: stamp } => {
                write!(f, "use-after-free on generation {:?}", stamp)
            }
            ResourceError::OutOfBudget {
                requested: wanted,
                remaining: left,
            } => write!(
                f,
                "out of budget: requested {} bytes, remaining {} bytes",
                wanted, left
            ),
        }
    }
}

// No `source()` override: none of the variants wraps another error value.
impl std::error::Error for ResourceError {}

/// Shorthand for a `Result` whose error type is [`ResourceError`].
pub type ResourceResult<T> = std::result::Result<T, ResourceError>;

/// Stream-ordered allocator interface for device memory.
///
/// Implementations and decorators:
///   * [`crate::device_runtime::direct::DirectCudaResource`] —
///     the non-pooled cudarc default backend. It is a **candidate**
///     for the sanitizer/cert role but stays **unproven** until the
///     M1 manual gate has run on a Compute-Sanitizer-supported host.
///   * [`crate::device_runtime::async_resource::AsyncCudaResource`] —
///     backend built on stream-ordered
///     cuMemAllocAsync/cuMemFreeAsync; the production default
///     whenever the context supports async-alloc.
///   * [`crate::device_runtime::logging::LoggingResource`] —
///     decorator that adds telemetry around any inner resource.
///   * [`crate::device_runtime::budget::GlobalDeviceBudget`] —
///     decorator that enforces a per-runtime byte limit around any
///     inner resource.
///   * `PoolResource` — performance tier; v0.7+ (not implemented).
///   * `DebugGuardResource` — canary/poison/quarantine; v0.7+
///     (not implemented).
///
/// Every implementation must be thread-safe (the trait itself
/// requires `Send + Sync`). The runtime builds resource stacks by
/// decoration: each layer wraps an inner `Box<dyn
/// DeviceMemoryResource + Send + Sync>`.
pub trait DeviceMemoryResource: Send + Sync {
    /// Allocate `bytes` bytes on this resource's device, with the
    /// allocation ordered on `stream` and labelled with the caller's
    /// `tag`. On success the block is in [`BlockState::Live`].
    fn allocate(
        &self,
        bytes: usize,
        stream: StreamId,
        tag: AllocTag,
    ) -> ResourceResult<DeviceBlock>;

    /// Hand `block` back to the resource. Afterwards the block counts
    /// as [`BlockState::Retired`] — or [`BlockState::Quarantined`]
    /// when a debug-guard resource is involved. When the underlying
    /// memory gets reused is up to the resource, as long as the
    /// stream-ordered contract is honored.
    ///
    /// Ordering is governed by `block.alloc_stream`. A caller that
    /// touched the memory on any other stream must have synchronized
    /// before calling `deallocate`.
    fn deallocate(&self, block: DeviceBlock) -> ResourceResult<()>;

    /// Ordinal of the CUDA device this resource serves. A resource
    /// is pinned to exactly one device.
    fn device_ordinal(&self) -> u32;

    /// Number of bytes still outstanding, counting live blocks as
    /// well as retired blocks that have not been freed yet. Consumed
    /// by tests and by the global budget adaptor.
    fn bytes_outstanding(&self) -> usize;

    /// Release retired-but-not-yet-freed bytes whose underlying CUDA
    /// work has completed.
    ///
    /// The default does nothing and returns `Ok(())`, which is the
    /// intended behavior for synchronous backends. Stream-ordered
    /// async backends synchronize the streams that hold queued
    /// `cuMemFreeAsync` calls and re-count `bytes_outstanding`
    /// accordingly.
    ///
    /// Call this before reading `bytes_outstanding` whenever an
    /// accurate budget reading is needed right after a burst of
    /// asynchronous deallocations. On a synchronous backend the call
    /// is harmless and free.
    fn reap_pending(&self) -> ResourceResult<()> {
        Ok(())
    }

    /// Note that work touching `block`'s bytes has been (or is being)
    /// submitted on `use_stream`.
    ///
    /// A resource that takes part in cross-stream lifetime tracking
    /// (notably the stream-ordered async backend) MUST attach a CUDA
    /// event from `use_stream` to the block. On `deallocate(block)`
    /// the block's `alloc_stream` then waits on every event recorded
    /// this way before the underlying free is queued.
    ///
    /// **By default this returns [`ResourceError::StreamMisuse`].**
    /// The refusal is deliberate. With a silent no-op default, a
    /// launch builder could call `record_block_use` on a resource
    /// that tracks nothing (e.g.,
    /// [`crate::device_runtime::direct::DirectCudaResource`]),
    /// observe `Ok(())`, queue a kernel on another stream, drop the
    /// block — and quietly run into the very cross-stream
    /// use-after-free this API exists to prevent. False safety is
    /// worse than no safety. A resource that cannot track
    /// cross-stream uses MUST keep this default, and callers
    /// (notably the future xlog launch builder) MUST surface the
    /// error rather than mask it.
    ///
    /// Who overrides it today:
    ///   * [`crate::device_runtime::async_resource::AsyncCudaResource`]
    ///     overrides it with real event tracking.
    ///   * [`crate::device_runtime::logging::LoggingResource`] and
    ///     [`crate::device_runtime::budget::GlobalDeviceBudget`]
    ///     forward to their inner resource, so the behavior of the
    ///     underlying backend surfaces unchanged.
    ///   * [`crate::device_runtime::direct::DirectCudaResource`]
    ///     does NOT override it — it correctly returns
    ///     `StreamMisuse`, which forces callers either to route
    ///     allocations through `AsyncCudaResource` or to take
    ///     responsibility for cross-stream synchronization
    ///     themselves.
    ///
    /// # Errors
    ///   * [`ResourceError::StreamMisuse`] from the default impl,
    ///     when the resource cannot track cross-stream uses.
    ///   * [`ResourceError::UseAfterFree`] when `block` is not the
    ///     block currently live at `block.ptr` (most likely the
    ///     caller passed a stale [`DeviceBlock`] whose generation no
    ///     longer matches the live entry).
    ///   * [`ResourceError::StreamMisuse`] when `use_stream` does
    ///     not resolve in the resource's stream pool.
    ///   * [`ResourceError::Driver`] on CUDA driver / event
    ///     creation failures.
    ///
    /// Callers that go around this API and submit cross-stream work
    /// directly (raw `cuMemcpyDtoHAsync`, raw `Vec<*mut c_void>`
    /// kernel launches the launch builder never saw, etc.) own their
    /// cross-stream synchronization. The resource cannot infer
    /// arbitrary external CUDA work.
    fn record_block_use(&self, block: &DeviceBlock, use_stream: StreamId) -> ResourceResult<()> {
        // Non-tracking default: the arguments are intentionally unused.
        let _ = block;
        let _ = use_stream;
        let refusal = "record_block_use unsupported by this resource (the active backend \
                       does not track cross-stream uses; route allocations through a \
                       stream-ordered backend such as AsyncCudaResource, or take \
                       responsibility for cross-stream synchronization explicitly)"
            .to_owned();
        Err(ResourceError::StreamMisuse(refusal))
    }

    /// Reports whether this resource (together with any inner
    /// resources it composes) really tracks cross-stream uses through
    /// `record_block_use`. The launch recorder's preflight consults
    /// it so that it can fail BEFORE queueing CUDA work instead of
    /// after. The default is `false`, in line with the default
    /// `record_block_use`; a resource that overrides
    /// `record_block_use` to track events MUST also override this to
    /// return `true`. Decorators forward to their inner resource.
    fn supports_block_use_tracking(&self) -> bool {
        false
    }

    /// Pre-launch / pre-copy hook. Queues whatever cross-stream waits
    /// `use_stream` needs before it may safely touch `block` with
    /// `access` semantics. It MUST be called BEFORE the GPU work is
    /// enqueued on `use_stream`.
    ///
    /// In concrete terms: for [`Access::Read`] the resource must
    /// queue `use_stream.wait(&last_write)` when a write on a
    /// different stream is outstanding. For [`Access::Write`] /
    /// [`Access::ReadWrite`] it must additionally queue waits on
    /// every outstanding read recorded on a different stream — the
    /// writer has to observe completion of every earlier reader.
    /// Events from the same stream are skipped (CUDA stream order
    /// already covers them).
    ///
    /// **By default this returns [`ResourceError::StreamMisuse`].**
    /// The reasoning is the same as for `record_block_use`: a silent
    /// no-op default would let callers paired with a non-tracking
    /// backend believe the dependency edge had been queued.
    /// Decorators forward; tracking backends override.
    ///
    /// # Errors
    ///   * [`ResourceError::StreamMisuse`] from the default impl,
    ///     when the resource cannot track cross-stream uses.
    ///   * [`ResourceError::UseAfterFree`] when `block` is not the
    ///     id currently live at `block.ptr`.
    ///   * [`ResourceError::Driver`] on CUDA driver / event-wait
    ///     failures.
    fn prepare_block_use(
        &self,
        block: BlockId,
        use_stream: StreamId,
        access: Access,
    ) -> ResourceResult<()> {
        // Non-tracking default: the arguments are intentionally unused.
        let _ = block;
        let _ = use_stream;
        let _ = access;
        let refusal = "prepare_block_use unsupported by this resource (the active backend \
                       does not track cross-stream uses; route allocations through \
                       AsyncCudaResource or take responsibility for cross-stream \
                       synchronization explicitly)"
            .to_owned();
        Err(ResourceError::StreamMisuse(refusal))
    }

    /// Post-launch / post-copy hook. Records an event on
    /// `use_stream` that captures the work just enqueued and updates
    /// `block`'s dependency state.
    ///
    /// In concrete terms: for [`Access::Read`] the new event is
    /// appended to the block's outstanding-reads list (so later
    /// writers and the eventual deallocate can wait on it). For
    /// [`Access::Write`] / [`Access::ReadWrite`] the new event
    /// **replaces** the block's last-write event and the
    /// outstanding-reads list is cleared (every earlier reader's
    /// dependency was queued at prepare time and is now subsumed by
    /// the new write event).
    ///
    /// **By default this returns [`ResourceError::StreamMisuse`].**
    /// The reasoning is the same as for `record_block_use`.
    /// Decorators forward; tracking backends override.
    fn finish_block_use(
        &self,
        block: BlockId,
        use_stream: StreamId,
        access: Access,
    ) -> ResourceResult<()> {
        // Non-tracking default: the arguments are intentionally unused.
        let _ = block;
        let _ = use_stream;
        let _ = access;
        let refusal = "finish_block_use unsupported by this resource (the active backend \
                       does not track cross-stream uses; route allocations through \
                       AsyncCudaResource or take responsibility for cross-stream \
                       synchronization explicitly)"
            .to_owned();
        Err(ResourceError::StreamMisuse(refusal))
    }
}