// cust/memory/pointer.rs

1use crate::memory::DeviceCopy;
2use cust_raw::CUdeviceptr;
3
4use core::{
5    fmt::{self, Debug, Pointer},
6    hash::Hash,
7    ptr,
8};
9use std::ffi::c_void;
10use std::marker::PhantomData;
11use std::mem::size_of;
12
/// A pointer to device memory.
///
/// `DevicePointer` cannot be dereferenced by the CPU, as it is a pointer to a memory allocation in
/// the device. It can be safely copied to the device (eg. as part of a kernel launch) and either
/// unwrapped or transmuted to an appropriate pointer.
///
/// `DevicePointer` is guaranteed to have an equivalent internal representation to a raw pointer.
/// Thus, it can be safely reinterpreted or transmuted to `*mut T`. It is safe to pass a
/// `DevicePointer` through an FFI boundary to C code expecting a `*mut T`, so long as the code on
/// the other side of that boundary does not attempt to dereference the pointer on the CPU. It is
/// thus possible to pass a `DevicePointer` to a CUDA kernel written in C.
#[repr(transparent)]
#[derive(Clone, Copy, Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct DevicePointer<T: ?Sized + DeviceCopy> {
    /// The raw CUDA device address. This is an integer handle, not a
    /// CPU-dereferenceable pointer.
    ptr: CUdeviceptr,
    /// Zero-sized marker tying this pointer to `T` so the type system tracks
    /// what the address points at; `*mut T` gives pointer-like variance.
    marker: PhantomData<*mut T>,
}
30
// SAFETY: `DevicePointer` is `#[repr(transparent)]` over a `CUdeviceptr` (a plain
// integer address with no host-side resources), so a bytewise copy to the device
// is valid.
unsafe impl<T: ?Sized + DeviceCopy> DeviceCopy for DevicePointer<T> {}
32
33impl<T: DeviceCopy> Pointer for DevicePointer<T> {
34    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
35        let ptr = self.ptr as *const c_void;
36        fmt::Pointer::fmt(&ptr, f)
37    }
38}
39
40impl<T: ?Sized + DeviceCopy> DevicePointer<T> {
41    /// Returns a rust [`pointer`] created from this pointer, meant for FFI purposes.
42    /// **The pointer is not dereferenceable from the CPU!**
43    pub fn as_ptr(&self) -> *const T {
44        self.ptr as *const T
45    }
46
47    /// Returns a rust [`pointer`] created from this pointer, meant for FFI purposes.
48    /// **The pointer is not dereferenceable from the CPU!**
49    pub fn as_mut_ptr(&self) -> *mut T {
50        self.ptr as *mut T
51    }
52
53    /// Returns the contained CUdeviceptr.
54    pub fn as_raw(&self) -> CUdeviceptr {
55        self.ptr
56    }
57
58    /// Create a DevicePointer from a raw CUDA pointer
59    pub fn from_raw(ptr: CUdeviceptr) -> Self {
60        Self {
61            ptr,
62            marker: PhantomData,
63        }
64    }
65
66    /// Returns true if the pointer is null.
67    /// # Examples
68    ///
69    /// ```
70    /// # let _context = cust::quick_init().unwrap();
71    /// use cust::memory::*;
72    /// use std::ptr;
73    /// unsafe {
74    ///     let null : *mut u64 = ptr::null_mut();
75    ///     assert!(DevicePointer::wrap(null).is_null());
76    /// }
77    /// ```
78    pub fn is_null(self) -> bool {
79        self.ptr == 0
80    }
81
82    /// Returns a null device pointer.
83    ///
84    // TODO (AL): do we even want this?
85    pub fn null() -> Self
86    where
87        T: Sized,
88    {
89        Self {
90            ptr: 0,
91            marker: PhantomData,
92        }
93    }
94
95    /// Calculates the offset from a device pointer.
96    ///
97    /// `count` is in units of T; eg. a `count` of 3 represents a pointer offset of
98    /// `3 * size_of::<T>()` bytes.
99    ///
100    /// # Safety
101    ///
102    /// If any of the following conditions are violated, the result is Undefined
103    /// Behavior:
104    ///
105    /// * Both the starting and resulting pointer must be either in bounds or one
106    ///   byte past the end of *the same* allocated object.
107    ///
108    /// * The computed offset, **in bytes**, cannot overflow an `isize`.
109    ///
110    /// * The offset being in bounds cannot rely on "wrapping around" the address
111    ///   space. That is, the infinite-precision sum, **in bytes** must fit in a usize.
112    ///
113    /// Consider using `wrapping_offset` instead if these constraints are
114    /// difficult to satisfy. The only advantage of this method is that it
115    /// enables more aggressive compiler optimizations.
116    ///
117    /// # Examples
118    ///
119    /// ```
120    /// # let _context = cust::quick_init().unwrap();
121    /// use cust::memory::*;
122    /// unsafe {
123    ///     let mut dev_ptr = cuda_malloc::<u64>(5).unwrap();
124    ///     let offset = dev_ptr.offset(1); // Points to the 2nd u64 in the buffer
125    ///     cuda_free(dev_ptr); // Must free the buffer using the original pointer
126    /// }
127    /// ```
128    pub unsafe fn offset(self, count: isize) -> Self
129    where
130        T: Sized,
131    {
132        let ptr = self.ptr + (count as usize * size_of::<T>()) as u64;
133        Self {
134            ptr,
135            marker: PhantomData,
136        }
137    }
138
139    /// Calculates the offset from a device pointer using wrapping arithmetic.
140    ///
141    /// `count` is in units of T; eg. a `count` of 3 represents a pointer offset of
142    /// `3 * size_of::<T>()` bytes.
143    ///
144    /// # Safety
145    ///
146    /// The resulting pointer does not need to be in bounds, but it is
147    /// potentially hazardous to dereference (which requires `unsafe`).
148    /// In particular, the resulting pointer may *not* be used to access a
149    /// different allocated object than the one `self` points to. In other
150    /// words, `x.wrapping_offset(y.wrapping_offset_from(x))` is
151    /// *not* the same as `y`, and dereferencing it is undefined behavior
152    /// unless `x` and `y` point into the same allocated object.
153    ///
154    /// Always use `.offset(count)` instead when possible, because `offset`
155    /// allows the compiler to optimize better.  If you need to cross object
156    /// boundaries, cast the pointer to an integer and do the arithmetic there.
157    ///
158    /// # Examples
159    ///
160    /// ```
161    /// # let _context = cust::quick_init().unwrap();
162    /// use cust::memory::*;
163    /// unsafe {
164    ///     let mut dev_ptr = cuda_malloc::<u64>(5).unwrap();
165    ///     let offset = dev_ptr.wrapping_offset(1); // Points to the 2nd u64 in the buffer
166    ///     cuda_free(dev_ptr); // Must free the buffer using the original pointer
167    /// }
168    /// ```
169    pub fn wrapping_offset(self, count: isize) -> Self
170    where
171        T: Sized,
172    {
173        let ptr = self
174            .ptr
175            .wrapping_add((count as usize * size_of::<T>()) as u64);
176        Self {
177            ptr,
178            marker: PhantomData,
179        }
180    }
181
182    /// Calculates the offset from a pointer (convenience for `.offset(count as isize)`).
183    ///
184    /// `count` is in units of T; e.g. a `count` of 3 represents a pointer
185    /// offset of `3 * size_of::<T>()` bytes.
186    ///
187    /// # Safety
188    ///
189    /// If any of the following conditions are violated, the result is Undefined
190    /// Behavior:
191    ///
192    /// * Both the starting and resulting pointer must be either in bounds or one
193    ///   byte past the end of an allocated object.
194    ///
195    /// * The computed offset, **in bytes**, cannot overflow an `isize`.
196    ///
197    /// * The offset being in bounds cannot rely on "wrapping around" the address
198    ///   space. That is, the infinite-precision sum must fit in a `usize`.
199    ///
200    /// Consider using `wrapping_offset` instead if these constraints are
201    /// difficult to satisfy. The only advantage of this method is that it
202    /// enables more aggressive compiler optimizations.
203    ///
204    /// # Examples
205    ///
206    /// ```
207    /// # let _context = cust::quick_init().unwrap();
208    /// use cust::memory::*;
209    /// unsafe {
210    ///     let mut dev_ptr = cuda_malloc::<u64>(5).unwrap();
211    ///     let offset = dev_ptr.add(1); // Points to the 2nd u64 in the buffer
212    ///     cuda_free(dev_ptr); // Must free the buffer using the original pointer
213    /// }
214    /// ```
215    #[allow(clippy::should_implement_trait)]
216    pub unsafe fn add(self, count: usize) -> Self
217    where
218        T: Sized,
219    {
220        self.offset(count as isize)
221    }
222
223    /// Calculates the offset from a pointer (convenience for
224    /// `.offset((count as isize).wrapping_neg())`).
225    ///
226    /// `count` is in units of T; e.g. a `count` of 3 represents a pointer
227    /// offset of `3 * size_of::<T>()` bytes.
228    ///
229    /// # Safety
230    ///
231    /// If any of the following conditions are violated, the result is Undefined
232    /// Behavior:
233    ///
234    /// * Both the starting and resulting pointer must be either in bounds or one
235    ///   byte past the end of an allocated object.
236    ///
237    /// * The computed offset, **in bytes**, cannot overflow an `isize`.
238    ///
239    /// * The offset being in bounds cannot rely on "wrapping around" the address
240    ///   space. That is, the infinite-precision sum must fit in a `usize`.
241    ///
242    /// Consider using `wrapping_offset` instead if these constraints are
243    /// difficult to satisfy. The only advantage of this method is that it
244    /// enables more aggressive compiler optimizations.
245    ///
246    /// # Examples
247    ///
248    /// ```
249    /// # let _context = cust::quick_init().unwrap();
250    /// use cust::memory::*;
251    /// unsafe {
252    ///     let mut dev_ptr = cuda_malloc::<u64>(5).unwrap();
253    ///     let offset = dev_ptr.add(4).sub(3); // Points to the 2nd u64 in the buffer
254    ///     cuda_free(dev_ptr); // Must free the buffer using the original pointer
255    /// }
256    #[allow(clippy::should_implement_trait)]
257    pub unsafe fn sub(self, count: usize) -> Self
258    where
259        T: Sized,
260    {
261        self.offset((count as isize).wrapping_neg())
262    }
263
264    /// Calculates the offset from a pointer using wrapping arithmetic.
265    /// (convenience for `.wrapping_offset(count as isize)`)
266    ///
267    /// `count` is in units of T; e.g. a `count` of 3 represents a pointer
268    /// offset of `3 * size_of::<T>()` bytes.
269    ///
270    /// # Safety
271    ///
272    /// The resulting pointer does not need to be in bounds, but it is
273    /// potentially hazardous to dereference.
274    ///
275    /// Always use `.add(count)` instead when possible, because `add`
276    /// allows the compiler to optimize better.
277    ///
278    /// # Examples
279    ///
280    /// ```
281    /// # let _context = cust::quick_init().unwrap();
282    /// use cust::memory::*;
283    /// unsafe {
284    ///     let mut dev_ptr = cuda_malloc::<u64>(5).unwrap();
285    ///     let offset = dev_ptr.wrapping_add(1); // Points to the 2nd u64 in the buffer
286    ///     cuda_free(dev_ptr); // Must free the buffer using the original pointer
287    /// }
288    /// ```
289    pub fn wrapping_add(self, count: usize) -> Self
290    where
291        T: Sized,
292    {
293        self.wrapping_offset(count as isize)
294    }
295
296    /// Calculates the offset from a pointer using wrapping arithmetic.
297    /// (convenience for `.wrapping_offset((count as isize).wrapping_sub())`)
298    ///
299    /// `count` is in units of T; e.g. a `count` of 3 represents a pointer
300    /// offset of `3 * size_of::<T>()` bytes.
301    ///
302    /// # Safety
303    ///
304    /// The resulting pointer does not need to be in bounds, but it is
305    /// potentially hazardous to dereference (which requires `unsafe`).
306    ///
307    /// Always use `.sub(count)` instead when possible, because `sub`
308    /// allows the compiler to optimize better.
309    ///
310    /// # Examples
311    ///
312    /// ```
313    /// # let _context = cust::quick_init().unwrap();
314    /// use cust::memory::*;
315    /// unsafe {
316    ///     let mut dev_ptr = cuda_malloc::<u64>(5).unwrap();
317    ///     let offset = dev_ptr.wrapping_add(4).wrapping_sub(3); // Points to the 2nd u64 in the buffer
318    ///     cuda_free(dev_ptr); // Must free the buffer using the original pointer
319    /// }
320    /// ```
321    pub fn wrapping_sub(self, count: usize) -> Self
322    where
323        T: Sized,
324    {
325        self.wrapping_offset((count as isize).wrapping_neg())
326    }
327
328    /// Casts this device pointer to another type.
329    pub fn cast<U: DeviceCopy>(self) -> DevicePointer<U> {
330        DevicePointer::from_raw(self.ptr)
331    }
332}
333
/// A pointer to unified memory.
///
/// `UnifiedPointer` can be safely dereferenced by the CPU, as the memory allocation it points to is
/// shared between the CPU and the GPU. It can also be safely copied to the device (eg. as part of
/// a kernel launch).
///
/// `UnifiedPointer` is guaranteed to have an equivalent internal representation to a raw pointer.
/// Thus, it can be safely reinterpreted or transmuted to `*mut T`. It is also safe to pass a
/// `UnifiedPointer` through an FFI boundary to C code expecting a `*mut T`. It is
/// thus possible to pass a `UnifiedPointer` to a CUDA kernel written in C.
#[repr(transparent)]
#[derive(Copy, Clone, Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
// Newtype over the raw pointer; `repr(transparent)` guarantees identical layout.
pub struct UnifiedPointer<T: ?Sized + DeviceCopy>(*mut T);
347
// SAFETY: `UnifiedPointer` is `#[repr(transparent)]` over a raw pointer into
// unified memory, which is addressable from both host and device, so a bytewise
// copy to the device is valid.
unsafe impl<T: ?Sized + DeviceCopy> DeviceCopy for UnifiedPointer<T> {}
349
350impl<T: DeviceCopy> Pointer for UnifiedPointer<T> {
351    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
352        fmt::Pointer::fmt(&self.0, f)
353    }
354}
355
impl<T: ?Sized + DeviceCopy> UnifiedPointer<T> {
    /// Wrap the given raw pointer in a UnifiedPointer. The given pointer is assumed to be a valid,
    /// unified-memory pointer or null.
    ///
    /// # Safety
    ///
    /// The given pointer must have been allocated with
    /// [`cuda_malloc_unified`](fn.cuda_malloc_unified.html) or be null.
    ///
    /// # Examples
    ///
    /// ```
    /// # let _context = cust::quick_init().unwrap();
    /// use cust::memory::*;
    /// use std::ptr;
    /// unsafe {
    ///     let null : *mut u64 = ptr::null_mut();
    ///     assert!(UnifiedPointer::wrap(null).is_null());
    /// }
    /// ```
    pub unsafe fn wrap(ptr: *mut T) -> Self {
        UnifiedPointer(ptr)
    }

    /// Returns the contained pointer as a raw pointer.
    ///
    /// # Examples
    ///
    /// ```
    /// # let _context = cust::quick_init().unwrap();
    /// use cust::memory::*;
    /// unsafe {
    ///     let unified_ptr = cuda_malloc_unified::<u64>(1).unwrap();
    ///     let ptr: *const u64 = unified_ptr.as_raw();
    ///     cuda_free_unified(unified_ptr);
    /// }
    /// ```
    pub fn as_raw(self) -> *const T {
        self.0
    }

    /// Returns the contained pointer as a mutable raw pointer.
    ///
    /// # Examples
    ///
    /// ```
    /// # let _context = cust::quick_init().unwrap();
    /// use cust::memory::*;
    /// unsafe {
    ///     let mut unified_ptr = cuda_malloc_unified::<u64>(1).unwrap();
    ///     let ptr: *mut u64 = unified_ptr.as_raw_mut();
    ///     *ptr = 5u64;
    ///     cuda_free_unified(unified_ptr);
    /// }
    /// ```
    pub fn as_raw_mut(&mut self) -> *mut T {
        self.0
    }

    /// Returns true if the pointer is null.
    ///
    /// # Examples
    ///
    /// ```
    /// # let _context = cust::quick_init().unwrap();
    /// use cust::memory::*;
    /// use std::ptr;
    /// unsafe {
    ///     let null : *mut u64 = ptr::null_mut();
    ///     assert!(UnifiedPointer::wrap(null).is_null());
    /// }
    /// ```
    pub fn is_null(self) -> bool {
        self.0.is_null()
    }

    /// Returns a null unified pointer.
    ///
    /// # Examples:
    ///
    /// ```
    /// # let _context = cust::quick_init().unwrap();
    /// use cust::memory::*;
    /// let ptr : UnifiedPointer<u64> = UnifiedPointer::null();
    /// assert!(ptr.is_null());
    /// ```
    pub fn null() -> Self
    where
        T: Sized,
    {
        // Null satisfies `wrap`'s contract ("…or be null"), so this is sound.
        unsafe { Self::wrap(ptr::null_mut()) }
    }

    /// Calculates the offset from a unified pointer.
    ///
    /// `count` is in units of T; eg. a `count` of 3 represents a pointer offset of
    /// `3 * size_of::<T>()` bytes.
    ///
    /// # Safety
    ///
    /// If any of the following conditions are violated, the result is Undefined
    /// Behavior:
    ///
    /// * Both the starting and resulting pointer must be either in bounds or one
    ///   byte past the end of *the same* allocated object.
    ///
    /// * The computed offset, **in bytes**, cannot overflow an `isize`.
    ///
    /// * The offset being in bounds cannot rely on "wrapping around" the address
    ///   space. That is, the infinite-precision sum, **in bytes** must fit in a usize.
    ///
    /// Consider using `wrapping_offset` instead if these constraints are
    /// difficult to satisfy. The only advantage of this method is that it
    /// enables more aggressive compiler optimizations.
    ///
    /// # Examples
    ///
    /// ```
    /// # let _context = cust::quick_init().unwrap();
    /// use cust::memory::*;
    /// unsafe {
    ///     let mut unified_ptr = cuda_malloc_unified::<u64>(5).unwrap();
    ///     let offset = unified_ptr.offset(1); // Points to the 2nd u64 in the buffer
    ///     cuda_free_unified(unified_ptr); // Must free the buffer using the original pointer
    /// }
    /// ```
    pub unsafe fn offset(self, count: isize) -> Self
    where
        T: Sized,
    {
        // Delegates to the raw pointer's `offset`; the caller upholds the
        // in-bounds requirements listed above.
        Self::wrap(self.0.offset(count))
    }

    /// Calculates the offset from a unified pointer using wrapping arithmetic.
    ///
    /// `count` is in units of T; eg. a `count` of 3 represents a pointer offset of
    /// `3 * size_of::<T>()` bytes.
    ///
    /// # Safety
    ///
    /// The resulting pointer does not need to be in bounds, but it is
    /// potentially hazardous to dereference (which requires `unsafe`).
    /// In particular, the resulting pointer may *not* be used to access a
    /// different allocated object than the one `self` points to. In other
    /// words, `x.wrapping_offset(y.wrapping_offset_from(x))` is
    /// *not* the same as `y`, and dereferencing it is undefined behavior
    /// unless `x` and `y` point into the same allocated object.
    ///
    /// Always use `.offset(count)` instead when possible, because `offset`
    /// allows the compiler to optimize better.  If you need to cross object
    /// boundaries, cast the pointer to an integer and do the arithmetic there.
    ///
    /// # Examples
    ///
    /// ```
    /// # let _context = cust::quick_init().unwrap();
    /// use cust::memory::*;
    /// unsafe {
    ///     let mut unified_ptr = cuda_malloc_unified::<u64>(5).unwrap();
    ///     let offset = unified_ptr.wrapping_offset(1); // Points to the 2nd u64 in the buffer
    ///     cuda_free_unified(unified_ptr); // Must free the buffer using the original pointer
    /// }
    /// ```
    pub fn wrapping_offset(self, count: isize) -> Self
    where
        T: Sized,
    {
        // `wrapping_offset` on the raw pointer never overflows, so wrapping the
        // result is sound (dereferencing it may still not be).
        unsafe { Self::wrap(self.0.wrapping_offset(count)) }
    }

    /// Calculates the offset from a pointer (convenience for `.offset(count as isize)`).
    ///
    /// `count` is in units of T; e.g. a `count` of 3 represents a pointer
    /// offset of `3 * size_of::<T>()` bytes.
    ///
    /// # Safety
    ///
    /// If any of the following conditions are violated, the result is Undefined
    /// Behavior:
    ///
    /// * Both the starting and resulting pointer must be either in bounds or one
    ///   byte past the end of an allocated object.
    ///
    /// * The computed offset, **in bytes**, cannot overflow an `isize`.
    ///
    /// * The offset being in bounds cannot rely on "wrapping around" the address
    ///   space. That is, the infinite-precision sum must fit in a `usize`.
    ///
    /// Consider using `wrapping_offset` instead if these constraints are
    /// difficult to satisfy. The only advantage of this method is that it
    /// enables more aggressive compiler optimizations.
    ///
    /// # Examples
    ///
    /// ```
    /// # let _context = cust::quick_init().unwrap();
    /// use cust::memory::*;
    /// unsafe {
    ///     let mut unified_ptr = cuda_malloc_unified::<u64>(5).unwrap();
    ///     let offset = unified_ptr.add(1); // Points to the 2nd u64 in the buffer
    ///     cuda_free_unified(unified_ptr); // Must free the buffer using the original pointer
    /// }
    /// ```
    #[allow(clippy::should_implement_trait)]
    pub unsafe fn add(self, count: usize) -> Self
    where
        T: Sized,
    {
        self.offset(count as isize)
    }

    /// Calculates the offset from a pointer (convenience for
    /// `.offset((count as isize).wrapping_neg())`).
    ///
    /// `count` is in units of T; e.g. a `count` of 3 represents a pointer
    /// offset of `3 * size_of::<T>()` bytes.
    ///
    /// # Safety
    ///
    /// If any of the following conditions are violated, the result is Undefined
    /// Behavior:
    ///
    /// * Both the starting and resulting pointer must be either in bounds or one
    ///   byte past the end of an allocated object.
    ///
    /// * The computed offset, **in bytes**, cannot overflow an `isize`.
    ///
    /// * The offset being in bounds cannot rely on "wrapping around" the address
    ///   space. That is, the infinite-precision sum must fit in a `usize`.
    ///
    /// Consider using `wrapping_offset` instead if these constraints are
    /// difficult to satisfy. The only advantage of this method is that it
    /// enables more aggressive compiler optimizations.
    ///
    /// # Examples
    ///
    /// ```
    /// # let _context = cust::quick_init().unwrap();
    /// use cust::memory::*;
    /// unsafe {
    ///     let mut unified_ptr = cuda_malloc_unified::<u64>(5).unwrap();
    ///     let offset = unified_ptr.add(4).sub(3); // Points to the 2nd u64 in the buffer
    ///     cuda_free_unified(unified_ptr); // Must free the buffer using the original pointer
    /// }
    /// ```
    #[allow(clippy::should_implement_trait)]
    pub unsafe fn sub(self, count: usize) -> Self
    where
        T: Sized,
    {
        self.offset((count as isize).wrapping_neg())
    }

    /// Calculates the offset from a pointer using wrapping arithmetic.
    /// (convenience for `.wrapping_offset(count as isize)`)
    ///
    /// `count` is in units of T; e.g. a `count` of 3 represents a pointer
    /// offset of `3 * size_of::<T>()` bytes.
    ///
    /// # Safety
    ///
    /// The resulting pointer does not need to be in bounds, but it is
    /// potentially hazardous to dereference.
    ///
    /// Always use `.add(count)` instead when possible, because `add`
    /// allows the compiler to optimize better.
    ///
    /// # Examples
    ///
    /// ```
    /// # let _context = cust::quick_init().unwrap();
    /// use cust::memory::*;
    /// unsafe {
    ///     let mut unified_ptr = cuda_malloc_unified::<u64>(5).unwrap();
    ///     let offset = unified_ptr.wrapping_add(1); // Points to the 2nd u64 in the buffer
    ///     cuda_free_unified(unified_ptr); // Must free the buffer using the original pointer
    /// }
    /// ```
    pub fn wrapping_add(self, count: usize) -> Self
    where
        T: Sized,
    {
        self.wrapping_offset(count as isize)
    }

    /// Calculates the offset from a pointer using wrapping arithmetic.
    /// (convenience for `.wrapping_offset((count as isize).wrapping_neg())`)
    ///
    /// `count` is in units of T; e.g. a `count` of 3 represents a pointer
    /// offset of `3 * size_of::<T>()` bytes.
    ///
    /// # Safety
    ///
    /// The resulting pointer does not need to be in bounds, but it is
    /// potentially hazardous to dereference (which requires `unsafe`).
    ///
    /// Always use `.sub(count)` instead when possible, because `sub`
    /// allows the compiler to optimize better.
    ///
    /// # Examples
    ///
    /// ```
    /// # let _context = cust::quick_init().unwrap();
    /// use cust::memory::*;
    /// unsafe {
    ///     let mut unified_ptr = cuda_malloc_unified::<u64>(5).unwrap();
    ///     let offset = unified_ptr.wrapping_add(4).wrapping_sub(3); // Points to the 2nd u64 in the buffer
    ///     cuda_free_unified(unified_ptr); // Must free the buffer using the original pointer
    /// }
    /// ```
    pub fn wrapping_sub(self, count: usize) -> Self
    where
        T: Sized,
    {
        self.wrapping_offset((count as isize).wrapping_neg())
    }
}