1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
//! CUDA Driver API flag and constant definitions.
//!
//! Stream flags, event flags, memory pool attributes, memory attach flags,
//! host register flags, pointer attribute codes, memory type values, context
//! scheduling flags, function attribute constants, memory advise values,
//! limit constants, and occupancy flags.
// =========================================================================
// Stream creation flags
// =========================================================================
/// No special behaviour requested: the stream keeps its implicit
/// synchronisation with the NULL stream.
pub const CU_STREAM_DEFAULT: u32 = 0x0;
/// Opt the stream out of synchronising with the NULL stream.
pub const CU_STREAM_NON_BLOCKING: u32 = 0x1;
// =========================================================================
// Stream-ordered memory pool attributes (CUDA 11.2+)
// =========================================================================
/// Reuse policy: a freed block may be reused once the event dependencies
/// leading to the free have been followed.
pub const CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: u32 = 1;
/// Reuse policy: permit opportunistic reuse of freed blocks, without
/// ordering guarantees.
pub const CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: u32 = 2;
/// Reuse policy: let the driver insert its own internal dependencies so
/// that a freed block can be reused.
pub const CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: u32 = 3;
/// Release threshold in bytes: the amount of reserved memory the pool holds
/// on to before it tries to release memory back to the OS.
pub const CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: u32 = 4;
/// Bytes currently reserved by the pool (read-only).
pub const CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: u32 = 5;
/// Peak of reserved bytes since the last reset (resettable).
pub const CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: u32 = 6;
/// Bytes currently in use by allocations from the pool (read-only).
pub const CU_MEMPOOL_ATTR_USED_MEM_CURRENT: u32 = 7;
/// Peak of used bytes since the last reset (resettable).
pub const CU_MEMPOOL_ATTR_USED_MEM_HIGH: u32 = 8;
// =========================================================================
// Event creation flags
// =========================================================================
/// No flags: the default event behaviour.
pub const CU_EVENT_DEFAULT: u32 = 0;
/// Waiting on the event uses blocking synchronisation.
pub const CU_EVENT_BLOCKING_SYNC: u32 = 1 << 0;
/// Skip recording of timing data, which makes the event faster.
pub const CU_EVENT_DISABLE_TIMING: u32 = 1 << 1;
/// The event may be used as an interprocess event.
pub const CU_EVENT_INTERPROCESS: u32 = 1 << 2;
// =========================================================================
// Memory-attach flags (for managed / mapped memory)
// =========================================================================
/// Any stream on any device can access the memory.
pub const CU_MEM_ATTACH_GLOBAL: u32 = 0x1;
/// The memory starts out attached to the host; no stream on any device can
/// access it.
pub const CU_MEM_ATTACH_HOST: u32 = 0x2;
/// Only a single stream (on the associated device) can access the memory.
pub const CU_MEM_ATTACH_SINGLE: u32 = 0x4;
// =========================================================================
// cuMemHostRegister flags
// =========================================================================
/// The registration is portable, i.e. usable from every CUDA context.
pub const CU_MEMHOSTREGISTER_PORTABLE: u32 = 1 << 0;
/// Map the registered range into the device address space.
pub const CU_MEMHOSTREGISTER_DEVICEMAP: u32 = 1 << 1;
/// The pointer refers to I/O memory rather than system RAM.
pub const CU_MEMHOSTREGISTER_IOMEMORY: u32 = 1 << 2;
/// The GPU treats the registered range as read-only and will not write it.
pub const CU_MEMHOSTREGISTER_READ_ONLY: u32 = 1 << 3;
// =========================================================================
// cuPointerGetAttribute attribute codes
// =========================================================================
/// Query the CUDA context associated with a pointer.
pub const CU_POINTER_ATTRIBUTE_CONTEXT: u32 = 1;
/// Query the memory type (host / device / unified) of a pointer; the answer
/// is one of the `CU_MEMORYTYPE_*` values.
pub const CU_POINTER_ATTRIBUTE_MEMORY_TYPE: u32 = 2;
/// Query the device pointer corresponding to a host pointer.
pub const CU_POINTER_ATTRIBUTE_DEVICE_POINTER: u32 = 3;
/// Query the host pointer corresponding to a device pointer.
pub const CU_POINTER_ATTRIBUTE_HOST_POINTER: u32 = 4;
/// Query whether the memory is managed (unified).
///
/// The value is 8, matching `CUpointer_attribute` in `cuda.h`. Codes 5, 6
/// and 7 are P2P_TOKENS, SYNC_MEMOPS and BUFFER_ID, which are not bound
/// here; passing 7 would query the buffer ID, not the managed flag.
pub const CU_POINTER_ATTRIBUTE_IS_MANAGED: u32 = 8;
// =========================================================================
// CU_MEMORYTYPE values (returned by pointer attribute queries)
// =========================================================================
/// The pointer refers to host (system) memory.
pub const CU_MEMORYTYPE_HOST: u32 = 0x01;
/// The pointer refers to device (GPU) memory.
pub const CU_MEMORYTYPE_DEVICE: u32 = 0x02;
/// The pointer refers to array memory.
pub const CU_MEMORYTYPE_ARRAY: u32 = 0x03;
/// The pointer refers to unified (managed) memory.
pub const CU_MEMORYTYPE_UNIFIED: u32 = 0x04;
// =========================================================================
// Context scheduling flags
// =========================================================================
/// Let the driver choose the most appropriate scheduling mode.
pub const CU_CTX_SCHED_AUTO: u32 = 0x00;
/// Spin actively while waiting for GPU results.
pub const CU_CTX_SCHED_SPIN: u32 = 0x01;
/// Yield the CPU while waiting for GPU results.
pub const CU_CTX_SCHED_YIELD: u32 = 0x02;
/// Block the calling thread while waiting for results.
pub const CU_CTX_SCHED_BLOCKING_SYNC: u32 = 0x04;
/// Mask covering the scheduling flags (evaluates to 0x07).
pub const CU_CTX_SCHED_MASK: u32 =
    CU_CTX_SCHED_SPIN | CU_CTX_SCHED_YIELD | CU_CTX_SCHED_BLOCKING_SYNC;
/// Support mapped pinned allocations.
pub const CU_CTX_MAP_HOST: u32 = 1 << 3;
/// Keep the local memory allocation after a launch.
pub const CU_CTX_LMEM_RESIZE_TO_MAX: u32 = 1 << 4;
/// Enable coredumps.
pub const CU_CTX_COREDUMP_ENABLE: u32 = 1 << 5;
/// Enable user coredumps.
pub const CU_CTX_USER_COREDUMP_ENABLE: u32 = 1 << 6;
/// Sync-memops flag.
pub const CU_CTX_SYNC_MEMOPS: u32 = 1 << 7;
/// Mask covering every context flag in this group (evaluates to 0xFF).
pub const CU_CTX_FLAGS_MASK: u32 =
    CU_CTX_SCHED_MASK |
    CU_CTX_MAP_HOST |
    CU_CTX_LMEM_RESIZE_TO_MAX |
    CU_CTX_COREDUMP_ENABLE |
    CU_CTX_USER_COREDUMP_ENABLE |
    CU_CTX_SYNC_MEMOPS;
// =========================================================================
// Function attribute values (used with cuFuncGetAttribute)
// =========================================================================
/// Largest number of threads per block the function can be launched with.
pub const CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: i32 = 0;
/// Bytes of shared memory the function uses.
pub const CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: i32 = 1;
/// Bytes of user-allocated constant memory.
pub const CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: i32 = 2;
/// Bytes of local memory used by each thread.
pub const CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: i32 = 3;
/// Registers used by each thread.
pub const CU_FUNC_ATTRIBUTE_NUM_REGS: i32 = 4;
/// PTX virtual architecture version (e.g. 70 for sm_70).
pub const CU_FUNC_ATTRIBUTE_PTX_VERSION: i32 = 5;
/// Binary architecture version (e.g. 70 for sm_70).
pub const CU_FUNC_ATTRIBUTE_BINARY_VERSION: i32 = 6;
/// Whether the function was compiled with the `-Xptxas --dlcm=ca` option
/// (global loads cached in L1).
pub const CU_FUNC_ATTRIBUTE_CACHE_MODE_CA: i32 = 7;
/// Upper bound, in bytes, on dynamically allocated shared memory.
pub const CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: i32 = 8;
/// Preferred shared-memory carve-out.
pub const CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: i32 = 9;
/// Whether a cluster size must be set when launching the function.
pub const CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET: i32 = 10;
/// Cluster width the function requires.
pub const CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: i32 = 11;
/// Cluster height the function requires.
pub const CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: i32 = 12;
/// Cluster depth the function requires.
pub const CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: i32 = 13;
/// Whether a non-portable cluster size is allowed.
pub const CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: i32 = 14;
/// Cluster scheduling policy preference of the function.
pub const CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: i32 = 15;
// =========================================================================
// Memory advise values
// =========================================================================
/// Advise that the data will mostly be read.
pub const CU_MEM_ADVISE_SET_READ_MOSTLY: u32 = 0x1;
/// Withdraw the read-mostly hint.
pub const CU_MEM_ADVISE_UNSET_READ_MOSTLY: u32 = 0x2;
/// Make the specified device the preferred location of the data.
pub const CU_MEM_ADVISE_SET_PREFERRED_LOCATION: u32 = 0x3;
/// Clear the preferred location.
pub const CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: u32 = 0x4;
/// Advise that the specified device will access the data.
pub const CU_MEM_ADVISE_SET_ACCESSED_BY: u32 = 0x5;
/// Withdraw the accessed-by hint for the specified device.
pub const CU_MEM_ADVISE_UNSET_ACCESSED_BY: u32 = 0x6;
// =========================================================================
// Limit values (cuCtxSetLimit / cuCtxGetLimit)
// =========================================================================
/// Per-thread GPU stack size.
pub const CU_LIMIT_STACK_SIZE: u32 = 0x0;
/// Size of the FIFO backing device-side printf.
pub const CU_LIMIT_PRINTF_FIFO_SIZE: u32 = 0x1;
/// Size of the heap that device-side `malloc()` allocates from.
pub const CU_LIMIT_MALLOC_HEAP_SIZE: u32 = 0x2;
/// Maximum nesting depth of a device runtime launch.
pub const CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: u32 = 0x3;
/// Maximum number of device runtime launches that may be outstanding.
pub const CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: u32 = 0x4;
/// Fetch granularity of the L2 cache.
pub const CU_LIMIT_MAX_L2_FETCH_GRANULARITY: u32 = 0x5;
/// Maximum size of the persisting L2 cache.
pub const CU_LIMIT_PERSISTING_L2_CACHE_SIZE: u32 = 0x6;
// =========================================================================
// Occupancy flags
// =========================================================================
/// No flags: the default occupancy calculation.
pub const CU_OCCUPANCY_DEFAULT: u32 = 0x0;
/// Disable the caching override during the occupancy calculation.
pub const CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE: u32 = 0x1;
// =========================================================================
// Tests
// =========================================================================