1pub mod allocator;
26pub mod backend_impl;
27#[cfg(feature = "cuda")]
28pub mod bf16;
29pub mod blas;
30pub mod buffer;
31pub mod conv;
32pub mod cusolver;
33pub mod device;
34pub mod error;
35pub mod flash_attention;
36pub mod graph;
37pub mod kernels;
38pub mod memory_guard;
39pub mod module_cache;
40pub mod pool;
41pub mod rng;
42pub mod stream;
43pub mod tensor_bridge;
44pub mod transfer;
45
46pub use allocator::CudaAllocator;
48pub use backend_impl::{CudaBackendImpl, get_cuda_device, init_cuda_backend};
49pub use blas::gpu_bmm_f32;
50pub use blas::{gpu_bmm_f32_into, gpu_matmul_f32_into};
51pub use blas::{gpu_matmul_f32, gpu_matmul_f64};
52#[cfg(feature = "cuda")]
53pub use blas::{
54 gpu_matmul_bf16_bf16, gpu_matmul_bf16_bf16_nt, gpu_matmul_bf16_bf16_strided_batched,
55 gpu_matmul_bf16_bf16_strided_batched_nt,
56};
57#[cfg(feature = "cuda")]
58pub use bf16::{
59 gpu_add_bf16, gpu_block_reduce_max_abs_bf16, gpu_causal_mask_bf16,
60 gpu_embedding_gather_bf16, gpu_embedding_gather_bf16_to_f32, gpu_fatrelu_bf16, gpu_mul_bf16,
61 gpu_relu_bf16, gpu_repeat_kv_bf16, gpu_rmsnorm_bf16,
62 gpu_rope_half_bf16, gpu_scale_bf16, gpu_silu_bf16, gpu_softmax_bf16,
63 gpu_transpose_from_heads_bf16, gpu_transpose_to_heads_bf16,
64};
65pub use buffer::CudaBuffer;
66pub use conv::gpu_conv2d_f32;
67pub use device::GpuDevice;
68pub use error::{GpuError, GpuResult};
69pub use graph::{
70 CaptureMode, CapturePool, CaptureStatus, CapturedGraph, GraphPoolHandle,
71 begin_capture, capture_pool_for_handle, end_capture, end_capture_with_pool, graph_pool_handle,
72 make_graphed_callable, release_graph_pool_handle,
73};
74#[cfg(feature = "cuda")]
75pub use graph::{
76 GraphCaptureGuard, begin_capture_with_mode, begin_capture_with_pool, capture_status,
77 is_stream_capturing,
78};
79pub use flash_attention::gpu_flash_attention_f32;
80pub use kernels::{gpu_add, gpu_mul, gpu_neg, gpu_relu, gpu_sub};
81pub use kernels::{
82 gpu_add_into, gpu_embed_lookup_into, gpu_gelu_into, gpu_layernorm_into, gpu_mul_into,
83 gpu_permute_0213_into, gpu_scale_into, gpu_slice_read_into, gpu_small_matmul_into,
84 gpu_softmax_into, gpu_transpose_2d_into,
85};
86pub use kernels::{gpu_broadcast_add, gpu_broadcast_mul, gpu_broadcast_sub};
87pub use kernels::{gpu_causal_mask_indirect, gpu_slice_write_indirect};
88pub use kernels::{
89 gpu_dropout, gpu_embed_lookup, gpu_gelu, gpu_layernorm, gpu_permute_0213, gpu_slice_read,
90 gpu_slice_write, gpu_small_bmm, gpu_small_matmul, gpu_softmax, gpu_transpose_2d,
91};
92pub use memory_guard::{
93 MemoryGuard, MemoryGuardBuilder, MemoryGuardedDevice, MemoryHook, MemoryPressureListener,
94 MemoryReservation, MemoryStats, MemoryWatchdog, OomPolicy, PressureLevel,
95};
96pub use pool::{cached_bytes, empty_cache, empty_cache_all, round_len};
97pub use rng::{CudaRngManager, PhiloxGenerator, PhiloxState, cuda_rng_manager, fork_rng, join_rng};
98pub use tensor_bridge::{GpuFloat, GpuTensor, cuda, cuda_default, tensor_to_cpu, tensor_to_gpu};
99pub use transfer::{alloc_zeros, alloc_zeros_f32, alloc_zeros_f64, cpu_to_gpu, gpu_to_cpu};