pub mod allocator;
pub mod backend_impl;
pub mod blas;
pub mod buffer;
pub mod conv;
pub mod device;
pub mod error;
pub mod flash_attention;
pub mod graph;
pub mod kernels;
pub mod memory_guard;
pub mod module_cache;
pub mod pool;
pub mod rng;
pub mod stream;
pub mod tensor_bridge;
pub mod transfer;

pub use allocator::CudaAllocator;
pub use backend_impl::{CudaBackendImpl, get_cuda_device, init_cuda_backend};
pub use blas::{
    gpu_bmm_f32, gpu_bmm_f32_into, gpu_matmul_f32, gpu_matmul_f32_into, gpu_matmul_f64,
};
pub use buffer::CudaBuffer;
pub use conv::gpu_conv2d_f32;
pub use device::GpuDevice;
pub use error::{GpuError, GpuResult};
pub use flash_attention::gpu_flash_attention_f32;
pub use kernels::{
    gpu_add, gpu_add_into, gpu_broadcast_add, gpu_broadcast_mul, gpu_broadcast_sub,
    gpu_causal_mask_indirect, gpu_dropout, gpu_embed_lookup, gpu_embed_lookup_into, gpu_gelu,
    gpu_gelu_into, gpu_layernorm, gpu_layernorm_into, gpu_mul, gpu_mul_into, gpu_neg,
    gpu_permute_0213, gpu_permute_0213_into, gpu_relu, gpu_scale_into, gpu_slice_read,
    gpu_slice_read_into, gpu_slice_write, gpu_slice_write_indirect, gpu_small_bmm,
    gpu_small_matmul, gpu_small_matmul_into, gpu_softmax, gpu_softmax_into, gpu_sub,
    gpu_transpose_2d, gpu_transpose_2d_into,
};
pub use memory_guard::{
    MemoryGuard, MemoryGuardBuilder, MemoryGuardedDevice, MemoryHook, MemoryPressureListener,
    MemoryReservation, MemoryStats, MemoryWatchdog, OomPolicy, PressureLevel,
};
pub use pool::{cached_bytes, empty_cache, empty_cache_all, round_len};
pub use rng::{CudaRngManager, PhiloxGenerator, PhiloxState, cuda_rng_manager, fork_rng, join_rng};
pub use tensor_bridge::{GpuFloat, GpuTensor, cuda, cuda_default, tensor_to_cpu, tensor_to_gpu};
pub use transfer::{alloc_zeros, alloc_zeros_f32, alloc_zeros_f64, cpu_to_gpu, gpu_to_cpu};