numr 0.5.1

High-performance numerical computing with multi-backend GPU acceleration (CPU/CUDA/WebGPU)
Documentation
[package]
name = "numr"
version = "0.5.1"
edition = "2024"
rust-version = "1.89"
description = "High-performance numerical computing with multi-backend GPU acceleration (CPU/CUDA/WebGPU)"
license = "Apache-2.0"
repository = "https://github.com/ml-rust/numr"
documentation = "https://docs.rs/numr"
keywords = ["tensor", "ndarray", "linear-algebra", "gpu", "fft"]
categories = ["science", "mathematics", "data-structures"]

[package.metadata.docs.rs]
features = ["f16", "sparse"]
# cuda and wgpu require hardware SDKs not available on docs.rs

[features]
default = ["rayon"]
cuda = ["dep:cudarc"]
nccl = ["cuda", "cudarc?/nccl"]
distributed = ["dep:nexar", "dep:tokio"]
distributed-gpu = ["distributed", "nccl", "dep:nexar-nccl"]
wgpu = ["dep:wgpu", "dep:pollster"]
rayon = ["dep:rayon"]
f16 = [
  "dep:half",
  "cudarc?/f16",
] # Half-precision floats (F16, BF16) - optional reduced-precision support
fp8 = [
] # 8-bit floats (FP8E4M3, FP8E5M2) - optional ultra-low-precision support
sparse = [] # Sparse tensor formats (CSR, CSC, COO) and operations

[dependencies]
# Core
thiserror = "2.0"
smallvec = "1"
bytemuck = { version = "1.24", features = ["derive"] }
num-traits = "0.2"
parking_lot = "0.12"

# Optional: Parallelism
rayon = { version = "1.11", optional = true }

# Zero-copy serialization for embedded data (used by sobol_data)
rkyv = "0.8"

# Optional: Half-precision floats
half = { version = "2.7", optional = true, features = [
  "bytemuck",
  "num-traits",
] }

# Optional: Inter-node distributed communication
nexar = { version = "0.1", optional = true }
nexar-nccl = { version = "0.1", optional = true }
tokio = { version = "1", features = ["rt"], optional = true }

# Optional: CUDA backend
cudarc = { version = "0.19", optional = true, features = [
  "cuda-version-from-build-system",
] }

# Optional: WebGPU backend
wgpu = { version = "28.0", optional = true }
pollster = { version = "0.4", optional = true }
paste = "1.0"

[dev-dependencies]
approx = "0.5"
rand = "0.9"
fluxbench = "0.1"
ndarray = "0.16"
nalgebra = "0.33"

[[bench]]
name = "matmul"
harness = false

[[bench]]
name = "reduce"
harness = false

[[bench]]
name = "fft"
harness = false

[[bench]]
name = "indexing"
harness = false

[[bench]]
name = "shape_ops"
harness = false

[[bench]]
name = "parallelism"
harness = false

[[bench]]
name = "ci_regression"
harness = false

[profile.release]
lto = "thin"
codegen-units = 1

[profile.bench]
lto = "thin"
codegen-units = 1