Skip to main content

rlx/
lib.rs

1// RLX — versatile ML compiler + runtime.
2// Copyright (C) 2026 Eugene Hauptmann, Nataliya Kosmyna.
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, version 3.
7//
8// This program is distributed in the hope that it will be useful,
9// but WITHOUT ANY WARRANTY; without even the implied warranty of
10// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11// GNU General Public License for more details.
12//
13// You should have received a copy of the GNU General Public License
14// along with this program. If not, see <https://www.gnu.org/licenses/>.
15
16//! # RLX
17//!
18//! A small ML compiler + runtime for transformer inference and training,
19//! with a JAX-shaped IR + autodiff + transforms (`jvp`, `hvp`, `vmap`)
20//! on top of CPU / Apple Silicon (Metal / MLX) / NVIDIA (CUDA) / AMD
21//! (ROCm) / Google TPU / cross-platform GPU (wgpu) / FPGA / Cortex-M
22//! backends.
23//!
24//! This is the **prelude crate** — pulls in the framework-level
25//! workspace members and re-exports the common types so a one-line
26//! `use rlx::prelude::*;` covers most usage.
27//!
//! ## Usage patterns
29//!
30//! ### 1. Build + run a graph by hand
31//!
32//! ```ignore
33//! use rlx::prelude::*;
34//!
35//! let mut g = Graph::new("hello");
36//! let x = g.input("x", Shape::new(&[1, 4], DType::F32));
37//! let w = g.param("w", Shape::new(&[4, 2], DType::F32));
38//! let y = g.matmul(x, w, Shape::new(&[1, 2], DType::F32));
39//! g.set_outputs(vec![y]);
40//!
41//! let mut compiled = Session::new(Device::Cpu).compile(g);
42//! compiled.set_param("w", &[1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0]);
43//! let out = compiled.run(&[("x", &[1.0, 2.0, 3.0, 4.0])]);
44//! ```
45//!
46//! ## Module map
47//!
48//! Every workspace crate is reachable as a module on `rlx`:
49//!
50//! | path            | crate           | what                                                                            |
51//! |-----------------|-----------------|---------------------------------------------------------------------------------|
52//! | `rlx::ir`       | `rlx-ir`        | IR types, ops, graph builder                                                    |
53//! | `rlx::opt`      | `rlx-opt`       | facade: `rlx-fusion` + `rlx-autodiff` + `rlx-compile`                           |
54//! | `rlx::driver`   | `rlx-driver`    | `Device` enum, registries                                                       |
55//! | `rlx::runtime`  | `rlx-runtime`   | `Session`, `CompiledGraph`                                                      |
56//! | `rlx::macros`   | `rlx-macros`    | `#[rlx_model]` proc macro                                                       |
57//! | `rlx::gguf`     | `rlx-gguf`      | GGUF parser + dequant *(feature `gguf`)*                                        |
58//! | `rlx::bench`    | `rlx-bench`     | benchmark harness *(feature `bench`)*                                           |
59//! | `rlx::sparse`   | `rlx-sparse`    | downstream: sparse linalg *(feature `sparse`)*                                  |
60//! | `rlx::splat`    | `rlx-splat`     | 3D Gaussian splatting *(feature `splat`)* — `register()`, decomposed IR ops      |
//! | `rlx::linalg`   | `rlx-linalg`    | downstream: dense linalg via LAPACK *(feature `linalg`)*                        |
//! | `rlx::umap`     | `rlx-umap`      | downstream: UMAP / fast-umap custom ops *(feature `umap`)*                      |
62//! | `rlx::cortexm`  | `rlx-cortexm`   | INT8 ARMv7E-M kernels *(feature `cortexm`)* — no `Backend` impl, kernels only   |
63//! | `rlx::fpga`     | `rlx-fpga`      | IR → SystemVerilog datapath synthesis *(feature `fpga`)* — no `Backend` impl    |
64//!
65//! ## Convenience namespaces
66//!
67//! Grouped re-exports for related concerns — use these when you want
68//! one focused subset without star-importing the whole prelude:
69//!
70//! | namespace            | what                                                                          |
71//! |----------------------|-------------------------------------------------------------------------------|
72//! | [`rlx::quant`]       | `QuantScheme`, `QuantMap` (IR quantization metadata)                          |
73//! | [`rlx::ops`]         | `Activation`, `BinaryOp`, `CmpOp`, `MaskKind`, `ChainStep`, `ChainOperand`    |
74//! | [`rlx::autodiff`]    | `jvp`, `hvp`, `vmap` + the autodiff entry points                              |
75//! | [`rlx::prelude`]     | star-import target covering the 95% case                                      |
76//!
77//! ## Backend feature gates
78//!
79//! Pick the ones that match your hardware. Multiple backends can be
80//! enabled at once; the runtime picks one per `Session`.
81//!
82//! | feature             | backend                              | platform                  |
83//! |---------------------|--------------------------------------|---------------------------|
84//! | `cpu` *(default)*   | NEON / AVX + Accelerate / OpenBLAS   | every host                |
85//! | `metal`             | Metal Performance Shaders + MSL      | macOS (Apple Silicon)     |
86//! | `mlx`               | Apple MLX (vendored)                 | macOS (Apple Silicon)     |
87//! | `gpu`               | wgpu (Vulkan / DX12 / WebGPU / Metal)| cross-platform            |
88//! | `cuda`              | cuBLAS / cuDNN / NVRTC               | Linux / Windows + NVIDIA  |
89//! | `rocm`              | hipBLAS / MIOpen                     | Linux + AMD               |
90//! | `tpu`               | libtpu PJRT plugin                   | Linux + GCP TPU           |
91//! | `blas-accelerate`   | macOS Accelerate                     | macOS                     |
92//! | `blas-mkl`          | Intel MKL                            | Intel / AMD CPUs          |
93//! | `blas-openblas`     | OpenBLAS                             | cross-platform CPU        |
94//!
95//! ## Convenience aggregates
96//!
97//! Single-flag setups for common platforms. Each composes the
98//! fragments most users want for that target.
99//!
100//! | feature           | expands to                                  |
101//! |-------------------|---------------------------------------------|
102//! | `apple-silicon`   | `cpu` + `metal` + `blas-accelerate`         |
103//! | `nvidia`          | `cpu` + `cuda`                              |
104//! | `edge`            | `cpu` + `cortexm`                           |
105//! | `all-cpu`         | `cpu` + `gguf` + `linalg`                   |
106//!
107//! `mlx` and `rocm` aren't in any aggregate because their crates
108//! aren't on crates.io (vendor-bundled submodule / workspace-
109//! relative kernel sources). To opt in, depend on the workspace via
110//! git and add the feature explicitly:
111//!
112//! ```toml
113//! rlx = { git = "https://github.com/MIT-RLX/rlx", features = ["apple-silicon", "mlx"] }
114//! ```
115
116#![doc(html_root_url = "https://docs.rs/rlx/0.2.1")]
117
118// ── Module re-exports ───────────────────────────────────────────
119
/// Tensor IR — types, shapes, ops, graph builder.
///
/// Exposes the whole `rlx-ir` crate as `rlx::ir`. The most common items
/// (`Graph`, `Shape`, `DType`, …) are also re-exported at the crate root.
/// See [`rlx-ir`](https://crates.io/crates/rlx-ir).
pub use rlx_ir as ir;

/// Graph rewrites + autodiff + vmap.
///
/// Exposes the `rlx-opt` crate as `rlx::opt`; the crate-level module map
/// describes it as a facade over `rlx-fusion`, `rlx-autodiff`, and
/// `rlx-compile`.
/// See [`rlx-opt`](https://crates.io/crates/rlx-opt).
pub use rlx_opt as opt;

/// Device enum + cross-cutting types.
///
/// Exposes the `rlx-driver` crate as `rlx::driver`. `Device` itself is
/// also available directly as `rlx::Device`.
/// See [`rlx-driver`](https://crates.io/crates/rlx-driver).
pub use rlx_driver as driver;

/// User-facing `Session` / `CompiledGraph`.
///
/// Exposes the `rlx-runtime` crate as `rlx::runtime`. Both `Session` and
/// `CompiledGraph` are also re-exported at the crate root.
/// See [`rlx-runtime`](https://crates.io/crates/rlx-runtime).
pub use rlx_runtime as runtime;

/// Procedural macros (`#[rlx_model]`, `pipeline_schedule!`).
///
/// Exposes the `rlx-macros` crate as `rlx::macros`.
/// See [`rlx-macros`](https://crates.io/crates/rlx-macros).
pub use rlx_macros as macros;
139
/// GGUF v1 / v2 / v3 parser + dequant, exposed as `rlx::gguf`.
/// Only present when the `gguf` feature is enabled.
/// See [`rlx-gguf`](https://crates.io/crates/rlx-gguf).
#[cfg(feature = "gguf")]
pub use rlx_gguf as gguf;

/// Uniform benchmark harness, exposed as `rlx::bench`.
/// Only present when the `bench` feature is enabled.
/// See [`rlx-bench`](https://crates.io/crates/rlx-bench).
#[cfg(feature = "bench")]
pub use rlx_bench as bench;

/// Downstream crate: sparse linear algebra (custom-op scaffold),
/// exposed as `rlx::sparse`.
/// Only present when the `sparse` feature is enabled.
/// See [`rlx-sparse`](https://crates.io/crates/rlx-sparse).
#[cfg(feature = "sparse")]
pub use rlx_sparse as sparse;

/// Downstream crate: dense linalg via LAPACK (custom-op scaffold),
/// exposed as `rlx::linalg`.
/// Only present when the `linalg` feature is enabled.
/// See [`rlx-linalg`](https://crates.io/crates/rlx-linalg).
#[cfg(feature = "linalg")]
pub use rlx_linalg as linalg;

/// Downstream crate: 3D Gaussian splatting (CPU reference render custom
/// op), exposed as `rlx::splat`.
/// Only present when the `splat` feature is enabled.
/// See [`rlx-splat`](https://crates.io/crates/rlx-splat).
#[cfg(feature = "splat")]
pub use rlx_splat as splat;

/// Downstream crate: UMAP / fast-umap custom ops (k-NN from pairwise
/// distances), exposed as `rlx::umap`.
/// Only present when the `umap` feature is enabled.
#[cfg(feature = "umap")]
pub use rlx_umap as umap;

/// `no_std` ARMv7E-M INT8 kernels (Cortex-M4F / M7), exposed as
/// `rlx::cortexm`. There is no `Backend` implementation here — call the
/// kernels (`dense`, `conv2d`, `maxpool`, `relu`, `argmax`) directly.
/// Only present when the `cortexm` feature is enabled.
/// See [`rlx-cortexm`](https://crates.io/crates/rlx-cortexm).
#[cfg(feature = "cortexm")]
pub use rlx_cortexm as cortexm;

/// IR → SystemVerilog datapath synthesis, exposed as `rlx::fpga`.
/// There is no `Backend` implementation here — synth + P&R takes
/// minutes; the entry point is `rlx::fpga::codegen::emit_model`.
/// Only present when the `fpga` feature is enabled.
/// See [`rlx-fpga`](https://crates.io/crates/rlx-fpga).
#[cfg(feature = "fpga")]
pub use rlx_fpga as fpga;
182
183// ── Error types ─────────────────────────────────────────────────
184//
185// The whole stack returns `anyhow::Result<T>` — `rlx::Result` /
186// `rlx::Error` make that the obvious choice for downstream code
187// without forcing an explicit `anyhow` dep at the call site.
188
189/// Crate-wide result type — alias of `anyhow::Result<T>`. Use this
190/// in `main()` and library boundaries.
191pub type Result<T, E = anyhow::Error> = std::result::Result<T, E>;
192
193/// Crate-wide error type — alias of `anyhow::Error`.
194pub type Error = anyhow::Error;
195
196// ── Flat re-exports for the most-common types ───────────────────
197//
198// These cover ~90% of user code: build a graph with rlx_ir types,
199// compile + run it through Session, then read back outputs. Less
200// common types stay reachable via the module re-exports above.
201
202pub use rlx_driver::Device;
203pub use rlx_ir::quant::QuantScheme;
204pub use rlx_ir::{
205    DType, Element, FusionPolicy, Graph, GraphModule, GraphStage, HirModule, HirOp, LirModule,
206    MirModule, Node, NodeId, Op, OpKind, Shape, Tick,
207};
208pub use rlx_ir::{
209    NodeOrigin, inspect_graph, inspect_graph_diff, inspect_hir, inspect_hir_stats, inspect_lir,
210    inspect_mir, inspect_mir_diff, inspect_mir_stats, node_label,
211};
212pub use rlx_opt::{
213    CalibrationRecord, CompilePipeline, CompileResult, FusionOptions, FusionReport, FusionTarget,
214    MissReason, MissedFusion, Pass, PipelineInspect, Precision, PrecisionPolicy, fusion_passes,
215    fusion_passes_for_supported, hvp, inspect_pipeline, jvp, maybe_dump_pipeline,
216    supported_for_target, supports_op, vmap,
217};
218pub use rlx_runtime::{CompiledGraph, Session};
219
220// ── Grouped namespaces ──────────────────────────────────────────
221
222/// Quantization metadata — schemes the IR carries per-tensor, plus
223/// the `QuantMap` graph-level annotation. Use these when wiring
224/// `Op::DequantMatMul` or attaching quant info to your own ops.
225///
226/// ```ignore
227/// use rlx::quant::QuantScheme;
228///
229/// let scheme = QuantScheme::GgufQ4K;   // GGUF Q4_K super-block
230/// assert!(scheme.is_gguf());
231/// assert_eq!(scheme.gguf_block_bytes(), 144);
232/// ```
233pub mod quant {
234    pub use rlx_ir::quant::{QuantMap, QuantScheme};
235}
236
237/// Op-builder helper enums — the variants the graph builder methods
238/// (`g.binary`, `g.compare`, `g.activation`, `g.attention_kind`, …)
239/// take as their first argument, plus the fused-chain primitives
240/// used by `Op::ElementwiseRegion`.
241///
242/// ```ignore
243/// use rlx::{Graph, Shape, DType};
244/// use rlx::ops::{Activation, BinaryOp};
245///
246/// let mut g = Graph::new("ex");
247/// let x = g.input("x", Shape::new(&[4], DType::F32));
248/// let y = g.input("y", Shape::new(&[4], DType::F32));
249/// let s = g.binary(BinaryOp::Add, x, y, Shape::new(&[4], DType::F32));
250/// let r = g.activation(Activation::Silu, s, Shape::new(&[4], DType::F32));
251/// g.set_outputs(vec![r]);
252/// ```
253pub mod ops {
254    pub use rlx_ir::op::{Activation, BinaryOp, ChainOperand, ChainStep, CmpOp, MaskKind};
255}
256
257/// Autodiff + transforms — re-exports the public entry points from
258/// `rlx_opt`. Use these when computing gradients or doing
259/// `vmap` / `jvp` / `hvp` over a graph.
260///
261/// ```ignore
262/// use rlx::autodiff::{jvp, vmap};
263/// ```
264pub mod autodiff {
265    pub use rlx_opt::{hvp, jvp, vmap};
266}
267
// ── Prelude — single `use rlx::prelude::*;` for the 95% case ────
//
// Includes the graph-building / runtime types, the `Result` / `Error`
// aliases, common IR helper enums, `QuantScheme`, autodiff entry points,
// the `ir::env` helpers, and a few pass / precision types. Skips
// less-common types — those stay reachable via the module re-exports above.
273
274/// Star-import target covering the 95% case:
275///
276/// ```ignore
277/// use rlx::prelude::*;
278///
279/// // graph building
280/// let mut g = Graph::new("ex");
281/// let x = g.input("x", Shape::new(&[1, 4], DType::F32));
282///
283/// // compile + run
284/// let mut compiled = Session::new(Device::Cpu).compile(g);
285/// let out = compiled.run(&[("x", &[1.0; 4])]);
286///
287/// ```
288pub mod prelude {
289    // Core graph + runtime
290    pub use crate::{
291        CompiledGraph, DType, Device, Element, Error, Graph, GraphModule, GraphStage, Node, NodeId,
292        Op, OpKind, Result, Session, Shape, Tick,
293    };
294    // IR builder helpers
295    pub use crate::ops::{Activation, BinaryOp, CmpOp, MaskKind};
296    // Quant metadata
297    pub use crate::QuantScheme;
298    // Autodiff
299    pub use crate::{hvp, jvp, vmap};
300    // Optimizer types — useful when configuring passes / precision
301    pub use crate::ir::env::{self, RlxEnv, RuntimeOverrides, flag, set, unset, var};
302    pub use crate::{CalibrationRecord, Pass, Precision, PrecisionPolicy};
303
304    // 3D Gaussian splatting (`rlx-splat` — call `register()` once per process)
305    #[cfg(feature = "splat")]
306    pub use crate::splat::{
307        gaussian_splat_render_common_ir, gaussian_splat_render_decomposed,
308        gaussian_splat_render_reference, register,
309    };
310    #[cfg(feature = "splat")]
311    pub use rlx_ir::ops::splat::{
312        GaussianSplatInputs, GaussianSplatRenderParams, gaussian_splat_prep_packed_len,
313        gaussian_splat_tile_count,
314    };
315    #[cfg(feature = "splat")]
316    pub use rlx_splat::prep_layout::{prep_packed_len, tile_count};
317}