rlx/
lib.rs

1// RLX — versatile ML compiler + runtime.
2// Copyright (C) 2026 Eugene Hauptmann, Nataliya Kosmyna.
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, version 3.
7//
8// This program is distributed in the hope that it will be useful,
9// but WITHOUT ANY WARRANTY; without even the implied warranty of
10// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11// GNU General Public License for more details.
12//
13// You should have received a copy of the GNU General Public License
14// along with this program. If not, see <https://www.gnu.org/licenses/>.
15
16//! # RLX
17//!
18//! A small ML compiler + runtime for transformer inference and training,
19//! with a JAX-shaped IR + autodiff + transforms (`jvp`, `hvp`, `vmap`)
20//! on top of CPU / Apple Silicon (Metal / MLX) / NVIDIA (CUDA) / AMD
21//! (ROCm) / Google TPU / cross-platform GPU (wgpu) / FPGA / Cortex-M
22//! backends.
23//!
24//! This is the **prelude crate** — pulls in the framework-level
25//! workspace members and re-exports the common types so a one-line
26//! `use rlx::prelude::*;` covers most usage.
27//!
//! ## Usage patterns
29//!
30//! ### 1. Build + run a graph by hand
31//!
32//! ```ignore
33//! use rlx::prelude::*;
34//!
35//! let mut g = Graph::new("hello");
36//! let x = g.input("x", Shape::new(&[1, 4], DType::F32));
37//! let w = g.param("w", Shape::new(&[4, 2], DType::F32));
38//! let y = g.matmul(x, w, Shape::new(&[1, 2], DType::F32));
39//! let scaled = g.mul(x, g.constant(2.0, DType::F32)); // GraphExt literal
40//! g.set_outputs(vec![y, scaled]);
41//!
42//! let mut compiled = Session::new(Device::Cpu).compile(g);
43//! compiled.set_param("w", &[1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0]);
44//! let out = compiled.run(&[("x", &[1.0, 2.0, 3.0, 4.0])]);
45//! ```
46//!
47//! ## Module map
48//!
49//! Every workspace crate is reachable as a module on `rlx`:
50//!
51//! | path            | crate           | what                                                                            |
52//! |-----------------|-----------------|---------------------------------------------------------------------------------|
53//! | `rlx::ir`       | `rlx-ir`        | IR types, ops, graph builder                                                    |
54//! | `rlx::opt`      | `rlx-opt`       | facade: `rlx-fusion` + `rlx-autodiff` + `rlx-compile`                           |
55//! | `rlx::driver`   | `rlx-driver`    | `Device` enum, registries                                                       |
56//! | `rlx::runtime`  | `rlx-runtime`   | `Session`, `CompiledGraph`                                                      |
57//! | `rlx::macros`   | `rlx-macros`    | `#[rlx_model]` proc macro                                                       |
58//! | `rlx::collectives` | `rlx-collectives` | in-graph collective ops + mesh/planner *(feature `distributed`)*             |
59//! | `rlx::gguf`     | `rlx-gguf`      | GGUF parser + dequant *(feature `gguf`)*                                        |
60//! | `rlx::onnx`     | `rlx-onnx`      | ONNX Runtime `.onnx` inference *(feature `onnx`)*                               |
61//! | `rlx::bench`    | `rlx-bench`     | benchmark harness *(feature `bench`)*                                           |
62//! | `rlx::sparse`   | `rlx-sparse`    | downstream: sparse linalg *(feature `sparse`)*                                  |
63//! | `rlx::splat`    | `rlx-splat`     | 3D Gaussian splatting *(feature `splat`)* — `register()`, decomposed IR ops      |
64//! | `rlx::linalg`   | `rlx-linalg`    | downstream: dense linalg via LAPACK *(feature `linalg`)*                        |
65//! | `rlx::cortexm`  | `rlx-cortexm`   | INT8 ARMv7E-M kernels *(feature `cortexm`)* — no `Backend` impl, kernels only   |
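//! | `rlx::fpga`     | `rlx-fpga`      | IR → SystemVerilog export *(feature `fpga`)* — target-agnostic RTL; no `Backend` |
//! | `rlx::tensor`   | `rlx-tensor`    | symbolic tensor DSL — operator-overloaded graph building *(feature `tensor`, on by default)* |
//! | `rlx::gguf_convert` | `rlx-gguf-convert` | safetensors / ONNX → GGUF conversion with per-tensor quantization *(feature `gguf-convert`)* |
//! | `rlx::umap`     | `rlx-umap`      | downstream: UMAP / fast-umap custom ops *(feature `umap`)*                      |
//! | `rlx::optim`    | `rlx-optim`     | training-step optimizers *(feature `optim`)*                                    |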
67//!
68//! ## Convenience namespaces
69//!
70//! Grouped re-exports for related concerns — use these when you want
71//! one focused subset without star-importing the whole prelude:
72//!
73//! | namespace            | what                                                                          |
74//! |----------------------|-------------------------------------------------------------------------------|
//! | [`rlx::quant`](crate::quant)       | `QuantScheme`, `QuantMap` (IR quantization metadata)                          |
//! | [`rlx::ops`](crate::ops)           | `Activation`, `BinaryOp`, `CmpOp`, `MaskKind`, `ChainStep`, `ChainOperand`    |
//! | [`rlx::autodiff`](crate::autodiff) | `jvp`, `hvp`, `vmap` + the autodiff entry points                              |
//! | `rlx::distributed`                 | transports + in-graph collectives + ship-graph train/infer *(feature `distributed`)* |
//! | [`rlx::prelude`](crate::prelude)   | star-import target covering the 95% case                                      |
80//!
81//! ## Backend feature gates
82//!
83//! Pick the ones that match your hardware. Multiple backends can be
84//! enabled at once; the runtime picks one per `Session`.
85//!
86//! | feature             | backend                              | platform                  |
87//! |---------------------|--------------------------------------|---------------------------|
88//! | `cpu` *(default)*   | NEON / AVX + Accelerate / OpenBLAS   | every host                |
89//! | `metal`             | Metal Performance Shaders + MSL      | macOS (Apple Silicon)     |
90//! | `mlx`               | Apple MLX (vendored)                 | macOS (Apple Silicon)     |
91//! | `gpu`               | wgpu (Vulkan / DX12 / WebGPU / Metal)| cross-platform            |
92//! | `cuda`              | cuBLAS / cuDNN / NVRTC               | Linux / Windows + NVIDIA  |
93//! | `rocm`              | hipBLAS / MIOpen                     | Linux + AMD               |
94//! | `tpu`               | libtpu PJRT plugin                   | Linux + GCP TPU           |
95//! | `blas-accelerate`   | macOS Accelerate                     | macOS                     |
96//! | `blas-mkl`          | Intel MKL                            | Intel / AMD CPUs          |
97//! | `blas-openblas`     | OpenBLAS                             | cross-platform CPU        |
98//!
99//! ## Convenience aggregates
100//!
101//! Single-flag setups for common platforms. Each composes the
102//! fragments most users want for that target.
103//!
104//! | feature           | expands to                                  |
105//! |-------------------|---------------------------------------------|
106//! | `apple-silicon`   | `cpu` + `metal` + `blas-accelerate`         |
107//! | `nvidia`          | `cpu` + `cuda`                              |
108//! | `edge`            | `cpu` + `cortexm`                           |
109//! | `all-cpu`         | `cpu` + `gguf` + `linalg`                   |
110//!
//! `mlx` and `rocm` aren't in any aggregate because their crates
//! aren't on crates.io (vendor-bundled submodule / workspace-relative
//! kernel sources). To opt in, depend on the workspace via git and
//! add the feature explicitly:
115//!
116//! ```toml
117//! rlx = { git = "https://github.com/MIT-RLX/rlx", features = ["apple-silicon", "mlx"] }
118//! ```
119
120#![doc(html_root_url = "https://docs.rs/rlx/0.2.1")]
121
122// ── Module re-exports ───────────────────────────────────────────
123
124/// Tensor IR — types, shapes, ops, graph builder.
125/// See [`rlx-ir`](https://crates.io/crates/rlx-ir).
126pub use rlx_ir as ir;
127
128/// Symbolic tensor DSL — operator-overloaded graph building.
129/// Available with the `tensor` feature (on by default).
130#[cfg(feature = "tensor")]
131pub use rlx_tensor as tensor;
132
133/// Graph rewrites + autodiff + vmap.
134/// See [`rlx-opt`](https://crates.io/crates/rlx-opt).
135pub use rlx_opt as opt;
136
137/// Device enum + cross-cutting types.
138/// See [`rlx-driver`](https://crates.io/crates/rlx-driver).
139pub use rlx_driver as driver;
140
141/// User-facing `Session` / `CompiledGraph`.
142/// See [`rlx-runtime`](https://crates.io/crates/rlx-runtime).
143pub use rlx_runtime as runtime;
144
145/// Procedural macros (`#[rlx_model]`, `pipeline_schedule!`).
146/// See [`rlx-macros`](https://crates.io/crates/rlx-macros).
147pub use rlx_macros as macros;
148
149#[cfg(feature = "gguf")]
150/// GGUF v1 / v2 / v3 parser + dequant + quant encoders + writer.
151/// See [`rlx-gguf`](https://crates.io/crates/rlx-gguf).
152pub use rlx_gguf as gguf;
153
154#[cfg(feature = "gguf-convert")]
155/// safetensors / ONNX → GGUF conversion with per-tensor quantization.
156/// Useful at first inference load to shrink memory + disk footprint.
157/// See [`rlx-gguf-convert`](https://crates.io/crates/rlx-gguf-convert).
158pub use rlx_gguf_convert as gguf_convert;
159
160#[cfg(feature = "bench")]
161/// Uniform benchmark harness.
162/// See [`rlx-bench`](https://crates.io/crates/rlx-bench).
163pub use rlx_bench as bench;
164
165#[cfg(feature = "sparse")]
166/// Downstream: sparse linear algebra (custom-op scaffold).
167/// See [`rlx-sparse`](https://crates.io/crates/rlx-sparse).
168pub use rlx_sparse as sparse;
169
170#[cfg(feature = "linalg")]
171/// Downstream: dense linalg via LAPACK (custom-op scaffold).
172/// See [`rlx-linalg`](https://crates.io/crates/rlx-linalg).
173pub use rlx_linalg as linalg;
174
175#[cfg(feature = "splat")]
176/// Downstream: 3D Gaussian splatting (CPU reference render custom op).
177/// See [`rlx-splat`](https://crates.io/crates/rlx-splat).
178pub use rlx_splat as splat;
179
180#[cfg(feature = "umap")]
181/// Downstream: UMAP / fast-umap custom ops (k-NN from pairwise distances).
182pub use rlx_umap as umap;
183
184#[cfg(feature = "optim")]
185/// Training-step optimizers (Adam, AdamW, NAdamW, RAdam, QHAdamW,
186/// LAMB, Adafactor, Lion, SOAP, Kron-PSGD, Muon, Sophia, MARS). See
187/// [`rlx-optim`](https://crates.io/crates/rlx-optim).
188pub use rlx_optim as optim;
189
190#[cfg(feature = "cortexm")]
191/// `no_std` ARMv7E-M INT8 kernels (Cortex-M4F / M7). Doesn't
192/// implement `Backend` — call the kernels (`dense`, `conv2d`,
193/// `maxpool`, `relu`, `argmax`) directly.
194/// See [`rlx-cortexm`](https://crates.io/crates/rlx-cortexm).
195pub use rlx_cortexm as cortexm;
196
197#[cfg(feature = "fpga")]
198/// IR → SystemVerilog datapath synthesis + runtime [`export`](rlx_runtime::export).
199///
200/// Prefer the prelude when the `fpga` feature is on:
201///
202/// ```ignore
203/// use rlx::prelude::*;
204///
205/// let arts = ExportSession::fpga("hw/out")
206///     .hw_target(HwTarget::Generic)
207///     .export_model(&tinyconv_mnist_from_cortexm())?;
208/// ```
209///
210/// Entry via module path: `rlx::fpga::export_graph` / `emit_with_config`.
211/// Soft-port RTL by default (`HwTarget::Generic`); optional ECP5/iCE40/Xilinx7
212/// synth scripts. See [`rlx-fpga`](https://crates.io/crates/rlx-fpga).
213pub use rlx_fpga as fpga;
214
215#[cfg(feature = "onnx")]
216/// ONNX Runtime inference for `.onnx` files on RLX [`Device`] backends.
217/// See [`rlx-onnx`](https://crates.io/crates/rlx-onnx).
218pub use rlx_onnx as onnx;
219
220#[cfg(feature = "distributed")]
221/// In-graph collective ops (`collective.all_reduce`, all-gather, reduce-scatter,
222/// broadcast, all-to-all, ppermute, send/recv, the Megatron `f`/`g` operators),
223/// the group registry, and the device-mesh / placement planner. The unified
224/// [`rlx::distributed`](crate::distributed) namespace folds this together with
225/// the `rlx-driver` transports and the `rlx-runtime::dist` ship-graph API.
226/// See [`rlx-collectives`](https://crates.io/crates/rlx-collectives).
227pub use rlx_collectives as collectives;
228
229// ── Error types ─────────────────────────────────────────────────
230//
231// The whole stack returns `anyhow::Result<T>` — `rlx::Result` /
232// `rlx::Error` make that the obvious choice for downstream code
233// without forcing an explicit `anyhow` dep at the call site.
234
235/// Crate-wide result type — alias of `anyhow::Result<T>`. Use this
236/// in `main()` and library boundaries.
237pub type Result<T, E = anyhow::Error> = std::result::Result<T, E>;
238
239/// Crate-wide error type — alias of `anyhow::Error`.
240pub type Error = anyhow::Error;
241
242// ── Flat re-exports for the most-common types ───────────────────
243//
244// These cover ~90% of user code: build a graph with rlx_ir types,
245// compile + run it through Session, then read back outputs. Less
246// common types stay reachable via the module re-exports above.
247
248pub use rlx_driver::Device;
249#[cfg(feature = "fpga")]
250pub use rlx_fpga::{
251    ExportQuantMode, FpgaExportConfig, GraphIoBind, HwTarget, InputIface, IoConfig, OutputIface,
252    OutputKind, PortNames, SidebandSpec, tinyconv_mnist_from_cortexm,
253};
254pub use rlx_ir::quant::QuantScheme;
255pub use rlx_ir::{
256    DType, Element, FusionPolicy, Graph, GraphExt, GraphModule, GraphStage, HirModule, HirOp,
257    LirModule, MirModule, Node, NodeId, Op, OpKind, Shape, Tick, scalar_constant_bytes,
258};
259pub use rlx_ir::{
260    NodeOrigin, inspect_graph, inspect_graph_diff, inspect_hir, inspect_hir_stats, inspect_lir,
261    inspect_mir, inspect_mir_diff, inspect_mir_stats, node_label,
262};
263pub use rlx_opt::{
264    CalibrationRecord, CompilePipeline, CompileResult, FusionOptions, FusionReport, FusionTarget,
265    MissReason, MissedFusion, Pass, PipelineInspect, Precision, PrecisionPolicy, fusion_passes,
266    fusion_passes_for_supported, hvp, inspect_pipeline, jvp, maybe_dump_pipeline,
267    supported_for_target, supports_op, vmap,
268};
269pub use rlx_runtime::{
270    BackendsManifest, CompiledGraph, DeviceBenchResult, DeviceCandidate, DeviceFallbackError,
271    DevicePickStrategy, DevicePolicy, DeviceRouter, FlexibleSession, GraphDevices,
272    ParseDeviceError, Session, available_devices, benchmark_devices, device_chain_from_env,
273    device_from_env, device_label, device_report, devices_for, devices_for_with_policy,
274    fastest_device, fastest_device_for, graph_param_names, is_available, parse_device,
275    parse_device_list, resolve_device, resolve_device_chain, run_with_fallback,
276};
277#[cfg(feature = "fpga")]
278pub use rlx_runtime::{
279    ExportOptions, ExportSession, ExportTarget, ExportedArtifacts, export_graph,
280    export_tinyconv_mnist,
281};
282
283// ── Grouped namespaces ──────────────────────────────────────────
284
285/// Quantization metadata — schemes the IR carries per-tensor, plus
286/// the `QuantMap` graph-level annotation. Use these when wiring
287/// `Op::DequantMatMul` or attaching quant info to your own ops.
288///
289/// ```ignore
290/// use rlx::quant::QuantScheme;
291///
292/// let scheme = QuantScheme::GgufQ4K;   // GGUF Q4_K super-block
293/// assert!(scheme.is_gguf());
294/// assert_eq!(scheme.gguf_block_bytes(), 144);
295/// ```
296pub mod quant {
297    pub use rlx_ir::quant::{QuantMap, QuantScheme};
298}
299
300/// Op-builder helper enums — the variants the graph builder methods
301/// (`g.binary`, `g.compare`, `g.activation`, `g.attention_kind`, …)
302/// take as their first argument, plus the fused-chain primitives
303/// used by `Op::ElementwiseRegion`.
304///
305/// ```ignore
306/// use rlx::{Graph, GraphExt, Shape, DType};
307/// use rlx::ops::{Activation, BinaryOp};
308///
309/// let mut g = Graph::new("ex");
310/// let x = g.input("x", Shape::new(&[4], DType::F32));
311/// let y = g.input("y", Shape::new(&[4], DType::F32));
312/// let s = g.binary(BinaryOp::Add, x, y, Shape::new(&[4], DType::F32));
313/// let r = g.activation(Activation::Silu, s, Shape::new(&[4], DType::F32));
314/// let scaled = g.mul(x, g.constant(2.0, DType::F32));
315/// g.set_outputs(vec![r, scaled]);
316/// ```
317pub mod ops {
318    pub use rlx_ir::op::{Activation, BinaryOp, ChainOperand, ChainStep, CmpOp, MaskKind};
319}
320
321/// Autodiff + transforms — re-exports the public entry points from
322/// `rlx_opt`. Use these when computing gradients or doing
323/// `vmap` / `jvp` / `hvp` over a graph.
324///
325/// ```ignore
326/// use rlx::autodiff::{jvp, vmap};
327/// ```
328pub mod autodiff {
329    pub use rlx_opt::{hvp, jvp, vmap};
330}
331
/// Distributed training + inference behind one namespace. It gathers
/// three layers that otherwise live in separate crates:
///
/// - the transport layer (`rlx-driver`: `ProcessGroup`, transports,
///   `Node` discovery, `ReduceMode`);
/// - the in-graph collective op builders + placement planner
///   (`rlx-collectives`);
/// - the ship-graph worker/coordinator + heterogeneous placement
///   (`rlx-runtime::dist` / `::hetero`).
///
/// Requires feature `distributed`.
///
/// ```ignore
/// use rlx::distributed::*;
///
/// register(); // install the in-graph collective kernel once
/// // reproducible + precise cross-rank gradient reduce, baked into the graph:
/// let g = all_reduce_op_mode(&mut bwd, grad, gid, ReduceKind::Mean, ReduceMode::Deterministic);
/// // ship-graph data-parallel training on a heterogeneous cluster:
/// run_train(&group, rank, &spec, resolve, reduce)?;
/// // one-machine-vs-cluster divergence diagnostic:
/// let d = backend_divergence(&graph, &inputs)?;
/// ```
///
/// The collective ops and mesh/planner can also be reached directly on
/// [`rlx::collectives`](crate::collectives), and the ship-graph API on
/// [`rlx::runtime::dist`](crate::runtime) (always present, no feature).
#[cfg(feature = "distributed")]
pub mod distributed {
    // Everything the collectives prelude exposes: transports, in-graph
    // collective ops, device mesh / planner (rlx-driver + rlx-collectives).
    pub use crate::collectives::prelude::*;

    // Ship-graph (rlx-runtime::dist) — data types.
    pub use crate::runtime::dist::{
        BackendDivergence, DataRef, StageSpec, TrainMetrics, TrainSpec, WeightCache, WeightRef,
        WorkerStage,
    };
    // Ship-graph (rlx-runtime::dist) — inference / training / diagnostics
    // entry points.
    pub use crate::runtime::dist::{
        backend_divergence, pull_shards, push_shards, recv_activation, recv_stage, recv_train,
        report_backend_divergence, resolve_weight_bytes, resolve_weight_uri, run_train,
        send_activation, serve_stage, serve_stage_uri, ship_stage, ship_train, uri_resolver,
    };

    // Heterogeneous multi-backend placement (rlx-runtime::hetero).
    pub use crate::runtime::{DeviceMap, HeteroExecutable};
}
369
370// ── Prelude — single `use rlx::prelude::*;` for the 95% case ────
371//
372// Includes the graph-building / runtime types, common IR helper
373// enums, and autodiff entry points. Skips less-common
374// types — those stay reachable via the module re-exports above.
375
376/// Star-import target covering the 95% case:
377///
378/// ```ignore
379/// use rlx::prelude::*;
380///
381/// // graph building
382/// let mut g = Graph::new("ex");
383/// let x = g.input("x", Shape::new(&[1, 4], DType::F32));
384/// let y = g.mul(x, g.constant(2.0, DType::F32));
385/// g.set_outputs(vec![y]);
386///
387/// // compile + run (auto-pick fastest, or choose any compatible backend)
388/// let mut runner = GraphDevices::new(g);
389/// let device = runner.fastest(); // or pick from runner.devices()
390/// let out = runner.run(device, &[("x", &[1.0; 4])]).unwrap();
391///
392/// ```
393pub mod prelude {
394    // Tensor DSL (expression-style graph building) — feature `tensor`.
395    #[cfg(feature = "tensor")]
396    pub use crate::tensor::{GraphScope, Tensor, ax, graph, graph_with, ix, rg, s, shape, tail};
397    // Core graph + runtime
398    pub use crate::{
399        BackendsManifest, CompiledGraph, DType, Device, DeviceBenchResult, DeviceCandidate,
400        DeviceFallbackError, DevicePickStrategy, DevicePolicy, DeviceRouter, Element, Error,
401        FlexibleSession, Graph, GraphDevices, GraphExt, GraphModule, GraphStage, Node, NodeId, Op,
402        OpKind, ParseDeviceError, Result, Session, Shape, Tick, available_devices,
403        benchmark_devices, device_chain_from_env, device_from_env, device_label, device_report,
404        devices_for, devices_for_with_policy, fastest_device, fastest_device_for,
405        graph_param_names, is_available, parse_device, parse_device_list, resolve_device,
406        resolve_device_chain, run_with_fallback, scalar_constant_bytes,
407    };
408    // IR builder helpers
409    pub use crate::ops::{Activation, BinaryOp, CmpOp, MaskKind};
410    // Quant metadata
411    pub use crate::QuantScheme;
412    // Autodiff
413    pub use crate::{hvp, jvp, vmap};
414    // Optimizer types — useful when configuring passes / precision
415    pub use crate::ir::env::{self, RlxEnv, RuntimeOverrides, flag, set, unset, var};
416    pub use crate::{CalibrationRecord, Pass, Precision, PrecisionPolicy};
417
418    // FPGA / ASIC SystemVerilog export (feature `fpga`)
419    #[cfg(feature = "fpga")]
420    pub use crate::{
421        ExportOptions, ExportQuantMode, ExportSession, ExportTarget, ExportedArtifacts,
422        FpgaExportConfig, GraphIoBind, HwTarget, InputIface, IoConfig, OutputIface, OutputKind,
423        PortNames, SidebandSpec, export_graph, export_tinyconv_mnist, tinyconv_mnist_from_cortexm,
424    };
425
426    // 3D Gaussian splatting (`rlx-splat` — call `register()` once per process)
427    #[cfg(feature = "splat")]
428    pub use crate::splat::{
429        gaussian_splat_render_common_ir, gaussian_splat_render_decomposed,
430        gaussian_splat_render_reference, register,
431    };
432    #[cfg(feature = "splat")]
433    pub use rlx_ir::ops::splat::{
434        GaussianSplatInputs, GaussianSplatRenderParams, gaussian_splat_prep_packed_len,
435        gaussian_splat_tile_count,
436    };
437    #[cfg(feature = "splat")]
438    pub use rlx_splat::prep_layout::{prep_packed_len, tile_count};
439}
440
/// Register optional custom backends and companion custom-op crates.
///
/// Builtins (CPU, Metal, CUDA, …) register automatically on first
/// [`Session`] use. Call this at process startup when you ship extra
/// backends or custom-op libraries:
///
/// ```ignore
/// rlx::register_backends! {
///     splat => rlx::splat::register,
///     sparse => rlx::sparse::register,
/// }
/// ```
///
/// Each entry has the form `label => path`:
///
/// - `label` is an identifier kept purely for readability at the call
///   site; it does not appear in the expansion.
/// - `path` is called with no arguments. Entries run in the order
///   written, and any return value is discarded.
///
/// A trailing comma is accepted. An empty invocation expands to nothing.
///
/// A non-empty invocation expands to a single block, so it is valid in
/// expression position (a `match` arm, a `let` initializer) as well as in
/// statement position.
#[macro_export]
macro_rules! register_backends {
    // Nothing to register.
    () => {};
    // One call per entry, wrapped in a block so the expansion is a single
    // expression however many entries are given.
    ( $( $name:ident => $register:path ),* $(,)? ) => {{
        $( $register(); )*
    }};
}
rlx/lib.rs

rlx/
lib.rs