rlx_runtime/
lib.rs

1// RLX — versatile ML compiler + runtime.
2// Copyright (C) 2026 Eugene Hauptmann, Nataliya Kosmyna.
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, version 3.
7//
8// This program is distributed in the hope that it will be useful,
9// but WITHOUT ANY WARRANTY; without even the implied warranty of
10// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11// GNU General Public License for more details.
12//
13// You should have received a copy of the GNU General Public License
14// along with this program. If not, see <https://www.gnu.org/licenses/>.
15
16//! RLX Runtime — the user-facing API.
17//!
18//! Provides a unified [`Session`] that compiles and executes IR graphs
19//! on the selected backend. Backend selection is via Cargo features:
20//!
21//! ```toml
22//! [dependencies]
23//! rlx-runtime = { version = "0.1", features = ["cpu"] }                # CPU (default)
24//! rlx-runtime = { version = "0.1", features = ["blas-accelerate"] }    # CPU + Apple Accelerate
25//! rlx-runtime = { version = "0.1", features = ["blas-mkl"] }           # CPU + Intel MKL
26//! # rlx-runtime = { version = "0.1", features = ["gpu"] }             # GPU via wgpu
27//! # rlx-runtime = { version = "0.1", features = ["cuda"] }            # GPU via CUDA
28//! ```
29//!
30//! # Example
31//! ```rust,no_run
32//! use rlx_runtime::*;
33//! use rlx_ir::*;
34//!
35//! // Build a graph
36//! let mut g = Graph::new("example");
37//! let x = g.input("x", Shape::new(&[2, 4], DType::F32));
38//! let w = g.param("w", Shape::new(&[4, 3], DType::F32));
39//! let b = g.param("b", Shape::new(&[3], DType::F32));
40//! let mm = g.matmul(x, w, Shape::new(&[2, 3], DType::F32));
41//! let out = g.binary(op::BinaryOp::Add, mm, b, Shape::new(&[2, 3], DType::F32));
42//! g.set_outputs(vec![out]);
43//!
44//! // Compile and execute
45//! let session = Session::new(Device::Cpu);
46//! let mut compiled = session.compile(g);
47//! compiled.set_param("w", &[1.0f32; 12]);
48//! compiled.set_param("b", &[0.0f32; 3]);
49//! let result = compiled.run(&[("x", &[1.0f32; 8])]);
50//! ```
51
52// Driver-layer concerns (device, arena, handle, stream, buffer)
53// live in rlx-driver as of plan #58; they are re-exported after the
54// module list below so existing callers compile unchanged.
55pub mod aot_cache;
56pub mod attn_mask;
57pub mod backend;
58pub mod compile_cache;
59pub mod compiled;
60pub mod cost;
61pub mod device_ext;
62pub mod expert_pool;
63pub mod jacfwd;
64pub mod kernel_trace;
65pub mod kv_cache;
66pub mod lora_scheduler;
67pub mod memory_estimate;
68pub mod model_pipeline;
69pub mod moe_expert_store;
70pub mod op_registry;
71pub mod options;
72pub mod paged_kv;
73pub mod precision;
74pub mod record_replay;
75pub mod reflect;
76pub mod registry;
77pub mod router;
78pub mod session;
79pub mod stages;
80pub mod subgraph;
81pub mod trace;
82pub mod weight_registry;
83pub mod weights;
84pub mod worker_pool;
85/// PLAN L3 — Perfetto / chrome-trace JSON tracing. Lives in `rlx-ir`
86/// (alongside the `Tick` cycle counter it depends on) so every backend
87/// can instrument per-thunk without crate-dep gymnastics. Re-exported
88/// here so callers see one consistent `rlx_runtime::perfetto::TraceSpan`.
89pub use rlx_ir::perfetto;
90pub mod custom_ops;
91pub mod hwinfo;
92pub mod logit_verify;
93pub mod nan_check;
94pub mod phase;
95pub mod spec_decode;
96pub mod telemetry;
97pub mod validators;
98
99// Always-available now that serde is a non-optional dep + #32
100// router consumes the OpenAI-shaped structs unconditionally.
101pub mod mock_requests;
102
103// Driver-layer types — re-exported from rlx-driver (plan #58).
104pub use rlx_driver::{Buffer, BufferHandle, CommandStream, Device, DeviceArena, SyncStream};
105// Symmetric-memory primitives (plan #49) — foundation for #12.
106pub use rlx_driver::{
107    CollectiveError, LocalTransport, Rank, SymmetricBuffer, SymmetricHeap, SymmetricTransport,
108};
109// Crate-local re-exports; the collective ops (plan #12) are the `rlx_driver` re-export further down.
110pub use aot_cache::{AotCache, AotCacheError};
111pub use backend::{Backend, ExecutableGraph, compile_hir, compile_module};
112pub use compile_cache::{
113    BucketedCompileCache, CacheRunInput, CompileCache, DynamicDimCompileCache, pad_rows, slice_rows,
114};
115pub use compiled::CompiledGraph;
116#[cfg(feature = "apple")]
117pub use device_ext::available_apple_devices;
118pub use device_ext::{
119    available_devices, dispatch_report_for_device, dispatch_report_for_device_with_options,
120    first_unsupported_op, first_unsupported_op_with_options, full_name, is_available,
121    legalize_graph_for_device, legalize_graph_for_device_with_options,
122    legalize_graph_for_device_with_report, supports, supports_graph, supports_graph_with_options,
123};
124pub use expert_pool::{
125    ExpertPool, ExpertPoolConfig, ExpertPoolStats, ExpertRefreshPolicy, ExpertRefreshResult,
126    MoEExecMode, gpu_expert_budget_from_vram,
127};
128pub use kv_cache::LayerKvCache;
129pub use memory_estimate::{MoeOffloadEstimate, estimate_moe_offload};
130pub use model_pipeline::ModelCompilePipeline;
131pub use options::CompileOptions;
132pub use precision::Precision;
133pub use reflect::{ModelReflection, load_hir_template_with_extensions, specialize_entry};
134pub use registry::{BackendFactory, backend_for, register_backend, registered_devices};
135#[cfg(feature = "cpu")]
136pub use rlx_cpu::moe_residency::MoeResidencyStats;
137#[cfg(feature = "cpu")]
138pub use rlx_cpu::moe_topk_capture::MoeTopkCapture;
139pub use rlx_driver::{ReduceKind, all_gather, all_reduce, reduce_scatter};
140pub use rlx_ir::env::{self, RlxEnv, RuntimeOverrides};
141pub use session::Session;
142pub use stages::{
143    compile_graph_stages, compile_graph_stages_for_backend, compile_hir_stages,
144    compile_module_stages, fusion_target_for, graph_from_lir, maybe_log_fusion,
145    options_with_supported_ops, pipeline_for,
146};
147pub use subgraph::{SubgraphCache, run_if, run_while};
148
149pub use expert_pool::{merged_resident_mask, per_layer_resident_masks};
150pub use moe_expert_store::{ExpertStackF32, LayerMoeWeights, MoeExpertStore};
151pub use weight_registry::{WeightEntry, WeightHandle, WeightKind, WeightRegistry};
152pub use weights::{BytesWeightLoader, WeightLoader};
153
154// The cycle-accurate timing primitive (`Tick`) lives in rlx-ir (lowest crate),
155// as do the other items below; re-export them here so `rlx_runtime::Tick` etc.
156// work without forcing callers to add the IR crate as a direct dep.
157pub use rlx_ir::{AsyncCopy, BarrierToken, DoubleBuffer, SyncCopy};
158pub use rlx_ir::{CacheBuster, Tick, time_ns};
159
160// Re-export the `inspect_*` helpers (rlx-ir, rlx-opt) and the precision policy (rlx-opt) for convenience
161pub use rlx_ir::{
162    inspect_graph, inspect_hir, inspect_hir_stats, inspect_lir, inspect_mir, inspect_mir_stats,
163};
164pub use rlx_opt::{OpKind, PrecisionPolicy};
165pub use rlx_opt::{PipelineInspect, inspect_pipeline};
166
167// Re-export IR types for convenience
168pub use rlx_ir::logical_kernel::{KernelDispatchConfig, KernelDispatchPolicy};
169pub use rlx_ir::op;
170pub use rlx_ir::{
171    BindingManifest, CompilationMode, DType, Graph, HirExtensionFn, HirReflection, IoBindingEntry,
172    ManifestDiff, ModelComponent, ModelPhase, ModelVariant, Node, NodeId, Op, Shape, WeightBlock,
173    apply_hir_extensions, register_hir_extension, registered_hir_extensions,
174};
175
176// Re-export proc macros
177pub use rlx_macros::pipeline_schedule;
178pub use rlx_macros::rlx_model;
rlx_runtime/lib.rs

rlx_runtime/
lib.rs