Crate rlx_runtime

Expand description

RLX Runtime — the user-facing API.

Provides a unified Session that compiles and executes IR graphs on the selected backend. Backend selection is via Cargo features:

[dependencies]
rlx-runtime = { version = "0.1", features = ["cpu"] }                # CPU (default)
rlx-runtime = { version = "0.1", features = ["blas-accelerate"] }    # CPU + Apple Accelerate
rlx-runtime = { version = "0.1", features = ["blas-mkl"] }           # CPU + Intel MKL
# rlx-runtime = { version = "0.1", features = ["gpu"] }             # GPU via wgpu
# rlx-runtime = { version = "0.1", features = ["cuda"] }            # GPU via CUDA

§Example

use rlx_runtime::*;
use rlx_ir::*;

// Build a graph
let mut g = Graph::new("example");
let x = g.input("x", Shape::new(&[2, 4], DType::F32));
let w = g.param("w", Shape::new(&[4, 3], DType::F32));
let b = g.param("b", Shape::new(&[3], DType::F32));
let mm = g.matmul(x, w, Shape::new(&[2, 3], DType::F32));
let out = g.binary(op::BinaryOp::Add, mm, b, Shape::new(&[2, 3], DType::F32));
g.set_outputs(vec![out]);

// Compile and execute
let session = Session::new(Device::Cpu);
let mut compiled = session.compile(g);
compiled.set_param("w", &[1.0f32; 12]);
compiled.set_param("b", &[0.0f32; 3]);
let result = compiled.run(&[("x", &[1.0f32; 8])]);

Re-exports§

pub use aot_cache::AotCache;
pub use aot_cache::AotCacheError;
pub use backend::Backend;
pub use backend::ExecutableGraph;
pub use backend::compile_hir;
pub use backend::compile_module;
pub use backends_manifest::BackendsManifest;
pub use compile_cache::BucketedCompileCache;
pub use compile_cache::CacheRunInput;
pub use compile_cache::CompileCache;
pub use compile_cache::DynamicDimCompileCache;
pub use compile_cache::pad_rows;
pub use compile_cache::slice_rows;
pub use compile_config::COMPILE_OUTPUT_CAP_ENV;
pub use compile_config::COMPILE_OUTPUT_CAP_ENV_MLX;
pub use compile_config::DEFAULT_COMPILE_OUTPUT_CAP;
pub use compile_config::compile_output_cap;
pub use compile_config::device_has_compile_output_cap;
pub use compile_config::reset_compile_output_cap;
pub use compile_config::set_compile_output_cap;
pub use compiled::CompiledGraph;
pub use cost::fastest_device_for;
pub use device_bench::DeviceBenchResult;
pub use device_bench::benchmark_devices;
pub use device_bench::warm_all;
pub use device_ext::available_devices;
pub use device_ext::devices_for;
pub use device_ext::dispatch_report_for_device;
pub use device_ext::dispatch_report_for_device_with_options;
pub use device_ext::fastest_device;
pub use device_ext::first_unsupported_op;
pub use device_ext::first_unsupported_op_with_options;
pub use device_ext::full_name;
pub use device_ext::is_available;
pub use device_ext::legalize_graph_for_device;
pub use device_ext::legalize_graph_for_device_with_options;
pub use device_ext::legalize_graph_for_device_with_report;
pub use device_ext::supports;
pub use device_ext::supports_graph;
pub use device_ext::supports_graph_with_options;
pub use device_ext::supports_run_slots;
pub use device_ext::trim_accelerator_arena_pool;
pub use device_parse::ParseDeviceError;
pub use device_parse::device_label;
pub use device_parse::parse_device;
pub use device_parse::parse_device_list;
pub use device_policy::DeviceCandidate;
pub use device_policy::DeviceFallbackError;
pub use device_policy::DevicePickStrategy;
pub use device_policy::DevicePolicy;
pub use device_policy::device_chain_from_env;
pub use device_policy::device_chain_from_env_key;
pub use device_policy::device_from_env;
pub use device_policy::device_from_env_key;
pub use device_policy::device_report;
pub use device_policy::devices_for_with_policy;
pub use device_policy::resolve_device;
pub use device_policy::resolve_device_chain;
pub use device_policy::run_with_fallback;
pub use device_router::DeviceRouter;
pub use expert_pool::ExpertPool;
pub use expert_pool::ExpertPoolConfig;
pub use expert_pool::ExpertPoolStats;
pub use expert_pool::ExpertRefreshPolicy;
pub use expert_pool::ExpertRefreshResult;
pub use expert_pool::MoEExecMode;
pub use expert_pool::gpu_expert_budget_from_vram;
pub use flexible_session::FlexibleSession;
pub use graph_devices::GraphDevices;
pub use graph_devices::graph_param_names;
pub use kv_cache::LayerKvCache;
pub use lm::ConfigSource;
pub use lm::LmRunner;
pub use lm::LmRunnerBuilder;
pub use lm::MirostatMode;
pub use lm::ModelRegistration;
pub use lm::PACKED_GGUF_AUTO_THRESHOLD_BYTES;
pub use lm::SampleOpts;
pub use lm::WeightFormat;
pub use lm::auto_runner_name;
pub use lm::registered_models;
pub use memory_estimate::DEFAULT_SOFT_MEMORY_FRACTION;
pub use memory_estimate::MoeOffloadEstimate;
pub use memory_estimate::available_unified_memory;
pub use memory_estimate::estimate_moe_offload;
pub use memory_estimate::llama_decode_bucket_compile_peak_bytes;
pub use memory_estimate::llama_decode_bucket_resident_bytes;
pub use memory_estimate::llama_decode_oneshot_compile_peak_bytes;
pub use memory_estimate::memory_headroom_bytes;
pub use memory_estimate::process_rss_bytes;
pub use memory_estimate::soft_memory_budget_bytes;
pub use memory_estimate::soft_memory_fraction;
pub use memory_estimate::would_exceed_soft_budget;
pub use model_pipeline::ModelCompilePipeline;
pub use options::CompileOptions;
pub use precision::Precision;
pub use reflect::ModelReflection;
pub use reflect::load_hir_template_with_extensions;
pub use reflect::specialize_entry;
pub use registry::BackendFactory;
pub use registry::backend_for;
pub use registry::register_backend;
pub use registry::registered_devices;
pub use session::Session;
pub use stages::compile_graph_stages;
pub use stages::compile_graph_stages_for_backend;
pub use stages::compile_hir_stages;
pub use stages::compile_module_stages;
pub use stages::fusion_target_for;
pub use stages::graph_from_lir;
pub use stages::maybe_log_fusion;
pub use stages::options_with_supported_ops;
pub use stages::pipeline_for;
pub use subgraph::SubgraphCache;
pub use subgraph::run_if;
pub use subgraph::run_while;
pub use expert_pool::merged_resident_mask;
pub use expert_pool::per_layer_resident_masks;
pub use moe_expert_store::ExpertStackF32;
pub use moe_expert_store::LayerMoeWeights;
pub use moe_expert_store::MoeExpertStore;
pub use weight_registry::WeightEntry;
pub use weight_registry::WeightHandle;
pub use weight_registry::WeightKind;
pub use weight_registry::WeightRegistry;
pub use weights::BytesWeightLoader;
pub use weights::WeightLoader;

Modules§

aot_cache: AOT cache — persist optimized LIR modules and reload for backend compile.
attn_mask: Attention-mask helpers for bucketed decode (pad-to-upper, slice-back).
backend: Backend trait — abstraction over CPU/GPU/CUDA execution.
backends_manifest: Compile-time backend manifest — which Cargo features were enabled.
compile_cache: Shape-bucketed compile cache.
compile_config: Compile-time limits shared across backends (training backward output slots, …).
compiled: Compiled graph — the hot-path execution object.
cost: Cross-backend cost interface.
custom_ops: Custom-op extensibility (plan #25).
device_bench: Warm-all and micro-benchmark helpers for backend ranking.
device_ext: Engine-layer extensions for rlx_driver::Device (plan #58).
device_parse: String identifiers for rlx_driver::Device — config files, CLI, env vars.
device_policy: Backend allowlists, env-driven defaults, and selection introspection.
device_router: Serving-oriented wrapper: warm-all backends, fallback execution, throttle-aware re-warm.
dist: Ship-graph distributed execution — build the worker once, run any model.
env: Unified RLX_* configuration — readable from code overrides or process env.
expert_pool: MoE expert residency pool (TIDE-style predictive offload).
flexible_session: Session that defers backend choice until compile time.
graph_devices: Multi-backend execution — compile once per device, run on any of them.
graph_io: Static IO / sync profile for compiled graphs (Phase 0 — fusion planning).
hwinfo: Hardware introspection (plan #47).
jacfwd: Forward-mode Jacobian materialization.
kernel_trace: Compile-time gated kernel tracing (plan #7).
kv_cache: Per-layer K/V cache for autoregressive decode (Whisper, Qwen, Gemma, …).
lm: Generic language-model runner trait and shared builder.
logit_verify: Logit / output verification (plan #61).
lora_scheduler: LoRA-aware request scheduling (plan #33).
memory_estimate: Pre-load memory estimation (plan #35).
mock_requests: Mock request payloads for tests (plan #64).
model_pipeline: Three-step model compile pipeline (template → specialize → backend).
moe_expert_store: Per-expert F32 weight slabs for MoE offload (TIDE-style migration source).
nan_check: NaN/inf check epilogue (plan #18).
onnx_active: Per-infer active token count — forwarded to CPU ONNX control-flow kernels.
op: Operation types — every tensor op in the RLX IR.
op_registry: Op registry — re-exported from rlx-ir.
options: Unified compile options.
paged_kv: Paged KV cache + continuous batching (plan #31).
perfetto: PLAN L3 — Perfetto / chrome-trace JSON tracing for cross-backend timeline capture. Lives in rlx-ir (alongside the Tick cycle counter it depends on) so every backend can instrument per-thunk without crate-dep gymnastics. Re-exported here so callers see one consistent rlx_runtime::perfetto::TraceSpan.
phase: Phase-aware streaming inference (plan #16).
precision: Precision selection for graph execution.
precompile: Shared graph cleanup before the fusion / backend pipeline.
quantized_kv: Block-quantized K/V cache — store decode-time history as q8_0, q4_0, or q5_0 GGUF-encoded blocks instead of f32/f16. See the module documentation for the approximate memory savings versus f16.
record_replay: Record/replay middleware (plan #63).
reflect: Model reflection services (Slang compiler/runtime API §5).
registry: Backend registry — a single registration point for all backends.
router: Multi-protocol request router (plan #32).
samplers: Advanced token samplers — the long-tail of llama.cpp samplers ported to a backend-agnostic Sampler chain.
session: Session — the main entry point for compiling and executing graphs.
spec_decode: Speculative decoding scheduling pattern (plan #34).
stages: Shared HIR → MIR → LIR compile stages for runtime backends.
subgraph: Sub-graph execution helper.
telemetry: Telemetry primitives (plan #65).
trace: Tracing API — build IR graphs by recording operations on traced tensors.
validators: Composable request validators (plan #84).
weight_registry: Named weights registry (plan #24).
weights: Weight-loading abstraction.
worker_pool: Worker pool with isolation primitives (plan #36).

Macros§

ktrace: Compile-time gated kernel trace. Expands to a no-op call without the kernel-trace feature; the optimizer removes it entirely.
pipeline_schedule: Compile-time pipeline scheduler (plan #11). See pipeline_schedule_impl in this crate’s private pipeline module for the full grammar.

Structs§

BarrierToken: Opaque ticket returned by AsyncCopy::issue. Pass it back to AsyncCopy::wait to block until the corresponding copy is done. Tokens are scoped to one engine — don’t pass them between engines.
BindingManifest: Full I/O + parameter manifest for a compiled graph.
Buffer: A buffer that knows where its bytes live.
BufferHandle: External, persistent buffer reference. Created once, bound at compile, carried across many compiled.run() invocations.
CacheBuster: Cache-busting buffer — sized to evict L1+L2 on Apple Silicon (M-series: 192 KB L1d / core, 16 MB L2 shared per cluster). Borrowed from MAX’s internal_utils/_cache_busting.mojo (#19).
DoubleBuffer: Two-buffer ring. current() is what compute reads this step; next_mut() is where the next async copy should land. Call swap() after waiting on the current copy to advance.
Graph: A computation graph — the core IR data structure.
HirReflection: Introspection of an unspecialized HirModule (loadModule analogue).
IoBindingEntry: One named graph boundary with arena layout after buffer planning.
KernelDispatchConfig: Per-compile overrides on top of KernelDispatchPolicy.
LocalTransport: Single-machine in-process transport. All num_ranks “ranks” share one SymmetricHeap instance, so put / get are just locks + memcpy. Useful for unit tests and for algorithm-correctness checking of collective ops without a real cluster.
ManifestDiff: Compare template vs specialized manifests (dims / arena may differ).
ModelComponent: Full specialization + binding bundle (Slang shader-component analogue).
ModelVariant: Concrete shape bucket for compile-once / specialize-at-runtime workflows.
MoeResidencyStats
MoeTopkCapture: Shared capture buffer — one entry per MoE router TopK in schedule order.
NetTransport: Full-mesh TCP transport implementing both Transport (two-sided send/recv) and SymmetricTransport (one-sided put/get/barrier). Construct via TcpTransport::bind or ThunderboltTransport::bind.
Node: A single node in the computation graph.
NodeId: Stable identifier for a node in the graph. Indices are never reused.
PipelineInspect: Text dump of each compiler pipeline stage.
ProcessGroup: A handle to the collective-communication world: a rank, a size, and the transport that connects them.
Rank: Identifier for a participant in a collective. Ranks are 0..num_ranks and stay stable for the lifetime of a transport.
RlxEnv: Bulk builder for code-side RLX_* overrides.
RngOptions: Compile-time / execute-time RNG policy for graphs containing random ops.
RuntimeOverrides: RAII guard: installs overrides on construction, restores previous values on drop.
ScaledQuantConfig: Native low-precision GEMM config for CompileOptions::scaled_quant. Specifies which element formats and scale layout matmul operands are quantized to.
Shape: Tensor shape: ordered list of dimensions + element type.
SymmetricBuffer: (rank, offset, len) view into a symmetric heap. The same (offset, len) pair is valid on every rank — that’s what “symmetric” means.
SymmetricHeap: Per-rank symmetric memory: a Vec<u8> per rank, all the same size. Owned by the LocalTransport.
SyncCopy: CPU “async” copy — actually synchronous. issue() does a memcpy immediately and returns a fresh token; wait() is a no-op. Useful as the test fixture and for code paths that don’t actually need overlap.
SyncStream: Default implementation for synchronous backends — work has already happened by the time submit is called.
TcpTransport: Portable TCP transport. Works over any reachable IP — Ethernet or the macOS Thunderbolt Bridge link. This is the baseline that pipeline- and tensor-parallel inference run on before any fast path.
ThunderboltTransport: Thunderbolt transport. Same wire protocol as TcpTransport today (TCP over the Thunderbolt Bridge IP link), but a distinct type so the symmetric one-sided heap path is the intended interface and so a future zero-copy Thunderbolt DMA backend can replace the engine behind the unchanged Transport + SymmetricTransport traits.
Tick: Opaque tick reading. Subtract two of these to get a Duration.
WeightBlock: Nested parameter block (Slang PerFrame / material grouping).

Enums§

CollectiveError
CompilationMode: When the backend executable is produced relative to the host loop.
DType: Scalar element type. Matches hardware-supported types.
Device: Target device for graph execution.
KernelDispatchPolicy: When to use native backend kernels vs the shared IR common body.
ModelPhase: Coarse execution phase (prefill vs decode vs encoder).
Op: An operation in the RLX IR graph.
OpKind: High-level op categorization for precision policies.
PrecisionPolicy: Declarative precision policy for graph compilation.
ReduceKind: Element-wise reduction operator for collectives.
RngBackend: Which RNG implementation to use for Op::RngNormal / Op::RngUniform.

Constants§

DEFAULT_HEAP_BYTES: Default symmetric-heap size per rank (bytes) when a constructor is not given one. 64 MiB comfortably holds a single transformer hidden state at typical batch×seq×d_model for collective scratch.
DEFAULT_MLX_COMPILE_OUTPUT_CAP: Alias for DEFAULT_COMPILE_OUTPUT_CAP.
MLX_COMPILE_OUTPUT_CAP_ENV: Alias for COMPILE_OUTPUT_CAP_ENV.

Traits§

AsyncCopy: Pluggable async-copy engine. Backends (SyncCopy for CPU, future MetalBlitCopy for GPU) implement this.
CommandStream: Per-backend command stream.
DeviceArena: Per-backend arena interface.
SymmetricTransport: One-sided operation surface. put(buf, src) writes src into buf.rank’s memory at buf.offset; get(buf, dst) reads from buf.rank’s memory into dst. Both calls block until completion (a future async impl can return a future).
Transport: Two-sided, tagged, byte-oriented point-to-point transport between world_size ranks.

Functions§

all_gather: AllGather: every rank ends up with the concatenation of all per-rank local slices, in rank order.
all_reduce: AllReduce: every rank ends up with op({values from every rank}).
apply_hir_extensions: Apply all registered extensions in order.
inspect_graph: Annotated graph dump (MIR body). Alias for pretty_print.
inspect_hir: Annotated HIR module dump.
inspect_hir_stats: One-line HIR summary (header + op histogram).
inspect_lir: Annotated LIR dump: optimized MIR + buffer plan + schedule.
inspect_mir: Annotated MIR module dump (optimized tensor DAG).
inspect_mir_stats: One-line MIR summary.
inspect_pipeline: Inspect every lowering stage for hir through pipeline.
mlx_compile_output_cap: Alias for compile_output_cap.
reduce_scatter: ReduceScatter: equivalent to AllReduce followed by partition — every rank ends up with one chunk_size-element slice of the reduced result. Rank r gets element indices [r*chunk_size, (r+1)*chunk_size).
register_hir_extension: Register a named extension (call from init or model crate startup).
registered_hir_extensions: Registered extension names in registration order.
reset_mlx_compile_output_cap: Alias for reset_compile_output_cap.
set_mlx_compile_output_cap: Alias for set_compile_output_cap.
time_ns: Time f, returning (result, elapsed_ns). Inlined so the surrounding loop can keep the closure body in registers.

Type Aliases§

HirExtensionFn: Transform applied after model flow build, before MIR lower.

Attribute Macros§

rlx_model: AOT compilation macro for RLX models.

Crate rlx_runtime

Crate rlx_runtime Copy item path

§Example

Re-exports§

Modules§

Macros§

Structs§

Enums§

Constants§

Traits§

Functions§

Type Aliases§

Attribute Macros§

Crate rlx_runtime