Skip to main content

atomr_infer_pipeline/
lib.rs

1//! # inference-pipeline
2//!
3//! `atomr-streams` integration for inference graphs (doc §9), plus a
4//! re-export shim over `atomr-accel-patterns` so callers get the
5//! upstream universal-GPU blueprints (batching, cascade, replica pool,
6//! fair-share scheduler, hot-swap, MoE router, speculative decoder)
7//! without taking a second dependency.
8//!
9//! The patterns are runtime-agnostic: they accept user-supplied
10//! closures / trait impls as the backend, so an inference deployment
11//! plugs in by handing them a closure that calls into a
12//! `Box<dyn ModelRunner>`. That avoids reimplementing any of the
13//! patterns locally — they're the §9 building blocks the doc names.
14//!
15//! Re-exports are gated behind the `cuda-patterns` feature so
16//! `inference --features remote-only` builds don't pull `cudarc`.
17
18#![forbid(unsafe_code)]
19#![deny(rust_2018_idioms)]
20
21use atomr_streams::Source;
22use tokio::sync::mpsc;
23
24use atomr_infer_core::batch::ExecuteBatch;
25
26/// Adapter — accept a `tokio::mpsc` receiver and emit it as a stream
27/// `Source`. The caller owns the sender and is responsible for closing
28/// it to terminate the stream.
29pub fn request_source(rx: mpsc::UnboundedReceiver<ExecuteBatch>) -> Source<ExecuteBatch> {
30    Source::from_receiver(rx)
31}
32
/// Glob re-export of the upstream `atomr-accel-patterns` crate.
///
/// Callers reach the upstream items through this crate (for example
/// `atomr_infer_pipeline::patterns::DynamicBatchingServer`) instead of
/// listing `atomr-accel-patterns` in their own workspace dependencies.
///
/// Only compiled when the `cuda-patterns` feature is enabled.
///
/// Building blocks for composing §9-shaped graphs:
/// - `patterns::batching::DynamicBatchingServer` — accumulate
///   `ExecuteBatch`es up to a size/time bound, then dispatch them as a
///   single `ModelRunner::execute` call.
/// - `patterns::cascade::InferenceCascade` — early-exit routing behind a
///   confidence gate (cheap classifier → escalation, doc §9.1).
/// - `patterns::replica_pool::ModelReplicaPool` — round-robin /
///   least-loaded routing across N replicas.
/// - `patterns::scheduler::FairShareScheduler` — WFQ tenant scheduling.
/// - `patterns::hot_swap::ModelHotSwapServer` — live model replacement
///   (doc §7.5 canary / hot-swap).
/// - `patterns::speculative::SpeculativeDecoder` — draft + verifier pair.
/// - `patterns::moe::MoeRouter` — mixture-of-experts gating.
#[cfg(feature = "cuda-patterns")]
pub mod patterns {
    // Leading `::` pins the path to the external crate, so nothing brought
    // into this module by the glob can shadow the crate name.
    pub use ::atomr_accel_patterns::*;
}
56
/// Reference hybrid-graph descriptor.
///
/// Pure metadata: it only records four deployment names. Instantiation
/// lives in caller code (the `examples/remote_only_demo` crate exercises
/// one path). When the `cuda-patterns` feature is on, callers turn the
/// descriptor into an `InferenceCascade` by handing each deployment name to
/// a `CascadeStage` whose closure looks the `ActorRef` up in the cluster.
///
/// The type is plain data, so it derives the common traits: it can be
/// logged with `{:?}`, cloned, and compared for equality.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HybridGraph {
    /// Deployment name for the local classify stage.
    pub local_classify_deployment: String,
    /// Deployment name for the local executor stage.
    pub local_executor_deployment: String,
    /// Deployment name for the remote planner stage.
    pub remote_planner_deployment: String,
    /// Deployment name for the remote fallback stage.
    pub remote_fallback_deployment: String,
}
69
70impl HybridGraph {
71    pub fn new(
72        local_classify: impl Into<String>,
73        local_executor: impl Into<String>,
74        remote_planner: impl Into<String>,
75        remote_fallback: impl Into<String>,
76    ) -> std::sync::Arc<Self> {
77        std::sync::Arc::new(Self {
78            local_classify_deployment: local_classify.into(),
79            local_executor_deployment: local_executor.into(),
80            remote_planner_deployment: remote_planner.into(),
81            remote_fallback_deployment: remote_fallback.into(),
82        })
83    }
84}