atomr_infer_pipeline/lib.rs
1//! # inference-pipeline
2//!
3//! `atomr-streams` integration for inference graphs (doc §9), plus a
4//! re-export shim over `atomr-accel-patterns` so callers get the
5//! upstream universal-GPU blueprints (batching, cascade, replica pool,
6//! fair-share scheduler, hot-swap, MoE router, speculative decoder)
7//! without taking a second dependency.
8//!
9//! The patterns are runtime-agnostic: they accept user-supplied
10//! closures / trait impls as the backend, so an inference deployment
11//! plugs in by handing them a closure that calls into a
12//! `Box<dyn ModelRunner>`. That avoids reimplementing any of the
13//! patterns locally — they're the §9 building blocks the doc names.
14//!
15//! Re-exports are gated behind the `cuda-patterns` feature so
16//! `inference --features remote-only` builds don't pull `cudarc`.
17
18#![forbid(unsafe_code)]
19#![deny(rust_2018_idioms)]
20
21use atomr_streams::Source;
22use tokio::sync::mpsc;
23
24use atomr_infer_core::batch::ExecuteBatch;
25
26/// Adapter — accept a `tokio::mpsc` receiver and emit it as a stream
27/// `Source`. The caller owns the sender and is responsible for closing
28/// it to terminate the stream.
29pub fn request_source(rx: mpsc::UnboundedReceiver<ExecuteBatch>) -> Source<ExecuteBatch> {
30 Source::from_receiver(rx)
31}
32
/// Glob re-export of the upstream `atomr-accel-patterns` crate.
///
/// Lets callers name e.g.
/// `atomr_infer_pipeline::patterns::DynamicBatchingServer` without adding
/// `atomr-accel-patterns` to their own workspace dependencies. Only
/// compiled when the `cuda-patterns` feature is enabled.
///
/// Building blocks for composing §9-shaped graphs:
///
/// - `patterns::batching::DynamicBatchingServer` — accumulates
///   `ExecuteBatch`es up to a size/time bound, then dispatches them as a
///   single `ModelRunner::execute` call.
/// - `patterns::cascade::InferenceCascade` — early-exit routing behind a
///   confidence gate (cheap classifier → escalation, doc §9.1).
/// - `patterns::replica_pool::ModelReplicaPool` — round-robin /
///   least-loaded routing across N replicas.
/// - `patterns::scheduler::FairShareScheduler` — WFQ tenant scheduling.
/// - `patterns::hot_swap::ModelHotSwapServer` — live model replacement
///   (doc §7.5 canary / hot-swap).
/// - `patterns::speculative::SpeculativeDecoder` — draft + verifier pair.
/// - `patterns::moe::MoeRouter` — mixture-of-experts gating.
#[cfg(feature = "cuda-patterns")]
pub mod patterns {
    // Leading `::` pins the path to the external crate rather than any
    // same-named item that might be in scope.
    pub use ::atomr_accel_patterns::*;
}
56
/// Reference hybrid-graph descriptor.
///
/// Pure metadata: the struct only carries four deployment names, and the
/// instantiation lives in caller code (the `examples/remote_only_demo`
/// crate exercises one path). When the `cuda-patterns` feature is on,
/// callers turn the descriptor into an `InferenceCascade` by handing each
/// deployment name to a `CascadeStage` whose closure looks the `ActorRef`
/// up in the cluster.
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so descriptors can be
/// logged, duplicated and compared in tests and configuration checks.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HybridGraph {
    /// Deployment name used for the local classify stage.
    pub local_classify_deployment: String,
    /// Deployment name used for the local executor stage.
    pub local_executor_deployment: String,
    /// Deployment name used for the remote planner stage.
    pub remote_planner_deployment: String,
    /// Deployment name used for the remote fallback stage.
    pub remote_fallback_deployment: String,
}
69
70impl HybridGraph {
71 pub fn new(
72 local_classify: impl Into<String>,
73 local_executor: impl Into<String>,
74 remote_planner: impl Into<String>,
75 remote_fallback: impl Into<String>,
76 ) -> std::sync::Arc<Self> {
77 std::sync::Arc::new(Self {
78 local_classify_deployment: local_classify.into(),
79 local_executor_deployment: local_executor.into(),
80 remote_planner_deployment: remote_planner.into(),
81 remote_fallback_deployment: remote_fallback.into(),
82 })
83 }
84}