Struct AppState

Source

pub struct AppState {
    pub queue: Sender<BatchRequest>,
    pub model_id: String,
    pub loaded_at: u64,
    pub default_sampler: SamplerConfig,
    pub vocab_bytes: Option<VocabBytes>,
    pub hidden_size: usize,
    pub metrics: Arc<Metrics>,
    pub batch_store: BatchStore,
    pub batch_disk_store: Arc<BatchStore>,
    pub batch_queue_tx: BatchQueueSender,
    pub model_pool: Mutex<ModelPool>,
    pub prefix_cache: Arc<Mutex<PrefixKvCache>>,
    pub loras: Arc<RwLock<HashMap<String, Arc<LoadedLora>>>>,
    pub threads_store: Option<Arc<ThreadStore>>,
    pub run_queue_tx: Option<RunQueueSender>,
    pub files_store: Option<Arc<FilesStore>>,
    pub run_event_tx_broadcast: Option<RunEventSender>,
    pub responses_store: Option<Arc<ResponseStore>>,
    pub per_key_rate_limiter: Option<Arc<PerKeyRateLimiter>>,
}

Expand description

Shared application state accessible by all route handlers.

All inference is delegated to the single background worker via the queue channel. Read-only metadata (model ID, default sampler, vocabulary, hidden size) is cached here so handlers never need to reach into the engine.

Fields§

§queue: Sender<BatchRequest>

Channel to send inference requests to the worker.

§model_id: String

The model name/identifier for API responses.

§loaded_at: u64

Unix timestamp (seconds) when the model was loaded.

§default_sampler: SamplerConfig

Default sampler configuration read from EngineConfig at startup.

Route handlers clone this and apply per-request overrides on top.

§vocab_bytes: Option<VocabBytes>

Vocabulary byte table used for grammar-constrained sampling.

None when the model has no tokenizer (should not happen at serve time).

§hidden_size: usize

Hidden-state dimension for the /v1/embeddings endpoint.

§metrics: Arc<Metrics>

Shared metrics store.

§batch_store: BatchStore

In-memory batch job registry (legacy OpenAI batch compat layer).

§batch_disk_store: Arc<BatchStore>

Disk-backed batch job store (C3: disk-spool backend).

§batch_queue_tx: BatchQueueSender

Sender into the disk-backed batch processing queue (C3).

§model_pool: Mutex<ModelPool>

Multi-model LRU warm-pool (C1).

Wrapped in Mutex so admin routes can mutate it without blocking the inference worker. In the current single-worker design the worker also holds the pool; admin mutations use try_lock to avoid deadlocks.

§prefix_cache: Arc<Mutex<PrefixKvCache>>

Prefix KV cache for system-prompt reuse across requests.

When a new request shares a long prefix with a previously-cached sequence (e.g. a fixed system prompt), the matching KV state is restored and only the suffix tokens need a fresh prefill pass.

§loras: Arc<RwLock<HashMap<String, Arc<LoadedLora>>>>

Loaded LoRA adapter registry: stable name → Arc<LoadedLora>.

Populated via POST /admin/loras. Request handlers look up adapters by name and pass them to the worker via BatchRequest::Generate.

§threads_store: Option<Arc<ThreadStore>>

Persistent thread/message/run store for the Assistants API.

None when the Assistants API has not been configured (no --threads-dir flag was passed at startup). Route handlers return 503 in this case.

§run_queue_tx: Option<RunQueueSender>

Sender into the run processing queue for the Assistants API.

None when threads_store is None.

§files_store: Option<Arc<FilesStore>>

Persistent files store for the Files API (/v1/files).

None when the Files API has not been configured.

§run_event_tx_broadcast: Option<RunEventSender>

Broadcast sender for run lifecycle events (SSE streaming).

None when SSE streaming is not enabled.

§responses_store: Option<Arc<ResponseStore>>

In-memory store for Responses API objects.

None when the Responses API has not been enabled. Route handlers return 503 (ModelNotReady) in this case.

§per_key_rate_limiter: Option<Arc<PerKeyRateLimiter>>

Per-API-key token-bucket rate limiter.

None when per-key rate limiting has not been configured.

Struct AppState

Fields§

Implementations§

impl AppState

pub fn new( queue: Sender<BatchRequest>, model_id: String, default_sampler: SamplerConfig, vocab_bytes: Option<VocabBytes>, hidden_size: usize, ) -> Self

pub fn with_threads(self, store: Arc<ThreadStore>, tx: RunQueueSender) -> Self

pub fn with_files(self, store: Arc<FilesStore>) -> Self

pub fn with_run_event_sender(self, tx: RunEventSender) -> Self

pub fn with_responses_store(self, store: Arc<ResponseStore>) -> Self

pub fn with_per_key_rate_limiter(self, limiter: Arc<PerKeyRateLimiter>) -> Self

pub fn with_batch_pipeline( queue: Sender<BatchRequest>, model_id: String, default_sampler: SamplerConfig, vocab_bytes: Option<VocabBytes>, hidden_size: usize, batch_disk_store: Arc<DiskBatchStore>, batch_queue_tx: BatchQueueSender, ) -> Self

Auto Trait Implementations§

impl !Freeze for AppState

impl !RefUnwindSafe for AppState

impl Send for AppState

impl Sync for AppState

impl Unpin for AppState

impl UnsafeUnpin for AppState

impl !UnwindSafe for AppState

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> From<T> for T

fn from(t: T) -> T

impl<T> Instrument for T

fn instrument(self, span: Span) -> Instrumented<Self>

fn in_current_span(self) -> Instrumented<Self>

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self> where F: FnOnce(&Self) -> bool,

impl<T> Pointable for T

const ALIGN: usize

type Init = T

unsafe fn init(init: <T as Pointable>::Init) -> usize

unsafe fn deref<'a>(ptr: usize) -> &'a T

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

unsafe fn drop(ptr: usize)

impl<T> Same for T

type Output = T

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

impl<V, T> VZip<V> for T where V: MultiLane<T>,

fn vzip(self) -> V

impl<T> WithSubscriber for T

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self> where S: Into<Dispatch>,

fn with_current_subscriber(self) -> WithDispatch<Self>

impl<A, B, T> HttpServerConnExec<A, B> for T where B: Body,

Struct AppState

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T, U> Into<U> for T
where U: From<T>,

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

impl<A, B, T> HttpServerConnExec<A, B> for T
where B: Body,