// inference_runtime_litellm/lib.rs
//! # inference-runtime-litellm
//!
//! Thin LiteLLM-proxy adapter on top of `inference-runtime-openai`.
//! LiteLLM exposes an OpenAI-compatible HTTP surface fronting any
//! backend (OpenAI, Anthropic, Bedrock, Azure, …) and applies its own
//! caching / fallback / retry policies. Doc §10.3.
//!
//! The `LiteLlmRunner` is a newtype around `OpenAiRunner` that:
//! - points at the LiteLLM proxy URL instead of `api.openai.com`,
//! - lowers `max_retries` (LiteLLM does its own retries; we want fast
//!   fail-through),
//! - preserves `runtime_kind() == LiteLlm` so dashboards and routing
//!   can distinguish "via LiteLLM" from "direct to OpenAI".
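//!
//! # Example
//!
//! A sketch of typical wiring, assuming a LiteLLM proxy on
//! `localhost:4000`. `map_secret` is a hypothetical helper that
//! converts this crate's `SecretRef` into the openai crate's
//! equivalent type; it is not part of this crate.
//!
//! ```ignore
//! use std::sync::Arc;
//! use arc_swap::ArcSwap;
//! use inference_remote_core::session::SessionSnapshot;
//! use inference_runtime_litellm::{LiteLlmConfig, LiteLlmRunner};
//!
//! let config: LiteLlmConfig = serde_json::from_str(r#"{
//!     "endpoint": "http://localhost:4000",
//!     "api_key": { "from": "env", "name": "LITELLM_API_KEY" }
//! }"#)?;
//!
//! let openai_secret = map_secret(config.api_key.clone());
//! let openai_config = config.into_openai(openai_secret);
//!
//! let session = Arc::new(ArcSwap::from_pointee(SessionSnapshot::default()));
//! let mut runner = LiteLlmRunner::new(openai_config, session)?;
//! ```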

#![forbid(unsafe_code)]
#![deny(rust_2018_idioms)]

use std::sync::Arc;

use arc_swap::ArcSwap;
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use url::Url;

use inference_core::batch::ExecuteBatch;
use inference_core::deployment::{RateLimits, RetryPolicy, Timeouts};
use inference_core::error::InferenceResult;
use inference_core::runner::{ModelRunner, RunHandle, SessionRebuildCause};
use inference_core::runtime::{CircuitBreakerConfig, ProviderKind, RuntimeKind, TransportKind};

use inference_remote_core::session::SessionSnapshot;
use inference_runtime_openai::{OpenAiConfig, OpenAiRunner, OpenAiVariant};

/// Deployment configuration for a LiteLLM-proxied runner.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LiteLlmConfig {
    /// Base URL of the LiteLLM proxy, e.g. `http://localhost:4000`.
    pub endpoint: Url,
    /// Key presented to the LiteLLM proxy itself; upstream provider
    /// keys live in the proxy's own configuration.
    pub api_key: SecretRef,
    #[serde(default)]
    pub rate_limits: RateLimits,
    #[serde(default = "default_retry")]
    pub retry: RetryPolicy,
    #[serde(default)]
    pub circuit_breaker: CircuitBreakerConfig,
    #[serde(default)]
    pub timeouts: Timeouts,
}

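/// Where the key presented to the LiteLLM proxy is read from. With the
/// `from` tag and snake_case variant names, the serialized shapes are
/// (values illustrative):
///
/// ```json
/// { "from": "env",    "name": "LITELLM_API_KEY" }
/// { "from": "file",   "path": "/run/secrets/litellm" }
/// { "from": "inline", "value": "sk-..." }
/// ```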
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "from", rename_all = "snake_case")]
pub enum SecretRef {
    Env { name: String },
    File { path: std::path::PathBuf },
    Inline { value: String },
}

fn default_retry() -> RetryPolicy {
    // LiteLLM applies its own retry policy, so keep client-side retries
    // low to avoid compounding: with M client retries and N proxy
    // retries, a persistent failure can cost up to (M + 1) * (N + 1)
    // upstream attempts. One local retry still covers transient
    // connection drops to the proxy itself. Doc §10.3.
    RetryPolicy {
        max_retries: 1,
        ..RetryPolicy::default()
    }
}

impl LiteLlmConfig {
    /// Convert into an `OpenAiConfig` pointed at the LiteLLM proxy.
    ///
    /// The two crates define distinct `SecretRef` types, so the caller
    /// resolves `self.api_key` into the openai crate's representation
    /// and passes it in here.
    pub fn into_openai(
        self,
        openai_secret: inference_runtime_openai::config::SecretRef,
    ) -> OpenAiConfig {
        OpenAiConfig {
            variant: OpenAiVariant::Direct {
                endpoint: self.endpoint,
            },
            api_key: openai_secret,
            organization: None,
            project: None,
            rate_limits: self.rate_limits,
            retry: self.retry,
            circuit_breaker: self.circuit_breaker,
            timeouts: self.timeouts,
        }
    }
}

/// Newtype wrapper. Delegates to the inner `OpenAiRunner` for all
/// `ModelRunner` ops; only `runtime_kind` and `transport_kind` differ,
/// so observability can distinguish LiteLLM from direct OpenAI.
pub struct LiteLlmRunner {
    inner: OpenAiRunner,
}

impl LiteLlmRunner {
    pub fn new(
        config: OpenAiConfig,
        session: Arc<ArcSwap<SessionSnapshot>>,
    ) -> InferenceResult<Self> {
        Ok(Self {
            inner: OpenAiRunner::new(config, session)?,
        })
    }
}

#[async_trait]
impl ModelRunner for LiteLlmRunner {
    async fn execute(&mut self, batch: ExecuteBatch) -> InferenceResult<RunHandle> {
        self.inner.execute(batch).await
    }

    async fn rebuild_session(&mut self, cause: SessionRebuildCause) -> InferenceResult<()> {
        self.inner.rebuild_session(cause).await
    }

    fn runtime_kind(&self) -> RuntimeKind {
        RuntimeKind::LiteLlm
    }

    fn transport_kind(&self) -> TransportKind {
        TransportKind::RemoteNetwork {
            provider: ProviderKind::LiteLlm,
        }
    }

    fn rate_limits(&self) -> Option<&RateLimits> {
        self.inner.rate_limits()
    }

    fn estimate_cost_usd(&self, batch: &ExecuteBatch) -> f64 {
        // LiteLLM fronts many backends, so the inner runner's estimate
        // (based on OpenAI-style pricing) is only an approximation.
        // Operators set per-deployment pricing explicitly in
        // `inference-cli`; treat this value as a fallback, not truth.
        self.inner.estimate_cost_usd(batch)
    }
}
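
// A minimal test sketch (assumption: `serde_json` is available as a
// dev-dependency; these tests are illustrative, not part of the
// original file).
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn default_retry_keeps_client_retries_low() {
        // Guards the "don't compound retries with LiteLLM's" invariant.
        assert_eq!(default_retry().max_retries, 1);
    }

    #[test]
    fn secret_ref_deserializes_from_tagged_json() {
        // The `from` tag plus snake_case variant names give this shape.
        let s: SecretRef =
            serde_json::from_str(r#"{ "from": "env", "name": "LITELLM_API_KEY" }"#)
                .unwrap();
        match s {
            SecretRef::Env { name } => assert_eq!(name, "LITELLM_API_KEY"),
            other => panic!("unexpected variant: {other:?}"),
        }
    }
}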