sentinel_proxy/inference/
mod.rs

//! Inference routing module for LLM/AI traffic patterns
//!
//! This module provides:
//! - Token-based rate limiting (tokens/minute instead of requests/second;
//!   see *Rate Limiting Model* below)
//! - Token budget tracking (cumulative usage per period)
//! - Cost attribution (per-model pricing)
//! - Multi-provider token counting (OpenAI, Anthropic, generic)
//! - Model-aware load balancing (LeastTokensQueued strategy)
//!
//! # Example Usage
//!
//! ```kdl
//! route "/v1/chat/completions" {
//!     inference {
//!         provider "openai"
//!         rate-limit {
//!             tokens-per-minute 100000
//!             burst-tokens 10000
//!         }
//!         budget {
//!             period "daily"
//!             limit 1000000
//!             enforce true
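//!             // enforce: reject (rather than only record) usage past the daily limit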
//!         }
//!         cost-attribution {
//!             pricing {
//!                 model "gpt-4*" {
//!                     input-cost-per-million 30.0
//!                     output-cost-per-million 60.0
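//!                     // at these rates, 1,000 input + 500 output tokens
//!                     // cost $0.03 + $0.03 = $0.06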
//!                 }
//!             }
//!         }
//!         routing {
//!             strategy "least-tokens-queued"
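//!             // send each request to the upstream with the fewest tokens queued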
//!         }
//!     }
//!     upstream "llm-pool" { ... }
//! }
//! ```
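//!
//! # Rate Limiting Model
//!
//! Conceptually, `tokens-per-minute` and `burst-tokens` describe a token
//! bucket with capacity `burst-tokens`, refilled at `tokens-per-minute / 60`
//! tokens per second. The sketch below illustrates that arithmetic only; the
//! actual implementation is [`TokenRateLimiter`] and may differ in detail:
//!
//! ```rust
//! use std::time::Instant;
//!
//! // Illustrative stand-in, not the real `TokenRateLimiter`.
//! struct TokenBucket {
//!     capacity: f64,       // burst-tokens
//!     refill_per_sec: f64, // tokens-per-minute / 60
//!     available: f64,
//!     last_refill: Instant,
//! }
//!
//! impl TokenBucket {
//!     fn new(tokens_per_minute: f64, burst_tokens: f64) -> Self {
//!         Self {
//!             capacity: burst_tokens,
//!             refill_per_sec: tokens_per_minute / 60.0,
//!             available: burst_tokens,
//!             last_refill: Instant::now(),
//!         }
//!     }
//!
//!     /// Admit a request estimated to cost `tokens` tokens, or reject it.
//!     fn try_consume(&mut self, tokens: f64) -> bool {
//!         let now = Instant::now();
//!         let elapsed = now.duration_since(self.last_refill).as_secs_f64();
//!         self.available =
//!             (self.available + elapsed * self.refill_per_sec).min(self.capacity);
//!         self.last_refill = now;
//!         if self.available >= tokens {
//!             self.available -= tokens;
//!             true
//!         } else {
//!             false
//!         }
//!     }
//! }
//!
//! let mut bucket = TokenBucket::new(100_000.0, 10_000.0);
//! assert!(bucket.try_consume(500.0));
//! ```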

mod budget;
mod cost;
mod guardrails;
mod manager;
mod metrics;
mod providers;
mod rate_limit;
mod streaming;
mod tiktoken;
mod tokens;

pub use budget::TokenBudgetTracker;
pub use cost::CostCalculator;
pub use guardrails::{
    extract_inference_content, GuardrailProcessor, PiiCheckResult, PromptInjectionResult,
};
pub use manager::{InferenceCheckResult, InferenceRateLimitManager, InferenceRouteStats};
pub use metrics::InferenceMetrics;
pub use providers::{create_provider, InferenceProviderAdapter};
pub use rate_limit::{TokenRateLimitResult, TokenRateLimiter};
pub use streaming::{
    is_sse_response, StreamingTokenCounter, StreamingTokenResult, TokenCountSource,
};
pub use tiktoken::{tiktoken_manager, TiktokenEncoding, TiktokenManager};
pub use tokens::{TokenCounter, TokenEstimate, TokenSource};

use sentinel_config::{InferenceConfig, InferenceProvider};

/// Create a provider adapter based on the configured provider type
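///
/// A minimal usage sketch (the `wire_route` function here is illustrative;
/// the `InferenceConfig` is assumed to come from the route's KDL
/// `inference` block):
///
/// ```rust,ignore
/// use sentinel_config::InferenceConfig;
///
/// fn wire_route(config: &InferenceConfig) {
///     // Dispatch on `config.provider` to get the matching adapter
///     // (OpenAI, Anthropic, or a generic fallback).
///     let adapter = create_inference_provider(config);
///     // Hand `adapter` to token counting / cost attribution as needed.
/// }
/// ```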
pub fn create_inference_provider(config: &InferenceConfig) -> Box<dyn InferenceProviderAdapter> {
    create_provider(&config.provider)
}