sentinel_proxy/inference/mod.rs
//! Inference routing module for LLM/AI traffic patterns
//!
//! This module provides:
//! - Token-based rate limiting (tokens/minute instead of requests/second)
//! - Token budget tracking (cumulative usage per period)
//! - Cost attribution (per-model pricing)
//! - Multi-provider token counting (OpenAI, Anthropic, generic)
//! - Model-aware load balancing (LeastTokensQueued strategy)
//!
//! # Example Usage
//!
//! ```kdl
//! route "/v1/chat/completions" {
//!     inference {
//!         provider "openai"
//!         rate-limit {
//!             tokens-per-minute 100000
//!             burst-tokens 10000
//!         }
//!         budget {
//!             period "daily"
//!             limit 1000000
//!             enforce true
//!         }
//!         cost-attribution {
//!             pricing {
//!                 model "gpt-4*" {
//!                     input-cost-per-million 30.0
//!                     output-cost-per-million 60.0
//!                 }
//!             }
//!         }
//!         routing {
//!             strategy "least-tokens-queued"
//!         }
//!     }
//!     upstream "llm-pool" { ... }
//! }
//! ```
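//!
//! # Rate limiting sketch
//!
//! The sketch below illustrates the token-bucket idea behind
//! `tokens-per-minute` and `burst-tokens`: the bucket refills at the
//! per-minute rate, is capped at the burst size, and a request is admitted
//! only if its estimated token count fits the current balance. This is a
//! self-contained illustration of the technique, not the module's actual
//! [`TokenRateLimiter`]; see `rate_limit` for the real implementation.
//!
//! ```
//! struct TokenBucket {
//!     tokens: f64,         // current balance, in LLM tokens
//!     burst: f64,          // maximum balance ("burst-tokens")
//!     refill_per_sec: f64, // "tokens-per-minute" / 60
//! }
//!
//! impl TokenBucket {
//!     fn refill(&mut self, elapsed_secs: f64) {
//!         self.tokens = (self.tokens + elapsed_secs * self.refill_per_sec).min(self.burst);
//!     }
//!     fn try_consume(&mut self, estimated_tokens: f64) -> bool {
//!         if self.tokens >= estimated_tokens {
//!             self.tokens -= estimated_tokens;
//!             true
//!         } else {
//!             false
//!         }
//!     }
//! }
//!
//! // 100k tokens/minute with a 10k burst, matching the KDL example above.
//! let mut bucket = TokenBucket { tokens: 10_000.0, burst: 10_000.0, refill_per_sec: 100_000.0 / 60.0 };
//! assert!(bucket.try_consume(8_000.0));  // fits within the burst balance
//! assert!(!bucket.try_consume(8_000.0)); // only ~2k tokens left; rejected
//! bucket.refill(6.0);                    // ~6 s refills ~10k tokens (capped at burst)
//! assert!(bucket.try_consume(8_000.0));
//! ```
//!
//! Cost attribution then follows directly from token counts: with the
//! `gpt-4*` pricing in the example above, a call using 1,200 input and 800
//! output tokens costs `1_200.0 / 1e6 * 30.0 + 800.0 / 1e6 * 60.0` ≈ $0.084.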

mod budget;
mod cost;
mod guardrails;
mod manager;
mod metrics;
mod providers;
mod rate_limit;
mod streaming;
mod tiktoken;
mod tokens;

pub use budget::TokenBudgetTracker;
pub use cost::CostCalculator;
pub use guardrails::{
    extract_inference_content, GuardrailProcessor, PiiCheckResult, PromptInjectionResult,
};
pub use manager::{InferenceCheckResult, InferenceRateLimitManager, InferenceRouteStats};
pub use metrics::InferenceMetrics;
pub use providers::{create_provider, InferenceProviderAdapter};
pub use rate_limit::{TokenRateLimitResult, TokenRateLimiter};
pub use streaming::{is_sse_response, StreamingTokenCounter, StreamingTokenResult, TokenCountSource};
pub use tiktoken::{tiktoken_manager, TiktokenEncoding, TiktokenManager};
pub use tokens::{TokenCounter, TokenEstimate, TokenSource};

use sentinel_config::{InferenceConfig, InferenceProvider};

/// Create a provider adapter based on the configured provider type.
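///
/// # Example
///
/// Illustrative only: the shape of `InferenceConfig` is defined in
/// `sentinel_config`, and `load_route_config` below is a hypothetical
/// helper, so this example is not compiled.
///
/// ```ignore
/// let config: InferenceConfig = load_route_config("/v1/chat/completions");
/// let provider = create_inference_provider(&config);
/// // `provider` adapts token counting (OpenAI, Anthropic, or generic) for this route.
/// ```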
pub fn create_inference_provider(config: &InferenceConfig) -> Box<dyn InferenceProviderAdapter> {
    create_provider(&config.provider)
}