token_counting.rs

//! Token counting, estimation, and budget management for the `OpenAI` API.
//!
//! This example demonstrates comprehensive token management including:
//! - Heuristic token counting for different models and content types
//! - Token estimation before API calls to predict costs
//! - Budget management and cost control mechanisms
//! - Token optimization strategies for efficient API usage
//! - Real-time monitoring and alerting for token consumption
//! - Spending history and usage analytics
//! - Cost-aware model recommendation across quality tiers
//!
//! Token management is crucial for:
//! - Controlling costs in production AI applications
//! - Preventing unexpected billing spikes
//! - Optimizing model selection based on cost/performance
//! - Planning capacity for high-volume applications
//! - Monitoring usage patterns and trends
//!
//! Note: the token counts here come from character-based heuristics; for
//! production-accurate counting, swap in a real BPE tokenizer such as the
//! `tiktoken-rs` crate.
//!
//! Run with: `cargo run --example token_counting`

#![allow(dead_code)]
#![allow(clippy::too_many_lines)]
#![allow(clippy::missing_const_for_fn)]
#![allow(clippy::cast_possible_truncation)]
#![allow(clippy::cast_precision_loss)]
#![allow(clippy::cast_lossless)]
#![allow(clippy::similar_names)]
#![allow(clippy::unreadable_literal)]
#![allow(clippy::cast_possible_wrap)]
#![allow(clippy::doc_markdown)]
#![allow(clippy::uninlined_format_args)]
#![allow(clippy::unused_self)]
#![allow(clippy::struct_excessive_bools)]
#![allow(clippy::fn_params_excessive_bools)]
#![allow(clippy::significant_drop_tightening)]
#![allow(clippy::branches_sharing_code)]
#![allow(clippy::cast_sign_loss)]
#![allow(clippy::needless_pass_by_value)]
#![allow(clippy::use_self)]
#![allow(clippy::struct_field_names)]
#![allow(clippy::module_name_repetitions)]
#![allow(clippy::suboptimal_flops)]
#![allow(clippy::option_if_let_else)]

use openai_ergonomic::{Client, Config, Error, Result};
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use tracing::{debug, error, info, warn};

/// Token counting and estimation utilities
#[derive(Debug, Clone)]
struct TokenCounter {
    /// Character-to-token ratios for different languages/content types
    encoding_ratios: HashMap<String, f64>,
    /// Model-specific token limits
    model_limits: HashMap<String, TokenLimits>,
    /// Model pricing information
    model_pricing: HashMap<String, ModelPricing>,
}

/// Token limits for different models
#[derive(Debug, Clone)]
struct TokenLimits {
    /// Maximum context length (input + output)
    max_context_length: i32,
    /// Maximum output tokens
    max_output_tokens: i32,
    /// Recommended safe input limit (leaving room for output)
    safe_input_limit: i32,
}

/// Pricing information for models
#[derive(Debug, Clone)]
struct ModelPricing {
    /// Cost per 1K input tokens
    input_cost_per_1k: f64,
    /// Cost per 1K output tokens
    output_cost_per_1k: f64,
    /// Base cost per request (if any)
    base_cost: f64,
}

impl TokenCounter {
    /// Create a new token counter with default configurations
    fn new() -> Self {
        let mut encoding_ratios = HashMap::new();
        encoding_ratios.insert("english".to_string(), 0.25); // ~4 chars per token
        encoding_ratios.insert("code".to_string(), 0.33); // ~3 chars per token
        encoding_ratios.insert("multilingual".to_string(), 0.2); // ~5 chars per token
        encoding_ratios.insert("json".to_string(), 0.5); // ~2 chars per token

        let mut model_limits = HashMap::new();
        model_limits.insert(
            "gpt-4".to_string(),
            TokenLimits {
                max_context_length: 8192,
                max_output_tokens: 4096,
                safe_input_limit: 6000,
            },
        );
        model_limits.insert(
            "gpt-4-32k".to_string(),
            TokenLimits {
                max_context_length: 32768,
                max_output_tokens: 4096,
                safe_input_limit: 28000,
            },
        );
        model_limits.insert(
            "gpt-3.5-turbo".to_string(),
            TokenLimits {
                max_context_length: 4096,
                max_output_tokens: 4096,
                safe_input_limit: 3000,
            },
        );
        model_limits.insert(
            "gpt-3.5-turbo-16k".to_string(),
            TokenLimits {
                max_context_length: 16384,
                max_output_tokens: 4096,
                safe_input_limit: 12000,
            },
        );

        let mut model_pricing = HashMap::new();
        model_pricing.insert(
            "gpt-4".to_string(),
            ModelPricing {
                input_cost_per_1k: 0.03,
                output_cost_per_1k: 0.06,
                base_cost: 0.0,
            },
        );
        model_pricing.insert(
            "gpt-4-32k".to_string(),
            ModelPricing {
                input_cost_per_1k: 0.06,
                output_cost_per_1k: 0.12,
                base_cost: 0.0,
            },
        );
        model_pricing.insert(
            "gpt-3.5-turbo".to_string(),
            ModelPricing {
                input_cost_per_1k: 0.0015,
                output_cost_per_1k: 0.002,
                base_cost: 0.0,
            },
        );
        model_pricing.insert(
            "gpt-3.5-turbo-16k".to_string(),
            ModelPricing {
                input_cost_per_1k: 0.003,
                output_cost_per_1k: 0.004,
                base_cost: 0.0,
            },
        );

        Self {
            encoding_ratios,
            model_limits,
            model_pricing,
        }
    }

    /// Estimate tokens for text content
    fn estimate_tokens(&self, text: &str, content_type: &str) -> i32 {
        let ratio = self.encoding_ratios.get(content_type).unwrap_or(&0.25);
        (text.len() as f64 * ratio).ceil() as i32
    }
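
    // Worked example for `estimate_tokens` above (a rough heuristic, not a
    // real BPE tokenizer): 400 characters of English prose at the 0.25
    // tokens-per-char ratio estimate to ceil(400 * 0.25) = 100 tokens.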

    /// Estimate tokens for a chat completion request
    fn estimate_chat_tokens(&self, messages: &[ChatMessage], model: &str) -> TokenEstimate {
        let mut total_tokens = 0;

        // Add tokens for each message
        for message in messages {
            // Message overhead (role, formatting, etc.)
            total_tokens += 4;

            // Content tokens (all roles currently use the English ratio)
            total_tokens += self.estimate_tokens(&message.content, "english");
        }

        // Add overhead for the completion request
        total_tokens += 2;

        // Get model limits for validation
        let limits = self
            .model_limits
            .get(model)
            .cloned()
            .unwrap_or(TokenLimits {
                max_context_length: 4096,
                max_output_tokens: 1000,
                safe_input_limit: 3000,
            });

        TokenEstimate {
            estimated_input_tokens: total_tokens,
            max_output_tokens: limits.max_output_tokens,
            total_estimated_tokens: total_tokens + limits.max_output_tokens,
            exceeds_context_limit: total_tokens > limits.max_context_length,
            exceeds_safe_limit: total_tokens > limits.safe_input_limit,
            model_limits: limits,
        }
    }

    /// Calculate cost estimate for a request
    fn estimate_cost(&self, estimate: &TokenEstimate, model: &str) -> CostEstimate {
        let pricing = self
            .model_pricing
            .get(model)
            .cloned()
            .unwrap_or(ModelPricing {
                input_cost_per_1k: 0.002,
                output_cost_per_1k: 0.002,
                base_cost: 0.0,
            });

        let input_cost =
            (estimate.estimated_input_tokens as f64 / 1000.0) * pricing.input_cost_per_1k;
        let max_output_cost =
            (estimate.max_output_tokens as f64 / 1000.0) * pricing.output_cost_per_1k;

        CostEstimate {
            estimated_input_cost: input_cost,
            max_output_cost,
            total_max_cost: input_cost + max_output_cost + pricing.base_cost,
            pricing_info: pricing,
        }
    }
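
    // Worked example for `estimate_cost`: 1,500 estimated input tokens on
    // gpt-4 cost (1500 / 1000) * $0.03 = $0.045, and the 4,096-token output
    // cap adds at most (4096 / 1000) * $0.06 = $0.24576, so total_max_cost
    // comes to $0.29076.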

    /// Optimize messages to fit within token limits
    fn optimize_messages(
        &self,
        messages: &[ChatMessage],
        model: &str,
        target_tokens: i32,
    ) -> Vec<ChatMessage> {
        let mut optimized = messages.to_vec();
        let mut current_estimate = self.estimate_chat_tokens(&optimized, model);
        let original_tokens = current_estimate.estimated_input_tokens;

        // If we're already under the limit, return as-is
        if current_estimate.estimated_input_tokens <= target_tokens {
            return optimized;
        }

        info!(
            "Optimizing messages: current {} tokens, target {} tokens",
            current_estimate.estimated_input_tokens, target_tokens
        );

        // Strategy 1: Remove the oldest user messages first (keep recent context)
        while current_estimate.estimated_input_tokens > target_tokens && optimized.len() > 1 {
            // Find the oldest user message to remove
            if let Some(pos) = optimized.iter().position(|msg| msg.role == "user") {
                if pos > 0 {
                    // Don't remove system messages
                    optimized.remove(pos);
                    current_estimate = self.estimate_chat_tokens(&optimized, model);
                    debug!(
                        "Removed message, now {} tokens",
                        current_estimate.estimated_input_tokens
                    );
                } else {
                    break;
                }
            } else {
                break;
            }
        }

        // Strategy 2: Truncate long messages
        if current_estimate.estimated_input_tokens > target_tokens {
            for message in &mut optimized {
                if message.role != "system" && message.content.len() > 500 {
                    let max_chars = (target_tokens as f64 * 4.0) as usize; // Rough conversion
                    if message.content.len() > max_chars {
                        // Truncate on a character boundary so multi-byte UTF-8
                        // content cannot cause a slicing panic
                        let truncated: String = message
                            .content
                            .chars()
                            .take(max_chars.saturating_sub(3))
                            .collect();
                        message.content = format!("{}...", truncated);
                        debug!(
                            "Truncated long message to {} characters",
                            message.content.len()
                        );
                    }
                }
            }
            current_estimate = self.estimate_chat_tokens(&optimized, model);
        }

        info!(
            "Optimization complete: {} tokens (saved {})",
            current_estimate.estimated_input_tokens,
            original_tokens - current_estimate.estimated_input_tokens
        );

        optimized
    }

    /// Get the most cost-effective model for a given request
    fn recommend_model(
        &self,
        messages: &[ChatMessage],
        quality_tier: QualityTier,
    ) -> ModelRecommendation {
        let candidates = match quality_tier {
            QualityTier::Budget => vec!["gpt-3.5-turbo", "gpt-3.5-turbo-16k"],
            QualityTier::Balanced => vec!["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4"],
            QualityTier::Premium => vec!["gpt-4", "gpt-4-32k"],
        };

        let mut best_option = None;
        let mut best_cost = f64::INFINITY;

        for model in candidates {
            let estimate = self.estimate_chat_tokens(messages, model);
            if !estimate.exceeds_context_limit {
                let cost_estimate = self.estimate_cost(&estimate, model);
                if cost_estimate.total_max_cost < best_cost {
                    best_cost = cost_estimate.total_max_cost;
                    best_option = Some(ModelRecommendation {
                        model: model.to_string(),
                        estimated_cost: cost_estimate.total_max_cost,
                        token_estimate: estimate,
                        cost_details: cost_estimate,
                        reason: format!("Most cost-effective for {} tier", quality_tier.as_str()),
                    });
                }
            }
        }

        best_option.unwrap_or_else(|| {
            // Fallback to the largest model that can handle the request
            let fallback_model = "gpt-4-32k";
            let estimate = self.estimate_chat_tokens(messages, fallback_model);
            let cost_estimate = self.estimate_cost(&estimate, fallback_model);

            ModelRecommendation {
                model: fallback_model.to_string(),
                estimated_cost: cost_estimate.total_max_cost,
                token_estimate: estimate,
                cost_details: cost_estimate,
                reason: "Fallback - requires large context window".to_string(),
            }
        })
    }
}
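
// Example of how `recommend_model` behaves with this pricing table: for a
// short prompt, the Balanced tier compares gpt-3.5-turbo, gpt-3.5-turbo-16k,
// and gpt-4; all three fit their context windows, so the cheapest candidate
// (gpt-3.5-turbo at $0.0015/$0.002 per 1K tokens) is recommended.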

/// Token usage estimate for a request
#[derive(Debug, Clone)]
struct TokenEstimate {
    estimated_input_tokens: i32,
    max_output_tokens: i32,
    total_estimated_tokens: i32,
    exceeds_context_limit: bool,
    exceeds_safe_limit: bool,
    model_limits: TokenLimits,
}

/// Cost estimate for a request
#[derive(Debug, Clone)]
struct CostEstimate {
    estimated_input_cost: f64,
    max_output_cost: f64,
    total_max_cost: f64,
    pricing_info: ModelPricing,
}

/// Quality tier for model selection
#[derive(Debug, Clone)]
enum QualityTier {
    Budget,
    Balanced,
    Premium,
}

impl QualityTier {
    fn as_str(&self) -> &str {
        match self {
            QualityTier::Budget => "budget",
            QualityTier::Balanced => "balanced",
            QualityTier::Premium => "premium",
        }
    }
}

/// Model recommendation with cost analysis
#[derive(Debug, Clone)]
struct ModelRecommendation {
    model: String,
    estimated_cost: f64,
    token_estimate: TokenEstimate,
    cost_details: CostEstimate,
    reason: String,
}

/// Budget manager for tracking and controlling costs
#[derive(Debug)]
struct BudgetManager {
    /// Daily budget limit in USD
    daily_budget: f64,
    /// Monthly budget limit in USD
    monthly_budget: f64,
    /// Current day spending
    daily_spending: Arc<Mutex<f64>>,
    /// Current month spending
    monthly_spending: Arc<Mutex<f64>>,
    /// Spending history
    spending_history: Arc<Mutex<Vec<SpendingRecord>>>,
    /// Alert thresholds
    alert_thresholds: AlertThresholds,
}

/// Alert thresholds for budget monitoring
#[derive(Debug, Clone)]
struct AlertThresholds {
    /// Percentage of daily budget for warning
    daily_warning_percent: f64,
    /// Percentage of daily budget for critical alert
    daily_critical_percent: f64,
    /// Percentage of monthly budget for warning
    monthly_warning_percent: f64,
    /// Percentage of monthly budget for critical alert
    monthly_critical_percent: f64,
}

impl Default for AlertThresholds {
    fn default() -> Self {
        Self {
            daily_warning_percent: 80.0,
            daily_critical_percent: 95.0,
            monthly_warning_percent: 80.0,
            monthly_critical_percent: 95.0,
        }
    }
}

/// Record of spending for analytics
#[derive(Debug, Clone)]
struct SpendingRecord {
    timestamp: u64,
    model: String,
    input_tokens: i32,
    output_tokens: i32,
    cost: f64,
    request_type: String,
    user_id: Option<String>,
}

impl BudgetManager {
    /// Create a new budget manager
    fn new(daily_budget: f64, monthly_budget: f64) -> Self {
        Self {
            daily_budget,
            monthly_budget,
            daily_spending: Arc::new(Mutex::new(0.0)),
            monthly_spending: Arc::new(Mutex::new(0.0)),
            spending_history: Arc::new(Mutex::new(Vec::new())),
            alert_thresholds: AlertThresholds::default(),
        }
    }

    /// Check if a request is within budget
    fn check_budget(&self, estimated_cost: f64) -> BudgetCheckResult {
        let daily_spent = *self.daily_spending.lock().unwrap();
        let monthly_spent = *self.monthly_spending.lock().unwrap();

        let daily_after = daily_spent + estimated_cost;
        let monthly_after = monthly_spent + estimated_cost;

        let daily_percent = (daily_after / self.daily_budget) * 100.0;
        let monthly_percent = (monthly_after / self.monthly_budget) * 100.0;

        // Check for budget violations
        if daily_after > self.daily_budget {
            return BudgetCheckResult {
                approved: false,
                reason: format!(
                    "Would exceed daily budget: ${:.4} > ${:.2}",
                    daily_after, self.daily_budget
                ),
                current_daily_usage: daily_percent,
                current_monthly_usage: monthly_percent,
                alerts: vec![BudgetAlert::DailyExceeded],
            };
        }

        if monthly_after > self.monthly_budget {
            return BudgetCheckResult {
                approved: false,
                reason: format!(
                    "Would exceed monthly budget: ${:.4} > ${:.2}",
                    monthly_after, self.monthly_budget
                ),
                current_daily_usage: daily_percent,
                current_monthly_usage: monthly_percent,
                alerts: vec![BudgetAlert::MonthlyExceeded],
            };
        }

        // Check for alerts
        let mut alerts = Vec::new();

        if daily_percent >= self.alert_thresholds.daily_critical_percent {
            alerts.push(BudgetAlert::DailyCritical);
        } else if daily_percent >= self.alert_thresholds.daily_warning_percent {
            alerts.push(BudgetAlert::DailyWarning);
        }

        if monthly_percent >= self.alert_thresholds.monthly_critical_percent {
            alerts.push(BudgetAlert::MonthlyCritical);
        } else if monthly_percent >= self.alert_thresholds.monthly_warning_percent {
            alerts.push(BudgetAlert::MonthlyWarning);
        }

        BudgetCheckResult {
            approved: true,
            reason: "Within budget limits".to_string(),
            current_daily_usage: daily_percent,
            current_monthly_usage: monthly_percent,
            alerts,
        }
    }
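
    // Example of the threshold logic above: with a $10.00 daily budget and
    // $9.60 already spent, a $0.50 request is rejected (it would total
    // $10.10), while a $0.30 request is approved at 99% daily usage and
    // trips the 95% DailyCritical alert.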

    /// Record actual spending
    fn record_spending(&self, record: SpendingRecord) {
        let mut daily_spending = self.daily_spending.lock().unwrap();
        let mut monthly_spending = self.monthly_spending.lock().unwrap();
        let mut history = self.spending_history.lock().unwrap();

        *daily_spending += record.cost;
        *monthly_spending += record.cost;
        history.push(record);

        // Keep only recent history (last 1000 records)
        if history.len() > 1000 {
            history.remove(0);
        }
    }

    /// Get budget status summary
    fn get_budget_status(&self) -> BudgetStatus {
        let daily_spent = *self.daily_spending.lock().unwrap();
        let monthly_spent = *self.monthly_spending.lock().unwrap();
        let history = self.spending_history.lock().unwrap();

        let total_requests = history.len() as u64;
        let total_tokens: i32 = history
            .iter()
            .map(|r| r.input_tokens + r.output_tokens)
            .sum();

        let avg_cost_per_request = if total_requests > 0 {
            monthly_spent / total_requests as f64
        } else {
            0.0
        };

        BudgetStatus {
            daily_budget: self.daily_budget,
            monthly_budget: self.monthly_budget,
            daily_spent,
            monthly_spent,
            daily_remaining: self.daily_budget - daily_spent,
            monthly_remaining: self.monthly_budget - monthly_spent,
            daily_usage_percent: (daily_spent / self.daily_budget) * 100.0,
            monthly_usage_percent: (monthly_spent / self.monthly_budget) * 100.0,
            total_requests,
            total_tokens,
            average_cost_per_request: avg_cost_per_request,
        }
    }

    /// Reset daily spending (should be called daily)
    fn reset_daily_spending(&self) {
        *self.daily_spending.lock().unwrap() = 0.0;
        info!("Daily spending reset");
    }

    /// Reset monthly spending (should be called monthly)
    fn reset_monthly_spending(&self) {
        *self.monthly_spending.lock().unwrap() = 0.0;
        info!("Monthly spending reset");
    }
}

/// Result of budget check
#[derive(Debug, Clone)]
struct BudgetCheckResult {
    approved: bool,
    reason: String,
    current_daily_usage: f64,
    current_monthly_usage: f64,
    alerts: Vec<BudgetAlert>,
}

/// Budget alert types
#[derive(Debug, Clone)]
enum BudgetAlert {
    DailyWarning,
    DailyCritical,
    DailyExceeded,
    MonthlyWarning,
    MonthlyCritical,
    MonthlyExceeded,
}

impl BudgetAlert {
    fn message(&self) -> &str {
        match self {
            BudgetAlert::DailyWarning => "Daily budget usage approaching limit",
            BudgetAlert::DailyCritical => "Daily budget usage critical",
            BudgetAlert::DailyExceeded => "Daily budget exceeded",
            BudgetAlert::MonthlyWarning => "Monthly budget usage approaching limit",
            BudgetAlert::MonthlyCritical => "Monthly budget usage critical",
            BudgetAlert::MonthlyExceeded => "Monthly budget exceeded",
        }
    }
}

/// Budget status summary
#[derive(Debug, Clone)]
struct BudgetStatus {
    daily_budget: f64,
    monthly_budget: f64,
    daily_spent: f64,
    monthly_spent: f64,
    daily_remaining: f64,
    monthly_remaining: f64,
    daily_usage_percent: f64,
    monthly_usage_percent: f64,
    total_requests: u64,
    total_tokens: i32,
    average_cost_per_request: f64,
}

impl BudgetStatus {
    fn print_status(&self) {
        info!("=== Budget Status ===");
        info!(
            "Daily: ${:.4} / ${:.2} ({:.1}% used, ${:.4} remaining)",
            self.daily_spent, self.daily_budget, self.daily_usage_percent, self.daily_remaining
        );
        info!(
            "Monthly: ${:.4} / ${:.2} ({:.1}% used, ${:.4} remaining)",
            self.monthly_spent,
            self.monthly_budget,
            self.monthly_usage_percent,
            self.monthly_remaining
        );
        info!("Total requests: {}", self.total_requests);
        info!("Total tokens: {}", self.total_tokens);
        info!(
            "Average cost per request: ${:.6}",
            self.average_cost_per_request
        );
    }
}

/// Chat message structure
#[derive(Debug, Clone)]
struct ChatMessage {
    role: String,
    content: String,
}

impl ChatMessage {
    fn user(content: &str) -> Self {
        Self {
            role: "user".to_string(),
            content: content.to_string(),
        }
    }

    fn system(content: &str) -> Self {
        Self {
            role: "system".to_string(),
            content: content.to_string(),
        }
    }

    fn assistant(content: &str) -> Self {
        Self {
            role: "assistant".to_string(),
            content: content.to_string(),
        }
    }
}

/// Token-aware client that integrates counting and budgeting
#[derive(Debug)]
struct TokenAwareClient {
    client: Client,
    token_counter: TokenCounter,
    budget_manager: Arc<BudgetManager>,
}

impl TokenAwareClient {
    /// Create a new token-aware client
    fn new(client: Client, daily_budget: f64, monthly_budget: f64) -> Self {
        Self {
            client,
            token_counter: TokenCounter::new(),
            budget_manager: Arc::new(BudgetManager::new(daily_budget, monthly_budget)),
        }
    }

    /// Send a chat completion with token and budget checking
    async fn chat_completion_with_budget(
        &self,
        messages: &[ChatMessage],
        model: &str,
        max_tokens: Option<i32>,
        user_id: Option<String>,
    ) -> Result<String> {
        // Estimate tokens and cost
        let token_estimate = self.token_counter.estimate_chat_tokens(messages, model);
        let cost_estimate = self.token_counter.estimate_cost(&token_estimate, model);

        info!(
            "Token estimate: {} input, {} max output, ${:.4} max cost",
            token_estimate.estimated_input_tokens,
            token_estimate.max_output_tokens,
            cost_estimate.total_max_cost
        );

        // Check token limits
        if token_estimate.exceeds_context_limit {
            return Err(Error::InvalidRequest(format!(
                "Request exceeds context limit: {} > {}",
                token_estimate.estimated_input_tokens,
                token_estimate.model_limits.max_context_length
            )));
        }

        if token_estimate.exceeds_safe_limit {
            warn!(
                "Request exceeds safe input limit: {} > {}",
                token_estimate.estimated_input_tokens, token_estimate.model_limits.safe_input_limit
            );
        }

        // Check budget
        let budget_check = self
            .budget_manager
            .check_budget(cost_estimate.total_max_cost);

        if !budget_check.approved {
            return Err(Error::InvalidRequest(format!(
                "Budget check failed: {}",
                budget_check.reason
            )));
        }

        // Handle alerts
        for alert in &budget_check.alerts {
            match alert {
                BudgetAlert::DailyWarning | BudgetAlert::MonthlyWarning => {
                    warn!("{}", alert.message());
                }
                BudgetAlert::DailyCritical | BudgetAlert::MonthlyCritical => {
                    error!("{}", alert.message());
                }
                _ => {}
            }
        }

        // Simulate API call
        let response = self.simulate_api_call(messages, model, max_tokens).await?;

        // Calculate actual usage (simplified)
        let actual_output_tokens = self.token_counter.estimate_tokens(&response, "english");
        let actual_cost = (token_estimate.estimated_input_tokens as f64 / 1000.0)
            * cost_estimate.pricing_info.input_cost_per_1k
            + (actual_output_tokens as f64 / 1000.0)
                * cost_estimate.pricing_info.output_cost_per_1k;

        // Record spending
        let spending_record = SpendingRecord {
            timestamp: SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .unwrap()
                .as_secs(),
            model: model.to_string(),
            input_tokens: token_estimate.estimated_input_tokens,
            output_tokens: actual_output_tokens,
            cost: actual_cost,
            request_type: "chat_completion".to_string(),
            user_id,
        };

        self.budget_manager.record_spending(spending_record);

        info!(
            "Request completed: {} tokens used, ${:.6} actual cost",
            token_estimate.estimated_input_tokens + actual_output_tokens,
            actual_cost
        );

        Ok(response)
    }

    /// Get model recommendation for messages
    fn recommend_model(
        &self,
        messages: &[ChatMessage],
        quality_tier: QualityTier,
    ) -> ModelRecommendation {
        self.token_counter.recommend_model(messages, quality_tier)
    }

    /// Optimize messages for token efficiency
    fn optimize_for_budget(
        &self,
        messages: &[ChatMessage],
        model: &str,
        target_cost: f64,
    ) -> Vec<ChatMessage> {
        // Calculate target tokens based on cost, falling back to conservative
        // default pricing instead of panicking on an unknown model
        let default_pricing = ModelPricing {
            input_cost_per_1k: 0.002,
            output_cost_per_1k: 0.002,
            base_cost: 0.0,
        };
        let pricing = self
            .token_counter
            .model_pricing
            .get(model)
            .unwrap_or(&default_pricing);
        let target_tokens = ((target_cost / pricing.input_cost_per_1k) * 1000.0) as i32;

        self.token_counter
            .optimize_messages(messages, model, target_tokens)
    }
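
    // Example of the cost-to-tokens conversion above: a $0.05 target on
    // gpt-3.5-turbo ($0.0015 per 1K input tokens) yields
    // (0.05 / 0.0015) * 1000 = 33,333 input tokens, far more than the
    // model's 4,096-token context, so the context limit binds before cost.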

    /// Get budget status
    fn get_budget_status(&self) -> BudgetStatus {
        self.budget_manager.get_budget_status()
    }

    /// Simulate API call for demonstration
    async fn simulate_api_call(
        &self,
        messages: &[ChatMessage],
        model: &str,
        _max_tokens: Option<i32>,
    ) -> Result<String> {
        // Simulate processing time based on model
        let delay = match model {
            "gpt-4" | "gpt-4-32k" => Duration::from_millis(800),
            _ => Duration::from_millis(400),
        };

        tokio::time::sleep(delay).await;

        // Generate a response based on the last user message
        if let Some(last_message) = messages.iter().rev().find(|m| m.role == "user") {
            Ok(format!(
                "Simulated {} response to: {}",
                model,
                last_message.content.chars().take(50).collect::<String>()
            ))
        } else {
            Ok("Simulated response with no user input".to_string())
        }
    }
}

#[tokio::main]
async fn main() -> Result<()> {
    // Initialize logging
    tracing_subscriber::fmt()
        .with_env_filter(
            tracing_subscriber::EnvFilter::try_from_default_env()
                .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")),
        )
        .init();

    info!("Starting token counting and budget management example");

    // Create client
    let config = Config::builder().api_key("test-api-key").build();
    let client = Client::builder(config)?.build();

    // Example 1: Basic token counting and estimation
    info!("=== Example 1: Token Counting and Estimation ===");

    let token_counter = TokenCounter::new();

    let test_messages = vec![
        ChatMessage::system("You are a helpful assistant that provides detailed explanations."),
        ChatMessage::user("Explain the concept of machine learning in simple terms."),
        ChatMessage::assistant("Machine learning is a way for computers to learn patterns from data without being explicitly programmed for every scenario."),
        ChatMessage::user("Can you give me a practical example?"),
    ];

    for model in ["gpt-3.5-turbo", "gpt-4", "gpt-4-32k"] {
        let estimate = token_counter.estimate_chat_tokens(&test_messages, model);
        let cost_estimate = token_counter.estimate_cost(&estimate, model);

        info!("Model: {}", model);
        info!(
            "  Estimated input tokens: {}",
            estimate.estimated_input_tokens
        );
        info!("  Max output tokens: {}", estimate.max_output_tokens);
        info!(
            "  Total estimated tokens: {}",
            estimate.total_estimated_tokens
        );
        info!(
            "  Exceeds context limit: {}",
            estimate.exceeds_context_limit
        );
        info!("  Exceeds safe limit: {}", estimate.exceeds_safe_limit);
        info!("  Estimated cost: ${:.6}", cost_estimate.total_max_cost);
        info!("");
    }

    // Example 2: Model recommendations based on cost and quality
    info!("=== Example 2: Model Recommendations ===");

    for quality_tier in [
        QualityTier::Budget,
        QualityTier::Balanced,
        QualityTier::Premium,
    ] {
        let recommendation = token_counter.recommend_model(&test_messages, quality_tier.clone());
        info!("Quality tier: {}", quality_tier.as_str());
        info!("  Recommended model: {}", recommendation.model);
        info!("  Estimated cost: ${:.6}", recommendation.estimated_cost);
        info!("  Reason: {}", recommendation.reason);
        info!("");
    }

    // Example 3: Message optimization for token limits
    info!("=== Example 3: Message Optimization ===");

    let long_messages = vec![
        ChatMessage::system("You are an expert assistant with deep knowledge across many domains."),
        ChatMessage::user("Tell me everything you know about artificial intelligence, machine learning, deep learning, neural networks, natural language processing, computer vision, and how they all relate to each other. I want a comprehensive overview."),
        ChatMessage::assistant("Artificial intelligence is a broad field..."),
        ChatMessage::user("Now explain quantum computing and how it might affect AI in the future."),
        ChatMessage::user("What about the ethical implications of AI?"),
        ChatMessage::user("How do transformers work in detail?"),
    ];

    let original_estimate = token_counter.estimate_chat_tokens(&long_messages, "gpt-3.5-turbo");
    info!(
        "Original message tokens: {}",
        original_estimate.estimated_input_tokens
    );

    let optimized_messages = token_counter.optimize_messages(&long_messages, "gpt-3.5-turbo", 2000);
    let optimized_estimate =
        token_counter.estimate_chat_tokens(&optimized_messages, "gpt-3.5-turbo");
    info!(
        "Optimized message tokens: {}",
        optimized_estimate.estimated_input_tokens
    );
    info!(
        "Optimization saved: {} tokens",
        original_estimate.estimated_input_tokens - optimized_estimate.estimated_input_tokens
    );

    // Example 4: Budget management
    info!("\n=== Example 4: Budget Management ===");

    let token_aware_client = TokenAwareClient::new(client, 10.0, 100.0); // $10 daily, $100 monthly

    // Test budget status
    let initial_status = token_aware_client.get_budget_status();
    initial_status.print_status();

    // Make several requests to test budget tracking
    let test_requests = vec![
        ("What is the weather like?", "gpt-3.5-turbo"),
        ("Explain quantum physics", "gpt-4"),
        ("Write a short story", "gpt-3.5-turbo"),
        ("Solve this math problem: 2x + 5 = 15", "gpt-3.5-turbo"),
    ];

    for (prompt, model) in test_requests {
        let messages = vec![ChatMessage::user(prompt)];

        match token_aware_client
            .chat_completion_with_budget(&messages, model, Some(150), Some("test_user".to_string()))
            .await
        {
            Ok(response) => {
                info!("Request successful: {}", response);
            }
            Err(e) => {
                error!("Request failed: {}", e);
            }
        }

        // Small delay between requests
        tokio::time::sleep(Duration::from_millis(500)).await;
    }

    // Check final budget status
    let final_status = token_aware_client.get_budget_status();
    info!("\nFinal budget status:");
    final_status.print_status();

    // Example 5: Cost optimization strategies
    info!("\n=== Example 5: Cost Optimization ===");

    let expensive_prompt = vec![
        ChatMessage::system("You are a comprehensive research assistant."),
        ChatMessage::user("I need a detailed analysis of the global economic impact of artificial intelligence across all major industries, including specific case studies, statistical data, future projections, and policy recommendations. Please provide a thorough report with citations and references."),
    ];

    // Get model recommendation for this expensive request
    let budget_recommendation =
        token_aware_client.recommend_model(&expensive_prompt, QualityTier::Budget);
    let balanced_recommendation =
        token_aware_client.recommend_model(&expensive_prompt, QualityTier::Balanced);

    info!("Expensive request analysis:");
    info!(
        "  Budget option: {} (${:.6})",
        budget_recommendation.model, budget_recommendation.estimated_cost
    );
    info!(
        "  Balanced option: {} (${:.6})",
        balanced_recommendation.model, balanced_recommendation.estimated_cost
    );

    // Optimize for a specific budget
    let optimized_for_budget =
        token_aware_client.optimize_for_budget(&expensive_prompt, "gpt-3.5-turbo", 0.05);
    let optimized_estimate =
        token_counter.estimate_chat_tokens(&optimized_for_budget, "gpt-3.5-turbo");
    let optimized_cost = token_counter.estimate_cost(&optimized_estimate, "gpt-3.5-turbo");

    info!("Optimized for $0.05 budget:");
    info!("  Tokens: {}", optimized_estimate.estimated_input_tokens);
    info!("  Estimated cost: ${:.6}", optimized_cost.total_max_cost);

    // Example 6: Real-time monitoring and alerts
    info!("\n=== Example 6: Budget Monitoring ===");

    // Simulate approaching budget limits
    let high_usage_client = TokenAwareClient::new(
        Client::builder(Config::builder().api_key("test-api-key").build())?.build(),
        1.0, // Low daily budget for demonstration
        10.0,
    );

    // Make expensive requests to trigger alerts
    let expensive_messages = vec![ChatMessage::user(
        "Generate a very long detailed response about the history of computing.",
    )];

    for i in 1..=5 {
        info!("Making expensive request {}/5", i);

        match high_usage_client
            .chat_completion_with_budget(
                &expensive_messages,
                "gpt-4", // Expensive model
                Some(500),
                Some(format!("user_{}", i)),
            )
            .await
        {
            Ok(response) => {
                info!(
                    "Request {} completed: {}",
                    i,
                    response.chars().take(100).collect::<String>()
                );
            }
            Err(e) => {
                warn!("Request {} blocked: {}", i, e);
                break;
            }
        }

        // Show budget status after each request
        let status = high_usage_client.get_budget_status();
        info!(
            "Budget after request {}: {:.1}% daily, {:.1}% monthly",
            i, status.daily_usage_percent, status.monthly_usage_percent
        );
    }

    // Example 7: Analytics and reporting
    info!("\n=== Example 7: Usage Analytics ===");

    let final_analytics = high_usage_client.get_budget_status();
    info!("=== Usage Analytics Summary ===");
    info!(
        "Total API requests made: {}",
        final_analytics.total_requests
    );
    info!("Total tokens processed: {}", final_analytics.total_tokens);
    info!(
        "Average tokens per request: {:.1}",
        final_analytics.total_tokens as f64 / final_analytics.total_requests.max(1) as f64
    );
    info!(
        "Average cost per request: ${:.6}",
        final_analytics.average_cost_per_request
    );
    // Monthly spending already includes today's requests, so it is the total
    info!("Total spending: ${:.4}", final_analytics.monthly_spent);

    // Calculate efficiency metrics (the small epsilon avoids division by zero)
    let tokens_per_dollar =
        final_analytics.total_tokens as f64 / (final_analytics.daily_spent + 0.001);
    info!("Tokens per dollar: {:.0}", tokens_per_dollar);

    info!("Token counting and budget management example completed successfully!");
    Ok(())
}
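
// A minimal sanity-check suite; this is an addition to the example, and the
// exact numbers assert the character-based heuristics above, not real
// tokenizer output. Run with `cargo test --example token_counting`.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn english_ratio_estimates_one_token_per_four_chars() {
        let counter = TokenCounter::new();
        // 40 chars * 0.25 tokens per char = 10 tokens
        assert_eq!(counter.estimate_tokens(&"a".repeat(40), "english"), 10);
    }

    #[test]
    fn chat_estimate_adds_per_message_and_request_overhead() {
        let counter = TokenCounter::new();
        let messages = vec![ChatMessage::user(&"a".repeat(40))];
        // 10 content tokens + 4 per-message overhead + 2 request overhead
        let estimate = counter.estimate_chat_tokens(&messages, "gpt-4");
        assert_eq!(estimate.estimated_input_tokens, 16);
    }

    #[test]
    fn budget_manager_rejects_requests_over_the_daily_limit() {
        let budget = BudgetManager::new(1.0, 10.0);
        // check_budget does not record spending, so each check is independent
        assert!(budget.check_budget(0.50).approved);
        assert!(!budget.check_budget(1.50).approved);
    }
}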