// Source: trustformers_tokenizers/training/analysis.rs
//! Advanced analysis tools for tokenizer training and evaluation.
//!
//! This module provides comprehensive analysis capabilities including:
//! - Coverage analysis for evaluating tokenizer effectiveness
//! - Language detection based on character and n-gram frequencies
//! - Token distribution analysis for vocabulary optimization
//! - Statistical analysis of tokenization patterns
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
11
/// Detailed coverage analysis results for tokenizer evaluation.
///
/// This type only stores measurements (populated by an evaluation pass
/// elsewhere) and derives summary statistics from them; see the inherent
/// methods (`summary`, `average_token_length`, `efficiency_score`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CoverageAnalysis {
    /// Character-level coverage rate (0.0 to 1.0)
    pub char_coverage_rate: f64,
    /// Word-level coverage rate (0.0 to 1.0)
    pub word_coverage_rate: f64,
    /// Compression ratio (tokens per character); lower means a denser encoding
    pub compression_ratio: f64,
    /// Total characters in the test corpus
    pub total_chars: usize,
    /// Total words in the test corpus
    pub total_words: usize,
    /// Total tokens produced by tokenization
    pub total_tokens: usize,
    /// Number of characters covered by the vocabulary
    pub covered_chars: usize,
    /// Number of words covered by the vocabulary
    pub covered_words: usize,
    /// Distribution of token lengths: token length (in chars) -> occurrence count
    pub length_distribution: HashMap<usize, u32>,
    /// List of out-of-vocabulary tokens encountered
    pub oov_tokens: Vec<String>,
    /// Size of the tokenizer vocabulary
    pub vocab_size: usize,
}
38
39impl CoverageAnalysis {
40    /// Generate a comprehensive summary report of the coverage analysis.
41    pub fn summary(&self) -> String {
42        format!(
43            "Coverage Analysis Summary:\n\
44             - Character Coverage: {:.2}% ({}/{})\n\
45             - Word Coverage: {:.2}% ({}/{})\n\
46             - Compression Ratio: {:.3}\n\
47             - Vocabulary Size: {}\n\
48             - OOV Tokens: {}\n\
49             - Average Token Length: {:.2}",
50            self.char_coverage_rate * 100.0,
51            self.covered_chars,
52            self.total_chars,
53            self.word_coverage_rate * 100.0,
54            self.covered_words,
55            self.total_words,
56            self.compression_ratio,
57            self.vocab_size,
58            self.oov_tokens.len(),
59            self.average_token_length()
60        )
61    }
62
63    /// Calculate average token length based on the length distribution.
64    pub fn average_token_length(&self) -> f64 {
65        let total_tokens: u32 = self.length_distribution.values().sum();
66        if total_tokens == 0 {
67            return 0.0;
68        }
69
70        let weighted_sum: u32 = self
71            .length_distribution
72            .iter()
73            .map(|(&length, &count)| length as u32 * count)
74            .sum();
75
76        weighted_sum as f64 / total_tokens as f64
77    }
78
79    /// Get the most common token lengths with their frequencies.
80    pub fn top_token_lengths(&self, n: usize) -> Vec<(usize, u32)> {
81        let mut lengths: Vec<_> = self.length_distribution.iter().collect();
82        lengths.sort_by(|a, b| b.1.cmp(a.1));
83        lengths.into_iter().take(n).map(|(&len, &count)| (len, count)).collect()
84    }
85
86    /// Calculate efficiency score combining coverage and compression.
87    pub fn efficiency_score(&self) -> f64 {
88        // Weighted combination of character coverage and inverse compression ratio
89        let coverage_score = 0.6 * self.char_coverage_rate + 0.4 * self.word_coverage_rate;
90        let compression_score = 1.0 / (1.0 + self.compression_ratio);
91        0.7 * coverage_score + 0.3 * compression_score
92    }
93}
94
/// Language detection based on character frequency analysis.
///
/// Holds per-language frequency profiles (expressed as percentages) that
/// incoming text is scored against; see `detect_language`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageDetector {
    /// Character frequency profiles (percent), keyed by language code
    language_profiles: HashMap<String, HashMap<char, f64>>,
    /// N-gram (bigram) frequency profiles for enhanced detection accuracy
    ngram_profiles: HashMap<String, HashMap<String, f64>>,
    /// Language codes with a registered profile, in registration order
    supported_languages: Vec<String>,
}
105
106impl LanguageDetector {
107    /// Create a new language detector with built-in language profiles.
108    pub fn new() -> Self {
109        let mut detector = Self {
110            language_profiles: HashMap::new(),
111            ngram_profiles: HashMap::new(),
112            supported_languages: Vec::new(),
113        };
114
115        detector.initialize_built_in_profiles();
116        detector
117    }
118
119    /// Initialize built-in language profiles for common languages.
120    fn initialize_built_in_profiles(&mut self) {
121        // English character frequencies (approximated from corpora)
122        let mut english_chars = HashMap::new();
123        english_chars.insert('e', 12.7);
124        english_chars.insert('t', 9.1);
125        english_chars.insert('a', 8.2);
126        english_chars.insert('o', 7.5);
127        english_chars.insert('i', 7.0);
128        english_chars.insert('n', 6.7);
129        english_chars.insert('s', 6.3);
130        english_chars.insert('h', 6.1);
131        english_chars.insert('r', 6.0);
132
133        let mut english_ngrams = HashMap::new();
134        english_ngrams.insert("th".to_string(), 2.7);
135        english_ngrams.insert("he".to_string(), 2.3);
136        english_ngrams.insert("in".to_string(), 2.0);
137        english_ngrams.insert("er".to_string(), 1.8);
138        english_ngrams.insert("an".to_string(), 1.6);
139
140        self.language_profiles.insert("en".to_string(), english_chars);
141        self.ngram_profiles.insert("en".to_string(), english_ngrams);
142
143        // Spanish character frequencies
144        let mut spanish_chars = HashMap::new();
145        spanish_chars.insert('e', 13.7);
146        spanish_chars.insert('a', 11.5);
147        spanish_chars.insert('o', 8.7);
148        spanish_chars.insert('s', 8.0);
149        spanish_chars.insert('r', 6.9);
150        spanish_chars.insert('n', 6.7);
151        spanish_chars.insert('i', 6.2);
152        spanish_chars.insert('d', 5.9);
153        spanish_chars.insert('l', 5.0);
154
155        let mut spanish_ngrams = HashMap::new();
156        spanish_ngrams.insert("de".to_string(), 2.8);
157        spanish_ngrams.insert("la".to_string(), 2.5);
158        spanish_ngrams.insert("es".to_string(), 2.1);
159        spanish_ngrams.insert("en".to_string(), 1.9);
160        spanish_ngrams.insert("el".to_string(), 1.7);
161
162        self.language_profiles.insert("es".to_string(), spanish_chars);
163        self.ngram_profiles.insert("es".to_string(), spanish_ngrams);
164
165        // French character frequencies
166        let mut french_chars = HashMap::new();
167        french_chars.insert('e', 14.7);
168        french_chars.insert('s', 7.9);
169        french_chars.insert('a', 7.6);
170        french_chars.insert('i', 7.5);
171        french_chars.insert('t', 7.2);
172        french_chars.insert('n', 7.1);
173        french_chars.insert('r', 6.6);
174        french_chars.insert('u', 6.3);
175        french_chars.insert('l', 5.5);
176
177        let mut french_ngrams = HashMap::new();
178        french_ngrams.insert("de".to_string(), 3.0);
179        french_ngrams.insert("le".to_string(), 2.4);
180        french_ngrams.insert("es".to_string(), 2.1);
181        french_ngrams.insert("re".to_string(), 1.8);
182        french_ngrams.insert("nt".to_string(), 1.6);
183
184        self.language_profiles.insert("fr".to_string(), french_chars);
185        self.ngram_profiles.insert("fr".to_string(), french_ngrams);
186
187        // German character frequencies
188        let mut german_chars = HashMap::new();
189        german_chars.insert('e', 17.4);
190        german_chars.insert('n', 9.8);
191        german_chars.insert('i', 7.6);
192        german_chars.insert('s', 7.3);
193        german_chars.insert('r', 7.0);
194        german_chars.insert('a', 6.5);
195        german_chars.insert('t', 6.2);
196        german_chars.insert('d', 5.1);
197        german_chars.insert('h', 4.8);
198
199        let mut german_ngrams = HashMap::new();
200        german_ngrams.insert("er".to_string(), 3.9);
201        german_ngrams.insert("en".to_string(), 3.6);
202        german_ngrams.insert("ch".to_string(), 2.4);
203        german_ngrams.insert("de".to_string(), 2.1);
204        german_ngrams.insert("ei".to_string(), 1.8);
205
206        self.language_profiles.insert("de".to_string(), german_chars);
207        self.ngram_profiles.insert("de".to_string(), german_ngrams);
208
209        self.supported_languages = vec![
210            "en".to_string(),
211            "es".to_string(),
212            "fr".to_string(),
213            "de".to_string(),
214        ];
215    }
216
217    /// Detect the language of a given text.
218    ///
219    /// Returns a `LanguageDetectionResult` with the detected language,
220    /// confidence score, and scores for all supported languages.
221    pub fn detect_language(&self, text: &str) -> LanguageDetectionResult {
222        if text.trim().is_empty() {
223            return LanguageDetectionResult {
224                detected_language: "unknown".to_string(),
225                confidence: 0.0,
226                scores: HashMap::new(),
227            };
228        }
229
230        let text_lower = text.to_lowercase();
231        let char_freq = self.calculate_char_frequency(&text_lower);
232        let ngram_freq = self.calculate_ngram_frequency(&text_lower, 2);
233
234        let mut scores = HashMap::new();
235
236        for lang in &self.supported_languages {
237            let char_score = self.calculate_char_similarity(&char_freq, lang);
238            let ngram_score = self.calculate_ngram_similarity(&ngram_freq, lang);
239
240            // Weighted combination of character and n-gram scores
241            let combined_score = 0.6 * char_score + 0.4 * ngram_score;
242            scores.insert(lang.clone(), combined_score);
243        }
244
245        let (detected_language, confidence) = scores
246            .iter()
247            .max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
248            .map(|(lang, score)| (lang.clone(), *score))
249            .unwrap_or(("unknown".to_string(), 0.0));
250
251        LanguageDetectionResult {
252            detected_language,
253            confidence,
254            scores,
255        }
256    }
257
258    /// Calculate character frequency distribution in text.
259    fn calculate_char_frequency(&self, text: &str) -> HashMap<char, f64> {
260        let mut freq = HashMap::new();
261        let total_chars = text.chars().filter(|c| c.is_alphabetic()).count() as f64;
262
263        if total_chars == 0.0 {
264            return freq;
265        }
266
267        for ch in text.chars() {
268            if ch.is_alphabetic() {
269                *freq.entry(ch).or_insert(0.0) += 1.0;
270            }
271        }
272
273        // Convert to percentages
274        for value in freq.values_mut() {
275            *value = (*value / total_chars) * 100.0;
276        }
277
278        freq
279    }
280
281    /// Calculate n-gram frequency distribution in text.
282    fn calculate_ngram_frequency(&self, text: &str, n: usize) -> HashMap<String, f64> {
283        let mut freq = HashMap::new();
284        let chars: Vec<char> = text.chars().filter(|c| c.is_alphabetic()).collect();
285        let total_ngrams = chars.len().saturating_sub(n - 1) as f64;
286
287        if total_ngrams == 0.0 {
288            return freq;
289        }
290
291        for window in chars.windows(n) {
292            let ngram: String = window.iter().collect();
293            *freq.entry(ngram).or_insert(0.0) += 1.0;
294        }
295
296        // Convert to percentages
297        for value in freq.values_mut() {
298            *value = (*value / total_ngrams) * 100.0;
299        }
300
301        freq
302    }
303
304    /// Calculate similarity between text character frequency and language profile.
305    fn calculate_char_similarity(&self, text_freq: &HashMap<char, f64>, language: &str) -> f64 {
306        let profile = match self.language_profiles.get(language) {
307            Some(p) => p,
308            None => return 0.0,
309        };
310
311        let mut similarity = 0.0;
312        let mut total_chars = 0;
313
314        for (ch, expected_freq) in profile {
315            let actual_freq = text_freq.get(ch).unwrap_or(&0.0);
316            similarity += 1.0 / (1.0 + (expected_freq - actual_freq).abs());
317            total_chars += 1;
318        }
319
320        if total_chars > 0 {
321            similarity / total_chars as f64
322        } else {
323            0.0
324        }
325    }
326
327    /// Calculate similarity between text n-gram frequency and language profile.
328    fn calculate_ngram_similarity(&self, text_freq: &HashMap<String, f64>, language: &str) -> f64 {
329        let profile = match self.ngram_profiles.get(language) {
330            Some(p) => p,
331            None => return 0.0,
332        };
333
334        let mut similarity = 0.0;
335        let mut total_ngrams = 0;
336
337        for (ngram, expected_freq) in profile {
338            let actual_freq = text_freq.get(ngram).unwrap_or(&0.0);
339            similarity += 1.0 / (1.0 + (expected_freq - actual_freq).abs());
340            total_ngrams += 1;
341        }
342
343        if total_ngrams > 0 {
344            similarity / total_ngrams as f64
345        } else {
346            0.0
347        }
348    }
349
350    /// Get list of supported languages.
351    pub fn supported_languages(&self) -> &[String] {
352        &self.supported_languages
353    }
354
355    /// Add a custom language profile.
356    pub fn add_language_profile(
357        &mut self,
358        language: String,
359        char_profile: HashMap<char, f64>,
360        ngram_profile: HashMap<String, f64>,
361    ) {
362        self.language_profiles.insert(language.clone(), char_profile);
363        self.ngram_profiles.insert(language.clone(), ngram_profile);
364        if !self.supported_languages.contains(&language) {
365            self.supported_languages.push(language);
366        }
367    }
368}
369
370impl Default for LanguageDetector {
371    fn default() -> Self {
372        Self::new()
373    }
374}
375
/// Language detection result containing detected language and confidence scores.
///
/// `confidence` is the score of the winning language; `scores` holds the
/// full per-language breakdown for inspection or ranking.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageDetectionResult {
    /// The detected language code (e.g., "en", "es", "fr", "de"),
    /// or "unknown" when nothing could be scored
    pub detected_language: String,
    /// Confidence score for the detected language (0.0 to 1.0)
    pub confidence: f64,
    /// Scores for all supported languages, keyed by language code
    pub scores: HashMap<String, f64>,
}
386
387impl LanguageDetectionResult {
388    /// Get the top N language candidates with their scores.
389    pub fn top_candidates(&self, n: usize) -> Vec<(String, f64)> {
390        let mut sorted_scores: Vec<_> = self.scores.iter().collect();
391        sorted_scores.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap_or(std::cmp::Ordering::Equal));
392        sorted_scores
393            .into_iter()
394            .take(n)
395            .map(|(lang, score)| (lang.clone(), *score))
396            .collect()
397    }
398
399    /// Check if the detection confidence is above a threshold.
400    pub fn is_confident(&self, threshold: f64) -> bool {
401        self.confidence >= threshold
402    }
403}
404
/// Enhanced token distribution analysis for vocabulary optimization.
///
/// Bins are inclusive upper bounds: a value is bucketed into the first
/// bin it is less than or equal to.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TokenDistributionAnalyzer {
    /// Upper-bound bins for token length (in characters)
    length_bins: Vec<usize>,
    /// Upper-bound bins for token frequency counts
    frequency_bins: Vec<usize>,
}
413
414impl TokenDistributionAnalyzer {
415    /// Create a new token distribution analyzer with default bins.
416    pub fn new() -> Self {
417        Self {
418            length_bins: vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 50],
419            frequency_bins: vec![1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000],
420        }
421    }
422
423    /// Create a custom analyzer with specified bins.
424    pub fn with_bins(length_bins: Vec<usize>, frequency_bins: Vec<usize>) -> Self {
425        Self {
426            length_bins,
427            frequency_bins,
428        }
429    }
430
431    /// Analyze token distribution from a vocabulary mapping.
432    pub fn analyze_distribution(&self, vocab: &HashMap<String, u32>) -> TokenDistributionResult {
433        let mut length_distribution = HashMap::new();
434        let mut frequency_distribution = HashMap::new();
435        let mut character_usage = HashMap::new();
436        let mut prefix_analysis = HashMap::new();
437        let mut suffix_analysis = HashMap::new();
438
439        // Initialize bins
440        for &bin in &self.length_bins {
441            length_distribution.insert(bin, 0);
442        }
443        for &bin in &self.frequency_bins {
444            frequency_distribution.insert(bin, 0);
445        }
446
447        let mut total_tokens = 0;
448        let mut total_length = 0;
449        let mut max_length = 0;
450        let mut min_length = usize::MAX;
451
452        for (token, &freq) in vocab {
453            let length = token.chars().count();
454            total_tokens += 1;
455            total_length += length;
456            max_length = max_length.max(length);
457            min_length = min_length.min(length);
458
459            // Length distribution
460            for &bin in &self.length_bins {
461                if length <= bin {
462                    *length_distribution.entry(bin).or_insert(0) += 1;
463                    break;
464                }
465            }
466
467            // Frequency distribution (using token frequency)
468            let token_freq = freq as usize;
469            for &bin in &self.frequency_bins {
470                if token_freq <= bin {
471                    *frequency_distribution.entry(bin).or_insert(0) += 1;
472                    break;
473                }
474            }
475
476            // Character usage analysis
477            for ch in token.chars() {
478                *character_usage.entry(ch).or_insert(0) += 1;
479            }
480
481            // Prefix/suffix analysis (2-3 character prefixes/suffixes)
482            if length >= 2 {
483                let prefix2: String = token.chars().take(2).collect();
484                *prefix_analysis.entry(prefix2).or_insert(0) += 1;
485
486                let suffix2: String =
487                    token.chars().rev().take(2).collect::<String>().chars().rev().collect();
488                *suffix_analysis.entry(suffix2).or_insert(0) += 1;
489            }
490            if length >= 3 {
491                let prefix3: String = token.chars().take(3).collect();
492                *prefix_analysis.entry(prefix3).or_insert(0) += 1;
493
494                let suffix3: String =
495                    token.chars().rev().take(3).collect::<String>().chars().rev().collect();
496                *suffix_analysis.entry(suffix3).or_insert(0) += 1;
497            }
498        }
499
500        let average_length =
501            if total_tokens > 0 { total_length as f64 / total_tokens as f64 } else { 0.0 };
502
503        // Sort character usage by frequency
504        let mut char_frequency: Vec<_> = character_usage.into_iter().collect();
505        char_frequency.sort_by_key(|item| std::cmp::Reverse(item.1));
506
507        // Get top prefixes and suffixes
508        let mut prefix_frequency: Vec<_> = prefix_analysis.into_iter().collect();
509        prefix_frequency.sort_by_key(|item| std::cmp::Reverse(item.1));
510        prefix_frequency.truncate(20); // Top 20
511
512        let mut suffix_frequency: Vec<_> = suffix_analysis.into_iter().collect();
513        suffix_frequency.sort_by_key(|item| std::cmp::Reverse(item.1));
514        suffix_frequency.truncate(20); // Top 20
515
516        TokenDistributionResult {
517            total_tokens,
518            average_length,
519            max_length,
520            min_length: if min_length == usize::MAX { 0 } else { min_length },
521            length_distribution,
522            frequency_distribution,
523            character_frequency: char_frequency.into_iter().collect(),
524            prefix_frequency: prefix_frequency.into_iter().collect(),
525            suffix_frequency: suffix_frequency.into_iter().collect(),
526        }
527    }
528}
529
530impl Default for TokenDistributionAnalyzer {
531    fn default() -> Self {
532        Self::new()
533    }
534}
535
/// Token distribution analysis result with comprehensive statistics.
///
/// The frequency fields are `HashMap`s, so their iteration order is
/// arbitrary; consumers that need ranked output must sort by count.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TokenDistributionResult {
    /// Total number of tokens analyzed
    pub total_tokens: usize,
    /// Average token length in characters
    pub average_length: f64,
    /// Maximum token length
    pub max_length: usize,
    /// Minimum token length (0 when no tokens were analyzed)
    pub min_length: usize,
    /// Distribution of tokens by length bins (bin upper bound -> count)
    pub length_distribution: HashMap<usize, usize>,
    /// Distribution of tokens by frequency bins (bin upper bound -> count)
    pub frequency_distribution: HashMap<usize, usize>,
    /// Character usage frequency across all tokens
    pub character_frequency: HashMap<char, usize>,
    /// Prefix frequency analysis (2- and 3-character prefixes)
    pub prefix_frequency: HashMap<String, usize>,
    /// Suffix frequency analysis (2- and 3-character suffixes)
    pub suffix_frequency: HashMap<String, usize>,
}
558
559impl TokenDistributionResult {
560    /// Generate a human-readable analysis report.
561    pub fn generate_report(&self) -> String {
562        format!(
563            "Token Distribution Analysis Report\n\
564             ===================================\n\
565             Total Tokens: {}\n\
566             Average Token Length: {:.2}\n\
567             Min/Max Length: {}/{}\n\
568             \n\
569             Length Distribution:\n\
570             {}\n\
571             \n\
572             Top 10 Characters by Frequency:\n\
573             {}\n\
574             \n\
575             Top 10 Prefixes:\n\
576             {}\n\
577             \n\
578             Top 10 Suffixes:\n\
579             {}",
580            self.total_tokens,
581            self.average_length,
582            self.min_length,
583            self.max_length,
584            self.format_length_distribution(),
585            self.format_character_frequency(),
586            self.format_prefix_frequency(),
587            self.format_suffix_frequency()
588        )
589    }
590
591    /// Format length distribution for display.
592    fn format_length_distribution(&self) -> String {
593        let mut sorted: Vec<_> = self.length_distribution.iter().collect();
594        sorted.sort_by_key(|(len, _)| *len);
595
596        sorted
597            .iter()
598            .map(|(len, count)| format!("  Length ≤{}: {} tokens", len, count))
599            .collect::<Vec<_>>()
600            .join("\n")
601    }
602
603    /// Format character frequency for display.
604    fn format_character_frequency(&self) -> String {
605        self.character_frequency
606            .iter()
607            .take(10)
608            .map(|(ch, count)| format!("  '{}': {} occurrences", ch, count))
609            .collect::<Vec<_>>()
610            .join("\n")
611    }
612
613    /// Format prefix frequency for display.
614    fn format_prefix_frequency(&self) -> String {
615        self.prefix_frequency
616            .iter()
617            .take(10)
618            .map(|(prefix, count)| format!("  '{}': {} tokens", prefix, count))
619            .collect::<Vec<_>>()
620            .join("\n")
621    }
622
623    /// Format suffix frequency for display.
624    fn format_suffix_frequency(&self) -> String {
625        self.suffix_frequency
626            .iter()
627            .take(10)
628            .map(|(suffix, count)| format!("  '{}': {} tokens", suffix, count))
629            .collect::<Vec<_>>()
630            .join("\n")
631    }
632
633    /// Calculate vocabulary diversity score.
634    pub fn diversity_score(&self) -> f64 {
635        if self.total_tokens == 0 {
636            return 0.0;
637        }
638
639        // Calculate entropy-based diversity score
640        let mut entropy = 0.0;
641        let total_chars: usize = self.character_frequency.values().sum();
642
643        if total_chars > 0 {
644            for &freq in self.character_frequency.values() {
645                let prob = freq as f64 / total_chars as f64;
646                if prob > 0.0 {
647                    entropy -= prob * prob.log2();
648                }
649            }
650        }
651
652        entropy
653    }
654
655    /// Get optimal length range based on the distribution.
656    pub fn optimal_length_range(&self) -> (usize, usize) {
657        let mut cumulative = 0;
658        let target_coverage = (self.total_tokens as f64 * 0.8) as usize; // 80% coverage
659
660        let mut sorted: Vec<_> = self.length_distribution.iter().collect();
661        sorted.sort_by_key(|(len, _)| *len);
662
663        let min_optimal = self.min_length;
664        let mut max_optimal = self.max_length;
665
666        for (len, count) in sorted {
667            cumulative += count;
668            if cumulative >= target_coverage {
669                max_optimal = *len;
670                break;
671            }
672        }
673
674        (min_optimal, max_optimal)
675    }
676}
677
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks CoverageAnalysis derived statistics against hand-computed
    /// values: weighted mean length, efficiency score, summary formatting,
    /// and length ranking.
    #[test]
    fn test_coverage_analysis() {
        let analysis = CoverageAnalysis {
            char_coverage_rate: 0.95,
            word_coverage_rate: 0.88,
            compression_ratio: 0.6,
            total_chars: 1000,
            total_words: 200,
            total_tokens: 600,
            covered_chars: 950,
            covered_words: 176,
            length_distribution: {
                let mut dist = HashMap::new();
                dist.insert(1, 100);
                dist.insert(2, 200);
                dist.insert(3, 150);
                dist.insert(4, 100);
                dist.insert(5, 50);
                dist
            },
            oov_tokens: vec!["[UNK]".to_string()],
            vocab_size: 5000,
        };

        // (1*100 + 2*200 + 3*150 + 4*100 + 5*50) / 600 = 1600 / 600 = 2.666...
        assert!((analysis.average_token_length() - 2.6666666666666665).abs() < 1e-10);
        assert!(analysis.efficiency_score() > 0.0);

        let summary = analysis.summary();
        assert!(summary.contains("95.00%"));
        assert!(summary.contains("88.00%"));

        // Length 2 has the unique highest count (200), so it ranks first.
        let top_lengths = analysis.top_token_lengths(3);
        assert_eq!(top_lengths[0], (2, 200));
    }

    /// English text should score highest against the built-in profiles.
    #[test]
    fn test_language_detector() {
        let detector = LanguageDetector::new();

        assert_eq!(detector.supported_languages().len(), 4);
        assert!(detector.supported_languages().contains(&"en".to_string()));

        let result = detector.detect_language("Hello world, this is a test in English");
        assert_eq!(result.detected_language, "en");
        assert!(result.confidence > 0.0);

        let top_candidates = result.top_candidates(2);
        assert_eq!(top_candidates[0].0, "en");

        assert!(result.is_confident(0.1));
    }

    /// Empty input must short-circuit to "unknown" with zero confidence.
    #[test]
    fn test_language_detector_empty_text() {
        let detector = LanguageDetector::new();
        let result = detector.detect_language("");
        assert_eq!(result.detected_language, "unknown");
        assert_eq!(result.confidence, 0.0);
    }

    /// Basic distribution statistics over a small five-token vocabulary
    /// ("a" is the shortest at 1 char, "hello"/"world" the longest at 5).
    #[test]
    fn test_token_distribution_analyzer() {
        let analyzer = TokenDistributionAnalyzer::new();

        let mut vocab = HashMap::new();
        vocab.insert("a".to_string(), 1);
        vocab.insert("the".to_string(), 2);
        vocab.insert("hello".to_string(), 3);
        vocab.insert("world".to_string(), 4);
        vocab.insert("test".to_string(), 5);

        let result = analyzer.analyze_distribution(&vocab);

        assert_eq!(result.total_tokens, 5);
        assert!(result.average_length > 0.0);
        assert_eq!(result.min_length, 1);
        assert_eq!(result.max_length, 5);

        let report = result.generate_report();
        assert!(report.contains("Token Distribution Analysis Report"));
        assert!(report.contains("Total Tokens: 5"));

        assert!(result.diversity_score() > 0.0);

        let (min_opt, max_opt) = result.optimal_length_range();
        assert!(min_opt <= max_opt);
    }

    /// A registered custom profile must win detection for text dominated
    /// by its characters/bigrams.
    #[test]
    fn test_custom_language_profile() {
        let mut detector = LanguageDetector::new();

        let mut custom_chars = HashMap::new();
        custom_chars.insert('x', 50.0);
        custom_chars.insert('y', 40.0);
        custom_chars.insert('z', 30.0);

        let mut custom_ngrams = HashMap::new();
        custom_ngrams.insert("xy".to_string(), 20.0);
        custom_ngrams.insert("yz".to_string(), 15.0);
        custom_ngrams.insert("xz".to_string(), 10.0);

        detector.add_language_profile("custom".to_string(), custom_chars, custom_ngrams);

        assert!(detector.supported_languages().contains(&"custom".to_string()));

        // Use a text with very high density of custom characters
        let result = detector.detect_language("xyxyxyxyxyzyzyzyzxzxzxzxz");
        assert_eq!(result.detected_language, "custom");
    }

    /// Custom bins: only the token count is asserted; bin placement is
    /// covered by the analyzer itself.
    #[test]
    fn test_distribution_analyzer_custom_bins() {
        let analyzer = TokenDistributionAnalyzer::with_bins(vec![1, 3, 5], vec![1, 10, 100]);

        let mut vocab = HashMap::new();
        vocab.insert("a".to_string(), 1);
        vocab.insert("abc".to_string(), 15);
        vocab.insert("abcde".to_string(), 150);

        let result = analyzer.analyze_distribution(&vocab);
        assert_eq!(result.total_tokens, 3);
    }
}