trustformers_tokenizers/
vocab_analyzer.rs

1use serde::{Deserialize, Serialize};
2use std::cmp::Ordering;
3use std::collections::{BTreeMap, HashMap, HashSet};
4use trustformers_core::errors::Result;
5use trustformers_core::traits::Tokenizer;
6
/// Configuration for vocabulary analysis.
///
/// Holds the thresholds and on/off switches read by [`VocabAnalyzer`] while it
/// inspects a vocabulary.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VocabAnalysisConfig {
    /// Minimum frequency threshold for rare token detection
    pub rare_token_threshold: usize,
    /// Maximum token length for analysis; long-token detection compares this
    /// against a token's character count (not its byte length)
    pub max_token_length: usize,
    /// Whether to analyze character patterns
    pub analyze_character_patterns: bool,
    /// Whether to detect potential encoding issues
    pub detect_encoding_issues: bool,
    /// Whether to analyze subword patterns
    pub analyze_subword_patterns: bool,
    /// Whether to check for duplicates and near-duplicates.
    /// NOTE(review): in `analyze_vocabulary` this flag only gates the
    /// near-duplicate pass; the shared-ID duplicate check runs regardless.
    pub check_duplicates: bool,
    /// Languages to analyze (if empty, analyze all)
    pub target_languages: Vec<String>,
    /// Whether to include detailed statistics
    pub include_detailed_stats: bool,
}
27
28impl Default for VocabAnalysisConfig {
29    fn default() -> Self {
30        Self {
31            rare_token_threshold: 1,
32            max_token_length: 100,
33            analyze_character_patterns: true,
34            detect_encoding_issues: true,
35            analyze_subword_patterns: true,
36            check_duplicates: true,
37            target_languages: Vec::new(),
38            include_detailed_stats: true,
39        }
40    }
41}
42
/// Represents an issue found in the vocabulary.
///
/// One value is produced per detected problem class (or per group of
/// offending tokens) by the `detect_*` methods of [`VocabAnalyzer`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VocabIssue {
    /// Type of issue
    pub issue_type: VocabIssueType,
    /// Severity level
    pub severity: IssueSeverity,
    /// Human-readable description of the issue
    pub description: String,
    /// Tokens the issue applies to
    pub affected_tokens: Vec<String>,
    /// Suggested remedial action, if any
    pub suggestion: Option<String>,
}
57
/// Types of vocabulary issues.
///
/// Used as the discriminating tag on [`VocabIssue`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum VocabIssueType {
    /// Several distinct tokens mapped to the same ID
    DuplicateTokens,
    /// Near-duplicate tokens (very similar strings)
    NearDuplicates,
    /// Extremely rare tokens
    RareTokens,
    /// Very long tokens
    LongTokens,
    /// Potential encoding issues (e.g. mojibake-looking character sequences)
    EncodingIssues,
    /// Invalid UTF-8 sequences
    InvalidUtf8,
    /// Inconsistent casing
    InconsistentCasing,
    /// Missing common tokens
    MissingCommonTokens,
    /// Inefficient subword decomposition
    InefficientSubwords,
    /// Overlapping tokens
    OverlappingTokens,
    /// Orphaned tokens (no usage)
    OrphanedTokens,
}
84
/// Severity levels for issues.
///
/// `PartialOrd`/`Ord` are derived, so variants compare in declaration order:
/// `Low < Medium < High < Critical`. Keep the variants sorted from least to
/// most severe when adding new ones.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum IssueSeverity {
    Low,
    Medium,
    High,
    Critical,
}
93
/// Character pattern analysis: one entry per character-class bucket
/// (for example "alphabetic", "numeric" or "hyphenated").
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CharacterPattern {
    /// Pattern description (the bucket label)
    pub pattern: String,
    /// Number of tokens matching this pattern
    pub count: usize,
    /// Example tokens (the analyzer keeps at most 10 per pattern)
    pub examples: Vec<String>,
    /// Pattern frequency as a fraction of the vocabulary size
    /// (`count / vocab.len()`, not a percentage)
    pub frequency: f64,
}
106
/// Subword pattern analysis: statistics for one character n-gram
/// (2 to 4 characters long) found inside vocabulary tokens.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SubwordPattern {
    /// The subword pattern
    pub pattern: String,
    /// Number of occurrences (every position in every token is counted)
    pub count: usize,
    /// Tokens containing this pattern (the analyzer keeps at most 5 samples)
    pub tokens: Vec<String>,
    /// Occurrences by position; keys are "prefix", "infix" and "suffix"
    pub positions: HashMap<String, usize>, // position_type -> count
}
119
/// Language detection result for tokens: the share of the vocabulary that was
/// attributed to one language.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageDistribution {
    /// Language code (e.g. "en", "ru"), or "unknown" when no script matched
    pub language: String,
    /// Number of tokens in this language
    pub token_count: usize,
    /// Percentage of total vocabulary (0–100 scale)
    pub percentage: f64,
    /// Heuristic confidence score; the analyzer clamps it to `[0.1, 1.0]`
    pub confidence: f64,
}
132
/// Comprehensive vocabulary analysis results, as returned by
/// `VocabAnalyzer::analyze_vocabulary`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VocabAnalysisResult {
    /// Basic statistics
    pub basic_stats: VocabBasicStats,
    /// Detected issues
    pub issues: Vec<VocabIssue>,
    /// Character patterns (empty when that pass is disabled in the config)
    pub character_patterns: Vec<CharacterPattern>,
    /// Subword patterns (empty when that pass is disabled in the config)
    pub subword_patterns: Vec<SubwordPattern>,
    /// Language distribution
    pub language_distribution: Vec<LanguageDistribution>,
    /// Token length distribution: character count -> number of tokens
    pub length_distribution: BTreeMap<usize, usize>,
    /// Most/least frequent tokens
    pub frequency_analysis: FrequencyAnalysis,
    /// Coverage analysis; `analyze_vocabulary` leaves this as `None`
    /// (coverage is computed separately by `analyze_coverage`)
    pub coverage_analysis: Option<CoverageAnalysis>,
    /// Recommendations
    pub recommendations: Vec<String>,
}
155
/// Basic vocabulary statistics.
///
/// All lengths are measured in Unicode scalar values (`chars().count()`),
/// not bytes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VocabBasicStats {
    /// Total number of tokens
    pub total_tokens: usize,
    /// Number of unique tokens (computed from the map's keys, so it currently
    /// equals `total_tokens`)
    pub unique_tokens: usize,
    /// Average token length (0.0 for an empty vocabulary)
    pub avg_token_length: f64,
    /// Minimum token length (0 for an empty vocabulary)
    pub min_token_length: usize,
    /// Maximum token length
    pub max_token_length: usize,
    /// Number of alphabetic tokens
    pub alphabetic_tokens: usize,
    /// Number of numeric tokens
    pub numeric_tokens: usize,
    /// Number of mixed tokens (contain both alphabetic and numeric characters)
    pub mixed_tokens: usize,
    /// Number of special character tokens (everything not counted elsewhere)
    pub special_char_tokens: usize,
    /// Number of whitespace tokens
    pub whitespace_tokens: usize,
}
180
/// Frequency analysis results.
///
/// The frequencies stored here are the analyzer's heuristic estimates
/// (see `estimate_token_frequency`), not counts measured on a corpus.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FrequencyAnalysis {
    /// Most frequent tokens (at most 20), highest estimate first
    pub most_frequent: Vec<(String, u32)>,
    /// Least frequent tokens (at most 20), lowest estimate first
    pub least_frequent: Vec<(String, u32)>,
    /// Tokens whose frequency is exactly 1
    pub singleton_tokens: Vec<String>,
    /// Frequency distribution histogram
    pub frequency_histogram: BTreeMap<u32, usize>, // frequency -> count of tokens
}
193
/// Coverage analysis for a corpus, as returned by
/// `VocabAnalyzer::analyze_coverage`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CoverageAnalysis {
    /// Total characters in corpus
    pub total_chars: usize,
    /// Characters covered by vocabulary: the summed character counts of the
    /// token strings that the produced IDs resolve to
    pub covered_chars: usize,
    /// Coverage percentage (`covered_chars / total_chars * 100`, or 0.0 for an
    /// empty corpus)
    pub coverage_percentage: f64,
    /// Out-of-vocabulary tokens found, recorded as `<UNK:id>` placeholders
    pub oov_tokens: Vec<String>,
    /// Most common OOV patterns, most frequent first
    pub oov_patterns: Vec<String>,
}
208
/// Main vocabulary analyzer.
///
/// Holds only its configuration; the analysis methods take the vocabulary
/// (or tokenizer) as an argument.
pub struct VocabAnalyzer {
    // Thresholds and feature switches consulted by the analysis passes.
    config: VocabAnalysisConfig,
}
213
214impl VocabAnalyzer {
215    /// Create a new vocabulary analyzer
216    pub fn new(config: VocabAnalysisConfig) -> Self {
217        Self { config }
218    }
219
220    /// Create analyzer with default configuration
221    pub fn default() -> Self {
222        Self::new(VocabAnalysisConfig::default())
223    }
224
225    /// Analyze a tokenizer's vocabulary
226    pub fn analyze_tokenizer<T: Tokenizer>(&self, tokenizer: &T) -> Result<VocabAnalysisResult> {
227        let vocab = tokenizer.get_vocab();
228        self.analyze_vocabulary(&vocab)
229    }
230
231    /// Analyze a vocabulary directly
232    pub fn analyze_vocabulary(&self, vocab: &HashMap<String, u32>) -> Result<VocabAnalysisResult> {
233        let mut result = VocabAnalysisResult {
234            basic_stats: self.calculate_basic_stats(vocab),
235            issues: Vec::new(),
236            character_patterns: Vec::new(),
237            subword_patterns: Vec::new(),
238            language_distribution: Vec::new(),
239            length_distribution: BTreeMap::new(),
240            frequency_analysis: self.analyze_frequency(vocab),
241            coverage_analysis: None,
242            recommendations: Vec::new(),
243        };
244
245        // Detect issues
246        result.issues.extend(self.detect_duplicate_tokens(vocab)?);
247        result.issues.extend(self.detect_rare_tokens(vocab)?);
248        result.issues.extend(self.detect_long_tokens(vocab)?);
249
250        if self.config.detect_encoding_issues {
251            result.issues.extend(self.detect_encoding_issues(vocab)?);
252        }
253
254        if self.config.check_duplicates {
255            result.issues.extend(self.detect_near_duplicates(vocab)?);
256        }
257
258        // Analyze patterns
259        if self.config.analyze_character_patterns {
260            result.character_patterns = self.analyze_character_patterns(vocab)?;
261        }
262
263        if self.config.analyze_subword_patterns {
264            result.subword_patterns = self.analyze_subword_patterns(vocab)?;
265        }
266
267        // Calculate length distribution
268        result.length_distribution = self.calculate_length_distribution(vocab);
269
270        // Detect language distribution
271        result.language_distribution = self.detect_language_distribution(vocab)?;
272
273        // Generate recommendations
274        result.recommendations = self.generate_recommendations(&result);
275
276        Ok(result)
277    }
278
279    /// Analyze vocabulary coverage for a given corpus
280    pub fn analyze_coverage<T: Tokenizer>(
281        &self,
282        tokenizer: &T,
283        corpus: &[String],
284    ) -> Result<CoverageAnalysis> {
285        let mut total_chars = 0;
286        let mut covered_chars = 0;
287        let mut oov_tokens = HashSet::new();
288
289        for text in corpus {
290            total_chars += text.chars().count();
291
292            // Tokenize and check coverage
293            let tokenized = tokenizer.encode(text)?;
294            for &token_id in &tokenized.input_ids {
295                if let Some(token) = tokenizer.id_to_token(token_id) {
296                    covered_chars += token.chars().count();
297                } else {
298                    oov_tokens.insert(format!("<UNK:{}>", token_id));
299                }
300            }
301        }
302
303        let coverage_percentage = if total_chars > 0 {
304            (covered_chars as f64 / total_chars as f64) * 100.0
305        } else {
306            0.0
307        };
308
309        // Analyze OOV patterns
310        let oov_tokens_vec: Vec<String> = oov_tokens.iter().cloned().collect();
311        let oov_patterns = self.analyze_oov_patterns(&oov_tokens_vec);
312
313        Ok(CoverageAnalysis {
314            total_chars,
315            covered_chars,
316            coverage_percentage,
317            oov_tokens: oov_tokens_vec,
318            oov_patterns,
319        })
320    }
321
322    /// Calculate basic vocabulary statistics
323    fn calculate_basic_stats(&self, vocab: &HashMap<String, u32>) -> VocabBasicStats {
324        let total_tokens = vocab.len();
325        let unique_tokens = vocab.keys().len();
326
327        let mut total_length = 0;
328        let mut min_length = usize::MAX;
329        let mut max_length = 0;
330        let mut alphabetic_count = 0;
331        let mut numeric_count = 0;
332        let mut mixed_count = 0;
333        let mut special_char_count = 0;
334        let mut whitespace_count = 0;
335
336        for token in vocab.keys() {
337            let len = token.chars().count();
338            total_length += len;
339            min_length = min_length.min(len);
340            max_length = max_length.max(len);
341
342            // Classify token type
343            if token.chars().all(|c| c.is_alphabetic()) {
344                alphabetic_count += 1;
345            } else if token.chars().all(|c| c.is_numeric()) {
346                numeric_count += 1;
347            } else if token.chars().any(|c| c.is_alphabetic())
348                && token.chars().any(|c| c.is_numeric())
349            {
350                mixed_count += 1;
351            } else if token.chars().all(|c| c.is_whitespace()) {
352                whitespace_count += 1;
353            } else {
354                special_char_count += 1;
355            }
356        }
357
358        let avg_token_length =
359            if total_tokens > 0 { total_length as f64 / total_tokens as f64 } else { 0.0 };
360
361        VocabBasicStats {
362            total_tokens,
363            unique_tokens,
364            avg_token_length,
365            min_token_length: if min_length == usize::MAX { 0 } else { min_length },
366            max_token_length: max_length,
367            alphabetic_tokens: alphabetic_count,
368            numeric_tokens: numeric_count,
369            mixed_tokens: mixed_count,
370            special_char_tokens: special_char_count,
371            whitespace_tokens: whitespace_count,
372        }
373    }
374
375    /// Analyze token frequency
376    fn analyze_frequency(&self, vocab: &HashMap<String, u32>) -> FrequencyAnalysis {
377        // For this analysis, we'll assume all tokens have frequency 1 unless we have frequency data
378        // In a real implementation, you'd want to pass frequency information
379        // Generate realistic frequency distribution based on token characteristics
380        let mut token_freq: Vec<(String, u32)> = vocab
381            .iter()
382            .map(|(token, &_id)| {
383                // Calculate frequency based on token characteristics
384                let base_freq = self.estimate_token_frequency(token);
385                (token.clone(), base_freq)
386            })
387            .collect();
388
389        token_freq.sort_by_key(|item| std::cmp::Reverse(item.1));
390
391        let most_frequent = token_freq.iter().take(20).cloned().collect();
392        let least_frequent = token_freq.iter().rev().take(20).cloned().collect();
393
394        let singleton_tokens = token_freq
395            .iter()
396            .filter(|(_, freq)| *freq == 1)
397            .map(|(token, _)| token.clone())
398            .collect();
399
400        // Build frequency histogram
401        let mut frequency_histogram = BTreeMap::new();
402        for (_, freq) in &token_freq {
403            *frequency_histogram.entry(*freq).or_insert(0) += 1;
404        }
405
406        FrequencyAnalysis {
407            most_frequent,
408            least_frequent,
409            singleton_tokens,
410            frequency_histogram,
411        }
412    }
413
414    /// Detect duplicate tokens
415    fn detect_duplicate_tokens(&self, vocab: &HashMap<String, u32>) -> Result<Vec<VocabIssue>> {
416        let mut id_to_tokens: HashMap<u32, Vec<String>> = HashMap::new();
417
418        for (token, &id) in vocab {
419            id_to_tokens.entry(id).or_default().push(token.clone());
420        }
421
422        let mut issues = Vec::new();
423        for (id, tokens) in id_to_tokens {
424            if tokens.len() > 1 {
425                issues.push(VocabIssue {
426                    issue_type: VocabIssueType::DuplicateTokens,
427                    severity: IssueSeverity::High,
428                    description: format!("Multiple tokens share ID {}: {:?}", id, tokens),
429                    affected_tokens: tokens,
430                    suggestion: Some("Ensure each token has a unique ID".to_string()),
431                });
432            }
433        }
434
435        Ok(issues)
436    }
437
438    /// Estimate token frequency based on characteristics
439    fn estimate_token_frequency(&self, token: &str) -> u32 {
440        let mut score = 1000u32; // Base frequency
441
442        // Common patterns get higher frequency
443        if token.chars().all(|c| c.is_ascii_alphabetic()) {
444            score += 500; // Common alphabetic tokens
445        }
446
447        // Shorter tokens tend to be more frequent
448        match token.len() {
449            1..=3 => score += 1000,
450            4..=6 => score += 500,
451            7..=10 => score += 100,
452            _ => score /= 2, // Very long tokens are rarer
453        }
454
455        // Special tokens and common patterns
456        if token.starts_with('<') && token.ends_with('>') {
457            score += 800; // Special tokens
458        } else if token.contains("##") {
459            score += 300; // Subword pieces
460        } else if token.chars().all(|c| c.is_ascii_punctuation()) {
461            score += 200; // Punctuation
462        }
463
464        // Common English letters/patterns boost frequency
465        let common_chars = ['e', 't', 'a', 'o', 'i', 'n', 's', 'h', 'r'];
466        if token.chars().any(|c| common_chars.contains(&c.to_ascii_lowercase())) {
467            score += 200;
468        }
469
470        // Add some randomness to make distribution more realistic
471        let hash_value =
472            token.chars().fold(0u32, |acc, c| acc.wrapping_mul(31).wrapping_add(c as u32));
473        score += hash_value % 200;
474
475        score.max(1) // Ensure minimum frequency of 1
476    }
477
478    /// Detect rare tokens
479    fn detect_rare_tokens(&self, vocab: &HashMap<String, u32>) -> Result<Vec<VocabIssue>> {
480        // Use estimated frequency to identify rare tokens
481        let rare_tokens: Vec<String> = vocab
482            .keys()
483            .filter(|token| {
484                let estimated_freq = self.estimate_token_frequency(token);
485                // Consider tokens rare if they have very low estimated frequency
486                estimated_freq < 100 || token.len() > 20
487            })
488            .take(100)
489            .cloned()
490            .collect();
491
492        if !rare_tokens.is_empty() {
493            Ok(vec![VocabIssue {
494                issue_type: VocabIssueType::RareTokens,
495                severity: IssueSeverity::Low,
496                description: format!("Found {} potentially rare tokens", rare_tokens.len()),
497                affected_tokens: rare_tokens,
498                suggestion: Some(
499                    "Consider removing very rare tokens to reduce vocabulary size".to_string(),
500                ),
501            }])
502        } else {
503            Ok(Vec::new())
504        }
505    }
506
507    /// Detect excessively long tokens
508    fn detect_long_tokens(&self, vocab: &HashMap<String, u32>) -> Result<Vec<VocabIssue>> {
509        let long_tokens: Vec<String> = vocab
510            .keys()
511            .filter(|token| token.chars().count() > self.config.max_token_length)
512            .cloned()
513            .collect();
514
515        if !long_tokens.is_empty() {
516            Ok(vec![VocabIssue {
517                issue_type: VocabIssueType::LongTokens,
518                severity: IssueSeverity::Medium,
519                description: format!(
520                    "Found {} tokens exceeding maximum length of {}",
521                    long_tokens.len(),
522                    self.config.max_token_length
523                ),
524                affected_tokens: long_tokens,
525                suggestion: Some("Consider truncating or removing very long tokens".to_string()),
526            }])
527        } else {
528            Ok(Vec::new())
529        }
530    }
531
532    /// Detect encoding issues
533    fn detect_encoding_issues(&self, vocab: &HashMap<String, u32>) -> Result<Vec<VocabIssue>> {
534        let mut issues = Vec::new();
535        let mut invalid_utf8_tokens = Vec::new();
536        let mut mojibake_tokens = Vec::new();
537
538        for token in vocab.keys() {
539            // Check for invalid UTF-8 (though this should be rare in Rust strings)
540            if !token.is_ascii() && token.chars().any(|c| c as u32 > 0x10FFFF) {
541                invalid_utf8_tokens.push(token.clone());
542            }
543
544            // Check for potential mojibake patterns
545            if token.contains("Ã") || token.contains("â") || token.contains("Â") {
546                mojibake_tokens.push(token.clone());
547            }
548        }
549
550        if !invalid_utf8_tokens.is_empty() {
551            issues.push(VocabIssue {
552                issue_type: VocabIssueType::InvalidUtf8,
553                severity: IssueSeverity::Critical,
554                description: "Found tokens with invalid UTF-8 sequences".to_string(),
555                affected_tokens: invalid_utf8_tokens,
556                suggestion: Some("Fix encoding issues before tokenization".to_string()),
557            });
558        }
559
560        if !mojibake_tokens.is_empty() {
561            issues.push(VocabIssue {
562                issue_type: VocabIssueType::EncodingIssues,
563                severity: IssueSeverity::High,
564                description: "Found tokens with potential mojibake patterns".to_string(),
565                affected_tokens: mojibake_tokens,
566                suggestion: Some("Check for encoding issues in source data".to_string()),
567            });
568        }
569
570        Ok(issues)
571    }
572
573    /// Detect near-duplicate tokens
574    fn detect_near_duplicates(&self, vocab: &HashMap<String, u32>) -> Result<Vec<VocabIssue>> {
575        let mut near_duplicates = Vec::new();
576        let tokens: Vec<&String> = vocab.keys().collect();
577
578        for i in 0..tokens.len() {
579            for j in (i + 1)..tokens.len() {
580                let similarity = self.calculate_similarity(tokens[i], tokens[j]);
581                if similarity > 0.9 && similarity < 1.0 {
582                    near_duplicates.push(vec![tokens[i].clone(), tokens[j].clone()]);
583                }
584            }
585        }
586
587        if !near_duplicates.is_empty() {
588            let affected_tokens: Vec<String> = near_duplicates.iter().flatten().cloned().collect();
589
590            Ok(vec![VocabIssue {
591                issue_type: VocabIssueType::NearDuplicates,
592                severity: IssueSeverity::Medium,
593                description: format!(
594                    "Found {} pairs of near-duplicate tokens",
595                    near_duplicates.len()
596                ),
597                affected_tokens,
598                suggestion: Some(
599                    "Review near-duplicate tokens and consider merging or removing".to_string(),
600                ),
601            }])
602        } else {
603            Ok(Vec::new())
604        }
605    }
606
607    /// Calculate similarity between two strings
608    fn calculate_similarity(&self, s1: &str, s2: &str) -> f64 {
609        let len1 = s1.chars().count();
610        let len2 = s2.chars().count();
611
612        if len1 == 0 && len2 == 0 {
613            return 1.0;
614        }
615
616        let max_len = len1.max(len2);
617        let distance = self.levenshtein_distance(s1, s2);
618
619        1.0 - (distance as f64 / max_len as f64)
620    }
621
622    /// Calculate Levenshtein distance
623    fn levenshtein_distance(&self, s1: &str, s2: &str) -> usize {
624        let chars1: Vec<char> = s1.chars().collect();
625        let chars2: Vec<char> = s2.chars().collect();
626        let len1 = chars1.len();
627        let len2 = chars2.len();
628
629        let mut matrix = vec![vec![0; len2 + 1]; len1 + 1];
630
631        for (i, row) in matrix.iter_mut().enumerate().take(len1 + 1) {
632            row[0] = i;
633        }
634        for (j, val) in matrix[0].iter_mut().enumerate().take(len2 + 1) {
635            *val = j;
636        }
637
638        for i in 1..=len1 {
639            for j in 1..=len2 {
640                let cost = if chars1[i - 1] == chars2[j - 1] { 0 } else { 1 };
641                matrix[i][j] = (matrix[i - 1][j] + 1)
642                    .min(matrix[i][j - 1] + 1)
643                    .min(matrix[i - 1][j - 1] + cost);
644            }
645        }
646
647        matrix[len1][len2]
648    }
649
650    /// Analyze character patterns in vocabulary
651    fn analyze_character_patterns(
652        &self,
653        vocab: &HashMap<String, u32>,
654    ) -> Result<Vec<CharacterPattern>> {
655        let mut patterns = HashMap::new();
656
657        for token in vocab.keys() {
658            // Analyze different patterns
659            let pattern_type = if token.chars().all(|c| c.is_alphabetic()) {
660                "alphabetic"
661            } else if token.chars().all(|c| c.is_numeric()) {
662                "numeric"
663            } else if token.chars().all(|c| c.is_alphanumeric()) {
664                "alphanumeric"
665            } else if token.starts_with('#') {
666                "hashtag"
667            } else if token.starts_with('@') {
668                "mention"
669            } else if token.contains('_') {
670                "underscore"
671            } else if token.contains('-') {
672                "hyphenated"
673            } else {
674                "mixed"
675            };
676
677            let entry = patterns.entry(pattern_type.to_string()).or_insert_with(|| (0, Vec::new()));
678            entry.0 += 1;
679            if entry.1.len() < 10 {
680                entry.1.push(token.clone());
681            }
682        }
683
684        let total_tokens = vocab.len() as f64;
685        let mut result = Vec::new();
686
687        for (pattern, (count, examples)) in patterns {
688            result.push(CharacterPattern {
689                pattern,
690                count,
691                examples,
692                frequency: count as f64 / total_tokens,
693            });
694        }
695
696        result.sort_by_key(|item| std::cmp::Reverse(item.count));
697        Ok(result)
698    }
699
700    /// Analyze subword patterns
701    fn analyze_subword_patterns(
702        &self,
703        vocab: &HashMap<String, u32>,
704    ) -> Result<Vec<SubwordPattern>> {
705        let mut subword_counts: HashMap<String, (usize, Vec<String>, HashMap<String, usize>)> =
706            HashMap::new();
707
708        for token in vocab.keys() {
709            // Extract potential subwords (2-4 characters)
710            for len in 2..=4.min(token.chars().count()) {
711                for start in 0..=(token.chars().count().saturating_sub(len)) {
712                    let subword: String = token.chars().skip(start).take(len).collect();
713
714                    let position_type = if start == 0 {
715                        "prefix"
716                    } else if start + len == token.chars().count() {
717                        "suffix"
718                    } else {
719                        "infix"
720                    };
721
722                    let entry = subword_counts
723                        .entry(subword)
724                        .or_insert_with(|| (0, Vec::new(), HashMap::new()));
725                    entry.0 += 1;
726                    if entry.1.len() < 5 {
727                        entry.1.push(token.clone());
728                    }
729                    *entry.2.entry(position_type.to_string()).or_insert(0) += 1;
730                }
731            }
732        }
733
734        let mut result: Vec<SubwordPattern> = subword_counts
735            .into_iter()
736            .filter(|(_, (count, _, _))| *count >= 3) // Only patterns appearing 3+ times
737            .map(|(pattern, (count, tokens, positions))| SubwordPattern {
738                pattern,
739                count,
740                tokens,
741                positions,
742            })
743            .collect();
744
745        result.sort_by_key(|item| std::cmp::Reverse(item.count));
746        result.truncate(50); // Limit to top 50 patterns
747        Ok(result)
748    }
749
750    /// Calculate token length distribution
751    fn calculate_length_distribution(
752        &self,
753        vocab: &HashMap<String, u32>,
754    ) -> BTreeMap<usize, usize> {
755        let mut distribution = BTreeMap::new();
756
757        for token in vocab.keys() {
758            let length = token.chars().count();
759            *distribution.entry(length).or_insert(0) += 1;
760        }
761
762        distribution
763    }
764
765    /// Detect language distribution in vocabulary
766    fn detect_language_distribution(
767        &self,
768        vocab: &HashMap<String, u32>,
769    ) -> Result<Vec<LanguageDistribution>> {
770        // Simplified language detection based on character ranges
771        let mut language_counts = HashMap::new();
772
773        for token in vocab.keys() {
774            let language = self.detect_token_language(token);
775            *language_counts.entry(language).or_insert(0) += 1;
776        }
777
778        let total_tokens = vocab.len() as f64;
779        let mut distribution: Vec<LanguageDistribution> = language_counts
780            .into_iter()
781            .map(|(language, count)| {
782                // Calculate confidence based on token count and language characteristics
783                let confidence = self.calculate_language_confidence(&language, count, total_tokens);
784                LanguageDistribution {
785                    language,
786                    token_count: count,
787                    percentage: (count as f64 / total_tokens) * 100.0,
788                    confidence,
789                }
790            })
791            .collect();
792
793        distribution.sort_by_key(|item| std::cmp::Reverse(item.token_count));
794        Ok(distribution)
795    }
796
797    /// Simple language detection for a token
798    fn detect_token_language(&self, token: &str) -> String {
799        for ch in token.chars() {
800            match ch {
801                'a'..='z' | 'A'..='Z' => return "en".to_string(),
802                'α'..='ω' | 'Α'..='Ω' => return "el".to_string(),
803                'а'..='я' | 'А'..='Я' => return "ru".to_string(),
804                '一'..='龯' => return "zh".to_string(),
805                'ひ'..='ゖ' | 'ア'..='ヶ' => return "ja".to_string(),
806                '가'..='힣' => return "ko".to_string(),
807                'ا'..='ي' => return "ar".to_string(),
808                _ => continue,
809            }
810        }
811        "unknown".to_string()
812    }
813
814    /// Calculate confidence for language detection
815    fn calculate_language_confidence(
816        &self,
817        language: &str,
818        count: usize,
819        total_tokens: f64,
820    ) -> f64 {
821        let percentage = (count as f64 / total_tokens) * 100.0;
822
823        // Base confidence depends on percentage of tokens
824        let mut confidence: f64 = match percentage {
825            p if p >= 50.0 => 0.95,
826            p if p >= 20.0 => 0.85,
827            p if p >= 10.0 => 0.75,
828            p if p >= 5.0 => 0.65,
829            p if p >= 1.0 => 0.55,
830            _ => 0.45,
831        };
832
833        // Adjust confidence based on language characteristics
834        match language {
835            "unknown" => confidence *= 0.3, // Low confidence for unknown
836            "en" => confidence *= 1.1,      // English is common, boost confidence
837            "zh" | "ja" | "ko" | "ar" | "hi" | "th" => {
838                // Non-Latin scripts are more distinctive, boost confidence
839                confidence *= 1.2;
840            },
841            _ => confidence *= 1.0, // Default for other languages
842        }
843
844        // Ensure confidence stays within valid range
845        confidence.clamp(0.1, 1.0)
846    }
847
848    /// Analyze out-of-vocabulary patterns
849    fn analyze_oov_patterns(&self, oov_tokens: &[String]) -> Vec<String> {
850        let mut pattern_counts = HashMap::new();
851
852        for token in oov_tokens {
853            // Analyze common OOV patterns
854            if token.chars().all(|c| c.is_numeric()) {
855                *pattern_counts.entry("all_numeric".to_string()).or_insert(0) += 1;
856            } else if token.contains('@') {
857                *pattern_counts.entry("email_like".to_string()).or_insert(0) += 1;
858            } else if token.starts_with("http") {
859                *pattern_counts.entry("url_like".to_string()).or_insert(0) += 1;
860            } else if !token.is_ascii() {
861                *pattern_counts.entry("non_ascii".to_string()).or_insert(0) += 1;
862            } else if token.len() > 15 {
863                *pattern_counts.entry("very_long".to_string()).or_insert(0) += 1;
864            } else {
865                *pattern_counts.entry("other".to_string()).or_insert(0) += 1;
866            }
867        }
868
869        let mut patterns: Vec<(String, usize)> = pattern_counts.into_iter().collect();
870        patterns.sort_by_key(|item| std::cmp::Reverse(item.1));
871        patterns.into_iter().map(|(pattern, _)| pattern).collect()
872    }
873
874    /// Generate recommendations based on analysis
875    fn generate_recommendations(&self, analysis: &VocabAnalysisResult) -> Vec<String> {
876        let mut recommendations = Vec::new();
877
878        // Check vocabulary size
879        if analysis.basic_stats.total_tokens > 100000 {
880            recommendations
881                .push("Consider reducing vocabulary size for better efficiency".to_string());
882        }
883
884        // Check for issues
885        for issue in &analysis.issues {
886            match issue.severity {
887                IssueSeverity::Critical | IssueSeverity::High | IssueSeverity::Medium => {
888                    if let Some(ref suggestion) = issue.suggestion {
889                        recommendations.push(suggestion.clone());
890                    }
891                },
892                _ => {},
893            }
894        }
895
896        // Check token length distribution
897        if analysis.basic_stats.avg_token_length > 10.0 {
898            recommendations.push(
899                "Average token length is high; consider more aggressive subword tokenization"
900                    .to_string(),
901            );
902        }
903
904        // Check for singleton tokens
905        if analysis.frequency_analysis.singleton_tokens.len()
906            > analysis.basic_stats.total_tokens / 10
907        {
908            recommendations.push(
909                "Many singleton tokens detected; consider increasing minimum frequency threshold"
910                    .to_string(),
911            );
912        }
913
914        // Language distribution recommendations
915        if analysis.language_distribution.len() > 5 {
916            recommendations.push(
917                "Multiple languages detected; consider language-specific vocabularies".to_string(),
918            );
919        }
920
921        recommendations
922    }
923}
924
925/// Utilities for vocabulary debugging
926pub struct VocabDebugUtils;
927
928impl VocabDebugUtils {
929    /// Find tokens similar to a given token
930    pub fn find_similar_tokens(
931        target: &str,
932        vocab: &HashMap<String, u32>,
933        threshold: f64,
934    ) -> Vec<(String, f64)> {
935        let analyzer = VocabAnalyzer::default();
936        let mut similar = Vec::new();
937
938        for token in vocab.keys() {
939            let similarity = analyzer.calculate_similarity(target, token);
940            if similarity >= threshold && token != target {
941                similar.push((token.clone(), similarity));
942            }
943        }
944
945        similar.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(Ordering::Equal));
946        similar
947    }
948
949    /// Find tokens containing a specific pattern
950    pub fn find_tokens_with_pattern(pattern: &str, vocab: &HashMap<String, u32>) -> Vec<String> {
951        vocab.keys().filter(|token| token.contains(pattern)).cloned().collect()
952    }
953
954    /// Generate a vocabulary summary report
955    pub fn generate_summary_report(analysis: &VocabAnalysisResult) -> String {
956        let mut report = String::new();
957
958        report.push_str("=== VOCABULARY ANALYSIS SUMMARY ===\n\n");
959
960        // Basic stats
961        report.push_str(&format!(
962            "Total tokens: {}\n",
963            analysis.basic_stats.total_tokens
964        ));
965        report.push_str(&format!(
966            "Average token length: {:.2}\n",
967            analysis.basic_stats.avg_token_length
968        ));
969        report.push_str(&format!(
970            "Token length range: {} - {}\n",
971            analysis.basic_stats.min_token_length, analysis.basic_stats.max_token_length
972        ));
973
974        // Issues summary
975        let critical_issues =
976            analysis.issues.iter().filter(|i| i.severity == IssueSeverity::Critical).count();
977        let high_issues =
978            analysis.issues.iter().filter(|i| i.severity == IssueSeverity::High).count();
979        let medium_issues =
980            analysis.issues.iter().filter(|i| i.severity == IssueSeverity::Medium).count();
981
982        report.push_str(&format!(
983            "\nIssues found: {} critical, {} high, {} medium\n",
984            critical_issues, high_issues, medium_issues
985        ));
986
987        // Top patterns
988        if !analysis.character_patterns.is_empty() {
989            report.push_str("\nTop character patterns:\n");
990            for pattern in analysis.character_patterns.iter().take(3) {
991                report.push_str(&format!(
992                    "  {}: {} tokens ({:.1}%)\n",
993                    pattern.pattern,
994                    pattern.count,
995                    pattern.frequency * 100.0
996                ));
997            }
998        }
999
1000        // Recommendations
1001        if !analysis.recommendations.is_empty() {
1002            report.push_str("\nRecommendations:\n");
1003            for rec in analysis.recommendations.iter().take(5) {
1004                report.push_str(&format!("  • {}\n", rec));
1005            }
1006        }
1007
1008        report
1009    }
1010}
1011
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the ten-token fixture shared by most tests: plain words, a very
    /// long token, digits, mixed tokens, symbol-prefixed tokens and a
    /// near-duplicate.
    fn create_test_vocab() -> HashMap<String, u32> {
        const ENTRIES: [(&str, u32); 10] = [
            ("hello", 1),
            ("world", 2),
            ("test", 3),
            ("very_long_token_that_exceeds_normal_length", 4),
            ("123", 5),
            ("hello_world", 6),
            ("test123", 7),
            ("@mention", 8),
            ("#hashtag", 9),
            ("helo", 10), // Near-duplicate of "hello"
        ];

        ENTRIES.iter().map(|&(token, id)| (token.to_string(), id)).collect()
    }

    /// An analyzer built from the default config keeps character-pattern
    /// analysis enabled.
    #[test]
    fn test_vocab_analyzer_creation() {
        let analyzer = VocabAnalyzer::new(VocabAnalysisConfig::default());
        assert!(analyzer.config.analyze_character_patterns);
    }

    /// Basic statistics reflect the size and composition of the fixture.
    #[test]
    fn test_basic_stats_calculation() {
        let fixture = create_test_vocab();
        let analyzer = VocabAnalyzer::default();

        let basic = analyzer.calculate_basic_stats(&fixture);

        assert_eq!(basic.total_tokens, 10);
        assert_eq!(basic.unique_tokens, 10);
        assert!(basic.avg_token_length > 0.0);
        assert!(basic.alphabetic_tokens > 0);
        assert!(basic.numeric_tokens > 0);
    }

    /// A full analysis populates every major section of the result.
    #[test]
    fn test_vocabulary_analysis() {
        let fixture = create_test_vocab();
        let analyzer = VocabAnalyzer::default();

        let analysis = analyzer.analyze_vocabulary(&fixture).expect("Operation failed in test");

        assert_eq!(analysis.basic_stats.total_tokens, 10);
        assert!(!analysis.character_patterns.is_empty());
        assert!(!analysis.length_distribution.is_empty());
        assert!(!analysis.language_distribution.is_empty());
    }

    /// Lowering the maximum token length makes the fixture's long token show
    /// up as a `LongTokens` issue.
    #[test]
    fn test_long_token_detection() {
        let fixture = create_test_vocab();
        let analyzer = VocabAnalyzer::new(VocabAnalysisConfig {
            max_token_length: 10,
            ..VocabAnalysisConfig::default()
        });

        let found = analyzer.detect_long_tokens(&fixture).expect("Operation failed in test");

        assert!(!found.is_empty());
        assert_eq!(found[0].issue_type, VocabIssueType::LongTokens);
    }

    /// Similarity is 1.0 for identical tokens, high for a one-letter
    /// difference and low for unrelated words.
    #[test]
    fn test_similarity_calculation() {
        let analyzer = VocabAnalyzer::default();

        let identical = analyzer.calculate_similarity("hello", "hello");
        let near = analyzer.calculate_similarity("hello", "helo");
        let unrelated = analyzer.calculate_similarity("hello", "world");

        assert_eq!(identical, 1.0);
        assert!(near >= 0.8);
        assert!(unrelated < 0.5);
    }

    /// Character pattern analysis reports both alphabetic and numeric patterns.
    #[test]
    fn test_character_pattern_analysis() {
        let fixture = create_test_vocab();
        let analyzer = VocabAnalyzer::default();

        let found =
            analyzer.analyze_character_patterns(&fixture).expect("Operation failed in test");

        assert!(!found.is_empty());
        for expected in &["alphabetic", "numeric"] {
            assert!(found.iter().any(|p| p.pattern == *expected));
        }
    }

    /// Per-token language detection for Latin, digit-only and Cyrillic input.
    #[test]
    fn test_language_detection() {
        let analyzer = VocabAnalyzer::default();

        let cases = [("hello", "en"), ("123", "unknown"), ("привет", "ru")];
        for &(token, expected) in &cases {
            assert_eq!(analyzer.detect_token_language(token), expected);
        }
    }

    /// Subword analysis finds at least one pattern in the fixture.
    #[test]
    fn test_subword_pattern_analysis() {
        let fixture = create_test_vocab();
        let analyzer = VocabAnalyzer::default();

        let found = analyzer.analyze_subword_patterns(&fixture).expect("Operation failed in test");

        // Fragments such as "test" occur in more than one fixture token.
        assert!(!found.is_empty());
    }

    /// The debug helpers locate the near-duplicate and substring matches.
    #[test]
    fn test_debug_utils() {
        let fixture = create_test_vocab();

        let near_hello = VocabDebugUtils::find_similar_tokens("hello", &fixture, 0.8);
        assert!(!near_hello.is_empty());
        assert!(near_hello.iter().any(|(token, _)| token == "helo"));

        let with_test = VocabDebugUtils::find_tokens_with_pattern("test", &fixture);
        assert!(with_test.contains(&"test".to_string()));
        assert!(with_test.contains(&"test123".to_string()));
    }

    /// Frequency analysis fills its ranking lists and histogram.
    #[test]
    fn test_frequency_analysis() {
        let fixture = create_test_vocab();
        let analyzer = VocabAnalyzer::default();

        let frequencies = analyzer.analyze_frequency(&fixture);

        assert!(!frequencies.most_frequent.is_empty());
        assert!(!frequencies.least_frequent.is_empty());
        assert!(!frequencies.frequency_histogram.is_empty());
    }

    /// A vocabulary with an over-long token yields a recommendation that
    /// mentions long tokens.
    #[test]
    fn test_recommendations_generation() {
        let mut problematic: HashMap<String, u32> = HashMap::new();
        problematic.insert("hello".to_string(), 1);
        problematic.insert("world".to_string(), 2);

        // This token is longer than the default maximum token length.
        problematic.insert("this_is_a_very_long_token_that_definitely_exceeds_the_default_maximum_token_length_of_one_hundred_characters_and_should_trigger_a_recommendation".to_string(), 3);

        // Pad the vocabulary with many additional distinct tokens.
        problematic.extend((4..20).map(|i| (format!("singleton_token_{}", i), i)));

        let analyzer = VocabAnalyzer::default();
        let analysis = analyzer.analyze_vocabulary(&problematic).expect("Operation failed in test");

        // At least one recommendation must be produced ...
        assert!(!analysis.recommendations.is_empty());

        // ... and one of them must come from the long token.
        assert!(analysis.recommendations.iter().any(|rec| rec.contains("long tokens")));
    }

    /// The summary report contains its banner and the token count line.
    #[test]
    fn test_summary_report() {
        let fixture = create_test_vocab();
        let analyzer = VocabAnalyzer::default();
        let analysis = analyzer.analyze_vocabulary(&fixture).expect("Operation failed in test");

        let summary = VocabDebugUtils::generate_summary_report(&analysis);

        assert!(summary.contains("VOCABULARY ANALYSIS SUMMARY"));
        assert!(summary.contains("Total tokens"));
    }
}
trustformers_tokenizers/vocab_analyzer.rs

trustformers_tokenizers/
vocab_analyzer.rs