1use serde::{Deserialize, Serialize};
2use std::cmp::Ordering;
3use std::collections::{BTreeMap, HashMap, HashSet};
4use trustformers_core::errors::Result;
5use trustformers_core::traits::Tokenizer;
6
/// Switches and thresholds that control which vocabulary checks run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VocabAnalysisConfig {
    /// Threshold for treating a token as rare.
    // NOTE(review): not read by any code visible in this file — confirm intended use.
    pub rare_token_threshold: usize,
    /// Tokens with more characters than this are reported as long tokens.
    pub max_token_length: usize,
    /// When true, tokens are bucketed into character-pattern categories.
    pub analyze_character_patterns: bool,
    /// When true, the encoding checks run.
    pub detect_encoding_issues: bool,
    /// When true, short substrings shared between tokens are tallied.
    pub analyze_subword_patterns: bool,
    /// When true, near-duplicate detection runs. Tokens sharing an ID are
    /// reported regardless of this flag.
    pub check_duplicates: bool,
    /// Languages of interest.
    // NOTE(review): not read by any code visible in this file — confirm intended use.
    pub target_languages: Vec<String>,
    /// Request detailed statistics.
    // NOTE(review): not read by any code visible in this file — confirm intended use.
    pub include_detailed_stats: bool,
}
27
/// Default configuration: every optional analysis enabled, a rare-token
/// threshold of 1, a maximum token length of 100 and no target languages.
impl Default for VocabAnalysisConfig {
    fn default() -> Self {
        Self {
            rare_token_threshold: 1,
            max_token_length: 100,
            analyze_character_patterns: true,
            detect_encoding_issues: true,
            analyze_subword_patterns: true,
            check_duplicates: true,
            target_languages: Vec::new(),
            include_detailed_stats: true,
        }
    }
}
42
/// A single problem found in a vocabulary.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VocabIssue {
    /// Category of the problem.
    pub issue_type: VocabIssueType,
    /// How serious the problem is.
    pub severity: IssueSeverity,
    /// Human-readable explanation of the finding.
    pub description: String,
    /// Tokens involved in the finding.
    pub affected_tokens: Vec<String>,
    /// Optional remediation hint; recommendations are built from these.
    pub suggestion: Option<String>,
}
57
/// Categories of vocabulary problems.
///
/// Only some variants are produced by the detectors visible in this file;
/// the remaining ones are declared but not emitted here.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum VocabIssueType {
    /// Several tokens map to the same ID.
    DuplicateTokens,
    /// Pairs of distinct tokens that are almost identical by edit distance.
    NearDuplicates,
    /// Tokens flagged as potentially rare by the frequency heuristic.
    RareTokens,
    /// Tokens longer than the configured maximum length.
    LongTokens,
    /// Tokens that look like mojibake.
    EncodingIssues,
    /// Tokens flagged by the invalid-UTF-8 check.
    InvalidUtf8,
    /// Not produced by the code visible in this file.
    InconsistentCasing,
    /// Not produced by the code visible in this file.
    MissingCommonTokens,
    /// Not produced by the code visible in this file.
    InefficientSubwords,
    /// Not produced by the code visible in this file.
    OverlappingTokens,
    /// Not produced by the code visible in this file.
    OrphanedTokens,
}
84
/// Severity of a vocabulary issue.
///
/// The derived ordering follows declaration order, so
/// `Low < Medium < High < Critical`.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum IssueSeverity {
    Low,
    Medium,
    High,
    Critical,
}
93
/// Summary of one character-pattern category found in the vocabulary.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CharacterPattern {
    /// Category label, e.g. `"alphabetic"`, `"numeric"` or `"mixed"`.
    pub pattern: String,
    /// Number of tokens in this category.
    pub count: usize,
    /// A small sample of tokens from this category.
    pub examples: Vec<String>,
    /// Fraction of the vocabulary in this category (`count / vocabulary size`).
    pub frequency: f64,
}
106
/// A short substring that occurs repeatedly across vocabulary tokens.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SubwordPattern {
    /// The substring itself.
    pub pattern: String,
    /// Number of occurrences counted across the vocabulary.
    pub count: usize,
    /// A small sample of tokens containing the substring.
    pub tokens: Vec<String>,
    /// Occurrence counts keyed by position label
    /// (`"prefix"`, `"suffix"` or `"infix"`).
    pub positions: HashMap<String, usize>,
}
119
/// Share of the vocabulary attributed to one language by the script heuristic.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageDistribution {
    /// Language label such as `"en"` or `"ru"`, or `"unknown"`.
    pub language: String,
    /// Number of tokens attributed to this language.
    pub token_count: usize,
    /// `token_count` as a percentage of the vocabulary size.
    pub percentage: f64,
    /// Heuristic confidence score for this attribution.
    pub confidence: f64,
}
132
/// Complete output of a vocabulary analysis run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VocabAnalysisResult {
    /// Size and length statistics plus token-class counts.
    pub basic_stats: VocabBasicStats,
    /// Every issue found by the enabled detectors.
    pub issues: Vec<VocabIssue>,
    /// Character-pattern categories; empty when that analysis is disabled.
    pub character_patterns: Vec<CharacterPattern>,
    /// Recurring substrings; empty when that analysis is disabled.
    pub subword_patterns: Vec<SubwordPattern>,
    /// Per-language token counts from the script heuristic.
    pub language_distribution: Vec<LanguageDistribution>,
    /// Map from token length in characters to the number of tokens of that length.
    pub length_distribution: BTreeMap<usize, usize>,
    /// Rankings based on estimated (heuristic) token frequency.
    pub frequency_analysis: FrequencyAnalysis,
    /// Corpus coverage; `analyze_vocabulary` leaves this as `None`.
    pub coverage_analysis: Option<CoverageAnalysis>,
    /// Human-readable recommendations derived from the analysis.
    pub recommendations: Vec<String>,
}
155
/// Size, length and token-class statistics for a vocabulary.
///
/// Token lengths are measured in characters (`char`s), not bytes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VocabBasicStats {
    /// Number of entries in the vocabulary map.
    pub total_tokens: usize,
    /// Number of distinct token strings; computed from the map's keys, so it
    /// equals `total_tokens`.
    pub unique_tokens: usize,
    /// Mean token length, or 0.0 for an empty vocabulary.
    pub avg_token_length: f64,
    /// Shortest token length, or 0 for an empty vocabulary.
    pub min_token_length: usize,
    /// Longest token length.
    pub max_token_length: usize,
    /// Tokens consisting only of alphabetic characters.
    pub alphabetic_tokens: usize,
    /// Tokens consisting only of numeric characters.
    pub numeric_tokens: usize,
    /// Tokens containing at least one alphabetic and one numeric character.
    pub mixed_tokens: usize,
    /// Tokens that fall into none of the other classes.
    pub special_char_tokens: usize,
    /// Tokens consisting only of whitespace.
    pub whitespace_tokens: usize,
}
180
/// Rankings built from estimated token frequencies.
///
/// The frequencies are heuristic scores derived from each token's text, not
/// counts observed in a corpus.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FrequencyAnalysis {
    /// Up to 20 tokens with the highest estimated frequency, highest first.
    pub most_frequent: Vec<(String, u32)>,
    /// Up to 20 tokens with the lowest estimated frequency, lowest first.
    pub least_frequent: Vec<(String, u32)>,
    /// Tokens whose estimated frequency is exactly 1.
    pub singleton_tokens: Vec<String>,
    /// Map from estimated frequency to the number of tokens with that value.
    pub frequency_histogram: BTreeMap<u32, usize>,
}
193
/// How well a tokenizer covers a text corpus.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CoverageAnalysis {
    /// Total number of characters across all corpus texts.
    pub total_chars: usize,
    /// Sum of the character lengths of every produced token whose ID could be
    /// mapped back to a token string.
    pub covered_chars: usize,
    /// Coverage expressed as a percentage; 0.0 for an empty corpus.
    pub coverage_percentage: f64,
    /// Placeholders of the form `<UNK:id>` for IDs with no token string.
    pub oov_tokens: Vec<String>,
    /// Category labels for the out-of-vocabulary entries, most common first.
    pub oov_patterns: Vec<String>,
}
208
209pub struct VocabAnalyzer {
211 config: VocabAnalysisConfig,
212}
213
214impl VocabAnalyzer {
215 pub fn new(config: VocabAnalysisConfig) -> Self {
217 Self { config }
218 }
219
220 pub fn default() -> Self {
222 Self::new(VocabAnalysisConfig::default())
223 }
224
225 pub fn analyze_tokenizer<T: Tokenizer>(&self, tokenizer: &T) -> Result<VocabAnalysisResult> {
227 let vocab = tokenizer.get_vocab();
228 self.analyze_vocabulary(&vocab)
229 }
230
231 pub fn analyze_vocabulary(&self, vocab: &HashMap<String, u32>) -> Result<VocabAnalysisResult> {
233 let mut result = VocabAnalysisResult {
234 basic_stats: self.calculate_basic_stats(vocab),
235 issues: Vec::new(),
236 character_patterns: Vec::new(),
237 subword_patterns: Vec::new(),
238 language_distribution: Vec::new(),
239 length_distribution: BTreeMap::new(),
240 frequency_analysis: self.analyze_frequency(vocab),
241 coverage_analysis: None,
242 recommendations: Vec::new(),
243 };
244
245 result.issues.extend(self.detect_duplicate_tokens(vocab)?);
247 result.issues.extend(self.detect_rare_tokens(vocab)?);
248 result.issues.extend(self.detect_long_tokens(vocab)?);
249
250 if self.config.detect_encoding_issues {
251 result.issues.extend(self.detect_encoding_issues(vocab)?);
252 }
253
254 if self.config.check_duplicates {
255 result.issues.extend(self.detect_near_duplicates(vocab)?);
256 }
257
258 if self.config.analyze_character_patterns {
260 result.character_patterns = self.analyze_character_patterns(vocab)?;
261 }
262
263 if self.config.analyze_subword_patterns {
264 result.subword_patterns = self.analyze_subword_patterns(vocab)?;
265 }
266
267 result.length_distribution = self.calculate_length_distribution(vocab);
269
270 result.language_distribution = self.detect_language_distribution(vocab)?;
272
273 result.recommendations = self.generate_recommendations(&result);
275
276 Ok(result)
277 }
278
279 pub fn analyze_coverage<T: Tokenizer>(
281 &self,
282 tokenizer: &T,
283 corpus: &[String],
284 ) -> Result<CoverageAnalysis> {
285 let mut total_chars = 0;
286 let mut covered_chars = 0;
287 let mut oov_tokens = HashSet::new();
288
289 for text in corpus {
290 total_chars += text.chars().count();
291
292 let tokenized = tokenizer.encode(text)?;
294 for &token_id in &tokenized.input_ids {
295 if let Some(token) = tokenizer.id_to_token(token_id) {
296 covered_chars += token.chars().count();
297 } else {
298 oov_tokens.insert(format!("<UNK:{}>", token_id));
299 }
300 }
301 }
302
303 let coverage_percentage = if total_chars > 0 {
304 (covered_chars as f64 / total_chars as f64) * 100.0
305 } else {
306 0.0
307 };
308
309 let oov_tokens_vec: Vec<String> = oov_tokens.iter().cloned().collect();
311 let oov_patterns = self.analyze_oov_patterns(&oov_tokens_vec);
312
313 Ok(CoverageAnalysis {
314 total_chars,
315 covered_chars,
316 coverage_percentage,
317 oov_tokens: oov_tokens_vec,
318 oov_patterns,
319 })
320 }
321
322 fn calculate_basic_stats(&self, vocab: &HashMap<String, u32>) -> VocabBasicStats {
324 let total_tokens = vocab.len();
325 let unique_tokens = vocab.keys().len();
326
327 let mut total_length = 0;
328 let mut min_length = usize::MAX;
329 let mut max_length = 0;
330 let mut alphabetic_count = 0;
331 let mut numeric_count = 0;
332 let mut mixed_count = 0;
333 let mut special_char_count = 0;
334 let mut whitespace_count = 0;
335
336 for token in vocab.keys() {
337 let len = token.chars().count();
338 total_length += len;
339 min_length = min_length.min(len);
340 max_length = max_length.max(len);
341
342 if token.chars().all(|c| c.is_alphabetic()) {
344 alphabetic_count += 1;
345 } else if token.chars().all(|c| c.is_numeric()) {
346 numeric_count += 1;
347 } else if token.chars().any(|c| c.is_alphabetic())
348 && token.chars().any(|c| c.is_numeric())
349 {
350 mixed_count += 1;
351 } else if token.chars().all(|c| c.is_whitespace()) {
352 whitespace_count += 1;
353 } else {
354 special_char_count += 1;
355 }
356 }
357
358 let avg_token_length =
359 if total_tokens > 0 { total_length as f64 / total_tokens as f64 } else { 0.0 };
360
361 VocabBasicStats {
362 total_tokens,
363 unique_tokens,
364 avg_token_length,
365 min_token_length: if min_length == usize::MAX { 0 } else { min_length },
366 max_token_length: max_length,
367 alphabetic_tokens: alphabetic_count,
368 numeric_tokens: numeric_count,
369 mixed_tokens: mixed_count,
370 special_char_tokens: special_char_count,
371 whitespace_tokens: whitespace_count,
372 }
373 }
374
375 fn analyze_frequency(&self, vocab: &HashMap<String, u32>) -> FrequencyAnalysis {
377 let mut token_freq: Vec<(String, u32)> = vocab
381 .iter()
382 .map(|(token, &_id)| {
383 let base_freq = self.estimate_token_frequency(token);
385 (token.clone(), base_freq)
386 })
387 .collect();
388
389 token_freq.sort_by_key(|item| std::cmp::Reverse(item.1));
390
391 let most_frequent = token_freq.iter().take(20).cloned().collect();
392 let least_frequent = token_freq.iter().rev().take(20).cloned().collect();
393
394 let singleton_tokens = token_freq
395 .iter()
396 .filter(|(_, freq)| *freq == 1)
397 .map(|(token, _)| token.clone())
398 .collect();
399
400 let mut frequency_histogram = BTreeMap::new();
402 for (_, freq) in &token_freq {
403 *frequency_histogram.entry(*freq).or_insert(0) += 1;
404 }
405
406 FrequencyAnalysis {
407 most_frequent,
408 least_frequent,
409 singleton_tokens,
410 frequency_histogram,
411 }
412 }
413
414 fn detect_duplicate_tokens(&self, vocab: &HashMap<String, u32>) -> Result<Vec<VocabIssue>> {
416 let mut id_to_tokens: HashMap<u32, Vec<String>> = HashMap::new();
417
418 for (token, &id) in vocab {
419 id_to_tokens.entry(id).or_default().push(token.clone());
420 }
421
422 let mut issues = Vec::new();
423 for (id, tokens) in id_to_tokens {
424 if tokens.len() > 1 {
425 issues.push(VocabIssue {
426 issue_type: VocabIssueType::DuplicateTokens,
427 severity: IssueSeverity::High,
428 description: format!("Multiple tokens share ID {}: {:?}", id, tokens),
429 affected_tokens: tokens,
430 suggestion: Some("Ensure each token has a unique ID".to_string()),
431 });
432 }
433 }
434
435 Ok(issues)
436 }
437
438 fn estimate_token_frequency(&self, token: &str) -> u32 {
440 let mut score = 1000u32; if token.chars().all(|c| c.is_ascii_alphabetic()) {
444 score += 500; }
446
447 match token.len() {
449 1..=3 => score += 1000,
450 4..=6 => score += 500,
451 7..=10 => score += 100,
452 _ => score /= 2, }
454
455 if token.starts_with('<') && token.ends_with('>') {
457 score += 800; } else if token.contains("##") {
459 score += 300; } else if token.chars().all(|c| c.is_ascii_punctuation()) {
461 score += 200; }
463
464 let common_chars = ['e', 't', 'a', 'o', 'i', 'n', 's', 'h', 'r'];
466 if token.chars().any(|c| common_chars.contains(&c.to_ascii_lowercase())) {
467 score += 200;
468 }
469
470 let hash_value =
472 token.chars().fold(0u32, |acc, c| acc.wrapping_mul(31).wrapping_add(c as u32));
473 score += hash_value % 200;
474
475 score.max(1) }
477
478 fn detect_rare_tokens(&self, vocab: &HashMap<String, u32>) -> Result<Vec<VocabIssue>> {
480 let rare_tokens: Vec<String> = vocab
482 .keys()
483 .filter(|token| {
484 let estimated_freq = self.estimate_token_frequency(token);
485 estimated_freq < 100 || token.len() > 20
487 })
488 .take(100)
489 .cloned()
490 .collect();
491
492 if !rare_tokens.is_empty() {
493 Ok(vec![VocabIssue {
494 issue_type: VocabIssueType::RareTokens,
495 severity: IssueSeverity::Low,
496 description: format!("Found {} potentially rare tokens", rare_tokens.len()),
497 affected_tokens: rare_tokens,
498 suggestion: Some(
499 "Consider removing very rare tokens to reduce vocabulary size".to_string(),
500 ),
501 }])
502 } else {
503 Ok(Vec::new())
504 }
505 }
506
507 fn detect_long_tokens(&self, vocab: &HashMap<String, u32>) -> Result<Vec<VocabIssue>> {
509 let long_tokens: Vec<String> = vocab
510 .keys()
511 .filter(|token| token.chars().count() > self.config.max_token_length)
512 .cloned()
513 .collect();
514
515 if !long_tokens.is_empty() {
516 Ok(vec![VocabIssue {
517 issue_type: VocabIssueType::LongTokens,
518 severity: IssueSeverity::Medium,
519 description: format!(
520 "Found {} tokens exceeding maximum length of {}",
521 long_tokens.len(),
522 self.config.max_token_length
523 ),
524 affected_tokens: long_tokens,
525 suggestion: Some("Consider truncating or removing very long tokens".to_string()),
526 }])
527 } else {
528 Ok(Vec::new())
529 }
530 }
531
532 fn detect_encoding_issues(&self, vocab: &HashMap<String, u32>) -> Result<Vec<VocabIssue>> {
534 let mut issues = Vec::new();
535 let mut invalid_utf8_tokens = Vec::new();
536 let mut mojibake_tokens = Vec::new();
537
538 for token in vocab.keys() {
539 if !token.is_ascii() && token.chars().any(|c| c as u32 > 0x10FFFF) {
541 invalid_utf8_tokens.push(token.clone());
542 }
543
544 if token.contains("Ã") || token.contains("â") || token.contains("Â") {
546 mojibake_tokens.push(token.clone());
547 }
548 }
549
550 if !invalid_utf8_tokens.is_empty() {
551 issues.push(VocabIssue {
552 issue_type: VocabIssueType::InvalidUtf8,
553 severity: IssueSeverity::Critical,
554 description: "Found tokens with invalid UTF-8 sequences".to_string(),
555 affected_tokens: invalid_utf8_tokens,
556 suggestion: Some("Fix encoding issues before tokenization".to_string()),
557 });
558 }
559
560 if !mojibake_tokens.is_empty() {
561 issues.push(VocabIssue {
562 issue_type: VocabIssueType::EncodingIssues,
563 severity: IssueSeverity::High,
564 description: "Found tokens with potential mojibake patterns".to_string(),
565 affected_tokens: mojibake_tokens,
566 suggestion: Some("Check for encoding issues in source data".to_string()),
567 });
568 }
569
570 Ok(issues)
571 }
572
573 fn detect_near_duplicates(&self, vocab: &HashMap<String, u32>) -> Result<Vec<VocabIssue>> {
575 let mut near_duplicates = Vec::new();
576 let tokens: Vec<&String> = vocab.keys().collect();
577
578 for i in 0..tokens.len() {
579 for j in (i + 1)..tokens.len() {
580 let similarity = self.calculate_similarity(tokens[i], tokens[j]);
581 if similarity > 0.9 && similarity < 1.0 {
582 near_duplicates.push(vec![tokens[i].clone(), tokens[j].clone()]);
583 }
584 }
585 }
586
587 if !near_duplicates.is_empty() {
588 let affected_tokens: Vec<String> = near_duplicates.iter().flatten().cloned().collect();
589
590 Ok(vec![VocabIssue {
591 issue_type: VocabIssueType::NearDuplicates,
592 severity: IssueSeverity::Medium,
593 description: format!(
594 "Found {} pairs of near-duplicate tokens",
595 near_duplicates.len()
596 ),
597 affected_tokens,
598 suggestion: Some(
599 "Review near-duplicate tokens and consider merging or removing".to_string(),
600 ),
601 }])
602 } else {
603 Ok(Vec::new())
604 }
605 }
606
607 fn calculate_similarity(&self, s1: &str, s2: &str) -> f64 {
609 let len1 = s1.chars().count();
610 let len2 = s2.chars().count();
611
612 if len1 == 0 && len2 == 0 {
613 return 1.0;
614 }
615
616 let max_len = len1.max(len2);
617 let distance = self.levenshtein_distance(s1, s2);
618
619 1.0 - (distance as f64 / max_len as f64)
620 }
621
622 fn levenshtein_distance(&self, s1: &str, s2: &str) -> usize {
624 let chars1: Vec<char> = s1.chars().collect();
625 let chars2: Vec<char> = s2.chars().collect();
626 let len1 = chars1.len();
627 let len2 = chars2.len();
628
629 let mut matrix = vec![vec![0; len2 + 1]; len1 + 1];
630
631 for (i, row) in matrix.iter_mut().enumerate().take(len1 + 1) {
632 row[0] = i;
633 }
634 for (j, val) in matrix[0].iter_mut().enumerate().take(len2 + 1) {
635 *val = j;
636 }
637
638 for i in 1..=len1 {
639 for j in 1..=len2 {
640 let cost = if chars1[i - 1] == chars2[j - 1] { 0 } else { 1 };
641 matrix[i][j] = (matrix[i - 1][j] + 1)
642 .min(matrix[i][j - 1] + 1)
643 .min(matrix[i - 1][j - 1] + cost);
644 }
645 }
646
647 matrix[len1][len2]
648 }
649
650 fn analyze_character_patterns(
652 &self,
653 vocab: &HashMap<String, u32>,
654 ) -> Result<Vec<CharacterPattern>> {
655 let mut patterns = HashMap::new();
656
657 for token in vocab.keys() {
658 let pattern_type = if token.chars().all(|c| c.is_alphabetic()) {
660 "alphabetic"
661 } else if token.chars().all(|c| c.is_numeric()) {
662 "numeric"
663 } else if token.chars().all(|c| c.is_alphanumeric()) {
664 "alphanumeric"
665 } else if token.starts_with('#') {
666 "hashtag"
667 } else if token.starts_with('@') {
668 "mention"
669 } else if token.contains('_') {
670 "underscore"
671 } else if token.contains('-') {
672 "hyphenated"
673 } else {
674 "mixed"
675 };
676
677 let entry = patterns.entry(pattern_type.to_string()).or_insert_with(|| (0, Vec::new()));
678 entry.0 += 1;
679 if entry.1.len() < 10 {
680 entry.1.push(token.clone());
681 }
682 }
683
684 let total_tokens = vocab.len() as f64;
685 let mut result = Vec::new();
686
687 for (pattern, (count, examples)) in patterns {
688 result.push(CharacterPattern {
689 pattern,
690 count,
691 examples,
692 frequency: count as f64 / total_tokens,
693 });
694 }
695
696 result.sort_by_key(|item| std::cmp::Reverse(item.count));
697 Ok(result)
698 }
699
700 fn analyze_subword_patterns(
702 &self,
703 vocab: &HashMap<String, u32>,
704 ) -> Result<Vec<SubwordPattern>> {
705 let mut subword_counts: HashMap<String, (usize, Vec<String>, HashMap<String, usize>)> =
706 HashMap::new();
707
708 for token in vocab.keys() {
709 for len in 2..=4.min(token.chars().count()) {
711 for start in 0..=(token.chars().count().saturating_sub(len)) {
712 let subword: String = token.chars().skip(start).take(len).collect();
713
714 let position_type = if start == 0 {
715 "prefix"
716 } else if start + len == token.chars().count() {
717 "suffix"
718 } else {
719 "infix"
720 };
721
722 let entry = subword_counts
723 .entry(subword)
724 .or_insert_with(|| (0, Vec::new(), HashMap::new()));
725 entry.0 += 1;
726 if entry.1.len() < 5 {
727 entry.1.push(token.clone());
728 }
729 *entry.2.entry(position_type.to_string()).or_insert(0) += 1;
730 }
731 }
732 }
733
734 let mut result: Vec<SubwordPattern> = subword_counts
735 .into_iter()
736 .filter(|(_, (count, _, _))| *count >= 3) .map(|(pattern, (count, tokens, positions))| SubwordPattern {
738 pattern,
739 count,
740 tokens,
741 positions,
742 })
743 .collect();
744
745 result.sort_by_key(|item| std::cmp::Reverse(item.count));
746 result.truncate(50); Ok(result)
748 }
749
750 fn calculate_length_distribution(
752 &self,
753 vocab: &HashMap<String, u32>,
754 ) -> BTreeMap<usize, usize> {
755 let mut distribution = BTreeMap::new();
756
757 for token in vocab.keys() {
758 let length = token.chars().count();
759 *distribution.entry(length).or_insert(0) += 1;
760 }
761
762 distribution
763 }
764
765 fn detect_language_distribution(
767 &self,
768 vocab: &HashMap<String, u32>,
769 ) -> Result<Vec<LanguageDistribution>> {
770 let mut language_counts = HashMap::new();
772
773 for token in vocab.keys() {
774 let language = self.detect_token_language(token);
775 *language_counts.entry(language).or_insert(0) += 1;
776 }
777
778 let total_tokens = vocab.len() as f64;
779 let mut distribution: Vec<LanguageDistribution> = language_counts
780 .into_iter()
781 .map(|(language, count)| {
782 let confidence = self.calculate_language_confidence(&language, count, total_tokens);
784 LanguageDistribution {
785 language,
786 token_count: count,
787 percentage: (count as f64 / total_tokens) * 100.0,
788 confidence,
789 }
790 })
791 .collect();
792
793 distribution.sort_by_key(|item| std::cmp::Reverse(item.token_count));
794 Ok(distribution)
795 }
796
797 fn detect_token_language(&self, token: &str) -> String {
799 for ch in token.chars() {
800 match ch {
801 'a'..='z' | 'A'..='Z' => return "en".to_string(),
802 'α'..='ω' | 'Α'..='Ω' => return "el".to_string(),
803 'а'..='я' | 'А'..='Я' => return "ru".to_string(),
804 '一'..='龯' => return "zh".to_string(),
805 'ひ'..='ゖ' | 'ア'..='ヶ' => return "ja".to_string(),
806 '가'..='힣' => return "ko".to_string(),
807 'ا'..='ي' => return "ar".to_string(),
808 _ => continue,
809 }
810 }
811 "unknown".to_string()
812 }
813
814 fn calculate_language_confidence(
816 &self,
817 language: &str,
818 count: usize,
819 total_tokens: f64,
820 ) -> f64 {
821 let percentage = (count as f64 / total_tokens) * 100.0;
822
823 let mut confidence: f64 = match percentage {
825 p if p >= 50.0 => 0.95,
826 p if p >= 20.0 => 0.85,
827 p if p >= 10.0 => 0.75,
828 p if p >= 5.0 => 0.65,
829 p if p >= 1.0 => 0.55,
830 _ => 0.45,
831 };
832
833 match language {
835 "unknown" => confidence *= 0.3, "en" => confidence *= 1.1, "zh" | "ja" | "ko" | "ar" | "hi" | "th" => {
838 confidence *= 1.2;
840 },
841 _ => confidence *= 1.0, }
843
844 confidence.clamp(0.1, 1.0)
846 }
847
848 fn analyze_oov_patterns(&self, oov_tokens: &[String]) -> Vec<String> {
850 let mut pattern_counts = HashMap::new();
851
852 for token in oov_tokens {
853 if token.chars().all(|c| c.is_numeric()) {
855 *pattern_counts.entry("all_numeric".to_string()).or_insert(0) += 1;
856 } else if token.contains('@') {
857 *pattern_counts.entry("email_like".to_string()).or_insert(0) += 1;
858 } else if token.starts_with("http") {
859 *pattern_counts.entry("url_like".to_string()).or_insert(0) += 1;
860 } else if !token.is_ascii() {
861 *pattern_counts.entry("non_ascii".to_string()).or_insert(0) += 1;
862 } else if token.len() > 15 {
863 *pattern_counts.entry("very_long".to_string()).or_insert(0) += 1;
864 } else {
865 *pattern_counts.entry("other".to_string()).or_insert(0) += 1;
866 }
867 }
868
869 let mut patterns: Vec<(String, usize)> = pattern_counts.into_iter().collect();
870 patterns.sort_by_key(|item| std::cmp::Reverse(item.1));
871 patterns.into_iter().map(|(pattern, _)| pattern).collect()
872 }
873
874 fn generate_recommendations(&self, analysis: &VocabAnalysisResult) -> Vec<String> {
876 let mut recommendations = Vec::new();
877
878 if analysis.basic_stats.total_tokens > 100000 {
880 recommendations
881 .push("Consider reducing vocabulary size for better efficiency".to_string());
882 }
883
884 for issue in &analysis.issues {
886 match issue.severity {
887 IssueSeverity::Critical | IssueSeverity::High | IssueSeverity::Medium => {
888 if let Some(ref suggestion) = issue.suggestion {
889 recommendations.push(suggestion.clone());
890 }
891 },
892 _ => {},
893 }
894 }
895
896 if analysis.basic_stats.avg_token_length > 10.0 {
898 recommendations.push(
899 "Average token length is high; consider more aggressive subword tokenization"
900 .to_string(),
901 );
902 }
903
904 if analysis.frequency_analysis.singleton_tokens.len()
906 > analysis.basic_stats.total_tokens / 10
907 {
908 recommendations.push(
909 "Many singleton tokens detected; consider increasing minimum frequency threshold"
910 .to_string(),
911 );
912 }
913
914 if analysis.language_distribution.len() > 5 {
916 recommendations.push(
917 "Multiple languages detected; consider language-specific vocabularies".to_string(),
918 );
919 }
920
921 recommendations
922 }
923}
924
/// Namespace for stateless helpers used when inspecting a vocabulary.
pub struct VocabDebugUtils;
927
928impl VocabDebugUtils {
929 pub fn find_similar_tokens(
931 target: &str,
932 vocab: &HashMap<String, u32>,
933 threshold: f64,
934 ) -> Vec<(String, f64)> {
935 let analyzer = VocabAnalyzer::default();
936 let mut similar = Vec::new();
937
938 for token in vocab.keys() {
939 let similarity = analyzer.calculate_similarity(target, token);
940 if similarity >= threshold && token != target {
941 similar.push((token.clone(), similarity));
942 }
943 }
944
945 similar.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(Ordering::Equal));
946 similar
947 }
948
949 pub fn find_tokens_with_pattern(pattern: &str, vocab: &HashMap<String, u32>) -> Vec<String> {
951 vocab.keys().filter(|token| token.contains(pattern)).cloned().collect()
952 }
953
954 pub fn generate_summary_report(analysis: &VocabAnalysisResult) -> String {
956 let mut report = String::new();
957
958 report.push_str("=== VOCABULARY ANALYSIS SUMMARY ===\n\n");
959
960 report.push_str(&format!(
962 "Total tokens: {}\n",
963 analysis.basic_stats.total_tokens
964 ));
965 report.push_str(&format!(
966 "Average token length: {:.2}\n",
967 analysis.basic_stats.avg_token_length
968 ));
969 report.push_str(&format!(
970 "Token length range: {} - {}\n",
971 analysis.basic_stats.min_token_length, analysis.basic_stats.max_token_length
972 ));
973
974 let critical_issues =
976 analysis.issues.iter().filter(|i| i.severity == IssueSeverity::Critical).count();
977 let high_issues =
978 analysis.issues.iter().filter(|i| i.severity == IssueSeverity::High).count();
979 let medium_issues =
980 analysis.issues.iter().filter(|i| i.severity == IssueSeverity::Medium).count();
981
982 report.push_str(&format!(
983 "\nIssues found: {} critical, {} high, {} medium\n",
984 critical_issues, high_issues, medium_issues
985 ));
986
987 if !analysis.character_patterns.is_empty() {
989 report.push_str("\nTop character patterns:\n");
990 for pattern in analysis.character_patterns.iter().take(3) {
991 report.push_str(&format!(
992 " {}: {} tokens ({:.1}%)\n",
993 pattern.pattern,
994 pattern.count,
995 pattern.frequency * 100.0
996 ));
997 }
998 }
999
1000 if !analysis.recommendations.is_empty() {
1002 report.push_str("\nRecommendations:\n");
1003 for rec in analysis.recommendations.iter().take(5) {
1004 report.push_str(&format!(" • {}\n", rec));
1005 }
1006 }
1007
1008 report
1009 }
1010}
1011
// Unit tests for the vocabulary analyzer and the debug helpers.
#[cfg(test)]
mod tests {
    use super::*;

    // Small fixture with alphabetic, numeric, mixed, prefixed and long tokens,
    // each mapped to a distinct ID.
    fn create_test_vocab() -> HashMap<String, u32> {
        let mut vocab = HashMap::new();
        vocab.insert("hello".to_string(), 1);
        vocab.insert("world".to_string(), 2);
        vocab.insert("test".to_string(), 3);
        vocab.insert("very_long_token_that_exceeds_normal_length".to_string(), 4);
        vocab.insert("123".to_string(), 5);
        vocab.insert("hello_world".to_string(), 6);
        vocab.insert("test123".to_string(), 7);
        vocab.insert("@mention".to_string(), 8);
        vocab.insert("#hashtag".to_string(), 9);
        // One edit away from "hello"; used by the similarity tests.
        vocab.insert("helo".to_string(), 10);
        vocab
    }

    #[test]
    fn test_vocab_analyzer_creation() {
        let config = VocabAnalysisConfig::default();
        let analyzer = VocabAnalyzer::new(config);
        assert!(analyzer.config.analyze_character_patterns);
    }

    #[test]
    fn test_basic_stats_calculation() {
        let vocab = create_test_vocab();
        let analyzer = VocabAnalyzer::default();
        let stats = analyzer.calculate_basic_stats(&vocab);

        assert_eq!(stats.total_tokens, 10);
        assert_eq!(stats.unique_tokens, 10);
        assert!(stats.avg_token_length > 0.0);
        assert!(stats.alphabetic_tokens > 0);
        assert!(stats.numeric_tokens > 0);
    }

    #[test]
    fn test_vocabulary_analysis() {
        let vocab = create_test_vocab();
        let analyzer = VocabAnalyzer::default();
        let result = analyzer.analyze_vocabulary(&vocab).expect("Operation failed in test");

        assert_eq!(result.basic_stats.total_tokens, 10);
        assert!(!result.character_patterns.is_empty());
        assert!(!result.length_distribution.is_empty());
        assert!(!result.language_distribution.is_empty());
    }

    #[test]
    fn test_long_token_detection() {
        let vocab = create_test_vocab();
        // Lower the limit so the fixture's long tokens exceed it.
        let config = VocabAnalysisConfig {
            max_token_length: 10,
            ..Default::default()
        };

        let analyzer = VocabAnalyzer::new(config);
        let issues = analyzer.detect_long_tokens(&vocab).expect("Operation failed in test");

        assert!(!issues.is_empty());
        assert_eq!(issues[0].issue_type, VocabIssueType::LongTokens);
    }

    #[test]
    fn test_similarity_calculation() {
        let analyzer = VocabAnalyzer::default();

        assert_eq!(analyzer.calculate_similarity("hello", "hello"), 1.0);
        assert!(analyzer.calculate_similarity("hello", "helo") >= 0.8);
        assert!(analyzer.calculate_similarity("hello", "world") < 0.5);
    }

    #[test]
    fn test_character_pattern_analysis() {
        let vocab = create_test_vocab();
        let analyzer = VocabAnalyzer::default();
        let patterns =
            analyzer.analyze_character_patterns(&vocab).expect("Operation failed in test");

        assert!(!patterns.is_empty());
        assert!(patterns.iter().any(|p| p.pattern == "alphabetic"));
        assert!(patterns.iter().any(|p| p.pattern == "numeric"));
    }

    #[test]
    fn test_language_detection() {
        let analyzer = VocabAnalyzer::default();

        assert_eq!(analyzer.detect_token_language("hello"), "en");
        assert_eq!(analyzer.detect_token_language("123"), "unknown");
        assert_eq!(analyzer.detect_token_language("привет"), "ru");
    }

    #[test]
    fn test_subword_pattern_analysis() {
        let vocab = create_test_vocab();
        let analyzer = VocabAnalyzer::default();
        let patterns = analyzer.analyze_subword_patterns(&vocab).expect("Operation failed in test");

        assert!(!patterns.is_empty());
    }

    #[test]
    fn test_debug_utils() {
        let vocab = create_test_vocab();

        let similar = VocabDebugUtils::find_similar_tokens("hello", &vocab, 0.8);
        assert!(!similar.is_empty());
        assert!(similar.iter().any(|(token, _)| token == "helo"));

        let pattern_tokens = VocabDebugUtils::find_tokens_with_pattern("test", &vocab);
        assert!(pattern_tokens.contains(&"test".to_string()));
        assert!(pattern_tokens.contains(&"test123".to_string()));
    }

    #[test]
    fn test_frequency_analysis() {
        let vocab = create_test_vocab();
        let analyzer = VocabAnalyzer::default();
        let freq_analysis = analyzer.analyze_frequency(&vocab);

        assert!(!freq_analysis.most_frequent.is_empty());
        assert!(!freq_analysis.least_frequent.is_empty());
        assert!(!freq_analysis.frequency_histogram.is_empty());
    }

    #[test]
    fn test_recommendations_generation() {
        let mut vocab = HashMap::new();
        vocab.insert("hello".to_string(), 1);
        vocab.insert("world".to_string(), 2);

        // Longer than the default maximum of 100 characters, so the long-token
        // detector fires.
        vocab.insert("this_is_a_very_long_token_that_definitely_exceeds_the_default_maximum_token_length_of_one_hundred_characters_and_should_trigger_a_recommendation".to_string(), 3);

        for i in 4..20 {
            vocab.insert(format!("singleton_token_{}", i), i);
        }

        let analyzer = VocabAnalyzer::default();
        let result = analyzer.analyze_vocabulary(&vocab).expect("Operation failed in test");

        assert!(!result.recommendations.is_empty());

        // The long-token issue's suggestion text contains "long tokens".
        assert!(result.recommendations.iter().any(|rec| rec.contains("long tokens")));
    }

    #[test]
    fn test_summary_report() {
        let vocab = create_test_vocab();
        let analyzer = VocabAnalyzer::default();
        let result = analyzer.analyze_vocabulary(&vocab).expect("Operation failed in test");

        let report = VocabDebugUtils::generate_summary_report(&result);
        assert!(report.contains("VOCABULARY ANALYSIS SUMMARY"));
        assert!(report.contains("Total tokens"));
    }
}