use serde::{Deserialize, Serialize};
use std::cmp::Ordering;
use std::collections::{BTreeMap, HashMap, HashSet};
use trustformers_core::errors::Result;
use trustformers_core::traits::Tokenizer;
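/// Tunable switches and thresholds controlling which vocabulary checks run.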
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VocabAnalysisConfig {
pub rare_token_threshold: usize,
pub max_token_length: usize,
pub analyze_character_patterns: bool,
pub detect_encoding_issues: bool,
pub analyze_subword_patterns: bool,
pub check_duplicates: bool,
pub target_languages: Vec<String>,
pub include_detailed_stats: bool,
}
impl Default for VocabAnalysisConfig {
fn default() -> Self {
Self {
rare_token_threshold: 1,
max_token_length: 100,
analyze_character_patterns: true,
detect_encoding_issues: true,
analyze_subword_patterns: true,
check_duplicates: true,
target_languages: Vec::new(),
include_detailed_stats: true,
}
}
}
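/// A single problem found in a vocabulary, with the affected tokens and an
/// optional remediation hint.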
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VocabIssue {
pub issue_type: VocabIssueType,
pub severity: IssueSeverity,
pub description: String,
pub affected_tokens: Vec<String>,
pub suggestion: Option<String>,
}
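/// The categories of vocabulary problems the analyzer can report.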
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum VocabIssueType {
DuplicateTokens,
NearDuplicates,
RareTokens,
LongTokens,
EncodingIssues,
InvalidUtf8,
InconsistentCasing,
MissingCommonTokens,
InefficientSubwords,
OverlappingTokens,
OrphanedTokens,
}
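/// Issue severity, ordered least to most serious so variants compare with `<`.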
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum IssueSeverity {
Low,
Medium,
High,
Critical,
}
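/// A class of tokens sharing a surface shape (alphabetic, numeric, hashtag, ...).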
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CharacterPattern {
pub pattern: String,
pub count: usize,
pub examples: Vec<String>,
pub frequency: f64,
}
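/// A recurring 2-4 character substring, with example tokens and counts per
/// position (prefix, suffix, infix).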
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SubwordPattern {
pub pattern: String,
pub count: usize,
pub tokens: Vec<String>,
    pub positions: HashMap<String, usize>,
}
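/// Share of the vocabulary attributed to one language by the script heuristic.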
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageDistribution {
pub language: String,
pub token_count: usize,
pub percentage: f64,
pub confidence: f64,
}
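/// Full output of a vocabulary analysis run.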
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VocabAnalysisResult {
pub basic_stats: VocabBasicStats,
pub issues: Vec<VocabIssue>,
pub character_patterns: Vec<CharacterPattern>,
pub subword_patterns: Vec<SubwordPattern>,
pub language_distribution: Vec<LanguageDistribution>,
pub length_distribution: BTreeMap<usize, usize>,
pub frequency_analysis: FrequencyAnalysis,
pub coverage_analysis: Option<CoverageAnalysis>,
pub recommendations: Vec<String>,
}
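/// Size and composition statistics over the raw token set.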
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VocabBasicStats {
pub total_tokens: usize,
pub unique_tokens: usize,
pub avg_token_length: f64,
pub min_token_length: usize,
pub max_token_length: usize,
pub alphabetic_tokens: usize,
pub numeric_tokens: usize,
pub mixed_tokens: usize,
pub special_char_tokens: usize,
pub whitespace_tokens: usize,
}
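/// Frequency summary built from heuristic per-token estimates.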
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FrequencyAnalysis {
pub most_frequent: Vec<(String, u32)>,
pub least_frequent: Vec<(String, u32)>,
pub singleton_tokens: Vec<String>,
    pub frequency_histogram: BTreeMap<u32, usize>,
}
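/// Corpus coverage figures plus the shapes of out-of-vocabulary tokens.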
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CoverageAnalysis {
pub total_chars: usize,
pub covered_chars: usize,
pub coverage_percentage: f64,
pub oov_tokens: Vec<String>,
pub oov_patterns: Vec<String>,
}
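/// Analyzes tokenizer vocabularies for structural issues, recurring surface
/// patterns, and language distribution.
///
/// A minimal usage sketch (any `HashMap<String, u32>` token-to-ID map works;
/// the example is `ignore`d because it elides error handling):
///
/// ```ignore
/// use std::collections::HashMap;
///
/// let vocab: HashMap<String, u32> =
///     [("hello".to_string(), 0), ("world".to_string(), 1)].into_iter().collect();
/// let analyzer = VocabAnalyzer::default();
/// let result = analyzer.analyze_vocabulary(&vocab)?;
/// for issue in &result.issues {
///     println!("[{:?}] {}", issue.severity, issue.description);
/// }
/// ```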
pub struct VocabAnalyzer {
    config: VocabAnalysisConfig,
}

impl Default for VocabAnalyzer {
    fn default() -> Self {
        Self::new(VocabAnalysisConfig::default())
    }
}
impl VocabAnalyzer {
pub fn new(config: VocabAnalysisConfig) -> Self {
Self { config }
}
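    /// Runs the full analysis against the vocabulary reported by `tokenizer`.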
pub fn analyze_tokenizer<T: Tokenizer>(&self, tokenizer: &T) -> Result<VocabAnalysisResult> {
let vocab = tokenizer.get_vocab();
self.analyze_vocabulary(&vocab)
}
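    /// Runs every check enabled in the config against a raw token-to-ID map,
    /// collecting issues, patterns, distributions, and recommendations.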
pub fn analyze_vocabulary(&self, vocab: &HashMap<String, u32>) -> Result<VocabAnalysisResult> {
let mut result = VocabAnalysisResult {
basic_stats: self.calculate_basic_stats(vocab),
issues: Vec::new(),
character_patterns: Vec::new(),
subword_patterns: Vec::new(),
language_distribution: Vec::new(),
length_distribution: BTreeMap::new(),
frequency_analysis: self.analyze_frequency(vocab),
coverage_analysis: None,
recommendations: Vec::new(),
};
result.issues.extend(self.detect_duplicate_tokens(vocab)?);
result.issues.extend(self.detect_rare_tokens(vocab)?);
result.issues.extend(self.detect_long_tokens(vocab)?);
if self.config.detect_encoding_issues {
result.issues.extend(self.detect_encoding_issues(vocab)?);
}
if self.config.check_duplicates {
result.issues.extend(self.detect_near_duplicates(vocab)?);
}
if self.config.analyze_character_patterns {
result.character_patterns = self.analyze_character_patterns(vocab)?;
}
if self.config.analyze_subword_patterns {
result.subword_patterns = self.analyze_subword_patterns(vocab)?;
}
result.length_distribution = self.calculate_length_distribution(vocab);
result.language_distribution = self.detect_language_distribution(vocab)?;
result.recommendations = self.generate_recommendations(&result);
Ok(result)
}
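    /// Measures how much of `corpus` the tokenizer covers. `covered_chars`
    /// sums the characters of decoded tokens, so subword markers such as "##"
    /// can push the ratio past 100%.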
pub fn analyze_coverage<T: Tokenizer>(
&self,
tokenizer: &T,
corpus: &[String],
) -> Result<CoverageAnalysis> {
let mut total_chars = 0;
let mut covered_chars = 0;
let mut oov_tokens = HashSet::new();
for text in corpus {
total_chars += text.chars().count();
let tokenized = tokenizer.encode(text)?;
for &token_id in &tokenized.input_ids {
if let Some(token) = tokenizer.id_to_token(token_id) {
covered_chars += token.chars().count();
} else {
oov_tokens.insert(format!("<UNK:{}>", token_id));
}
}
}
let coverage_percentage = if total_chars > 0 {
(covered_chars as f64 / total_chars as f64) * 100.0
} else {
0.0
};
let oov_tokens_vec: Vec<String> = oov_tokens.iter().cloned().collect();
let oov_patterns = self.analyze_oov_patterns(&oov_tokens_vec);
Ok(CoverageAnalysis {
total_chars,
covered_chars,
coverage_percentage,
oov_tokens: oov_tokens_vec,
oov_patterns,
})
}
    /// Computes size and composition statistics in a single pass.
    fn calculate_basic_stats(&self, vocab: &HashMap<String, u32>) -> VocabBasicStats {
        // HashMap keys are unique by construction, so these two counts always
        // coincide here; duplicate *IDs* are reported separately by
        // `detect_duplicate_tokens`.
        let total_tokens = vocab.len();
        let unique_tokens = vocab.keys().len();
let mut total_length = 0;
let mut min_length = usize::MAX;
let mut max_length = 0;
let mut alphabetic_count = 0;
let mut numeric_count = 0;
let mut mixed_count = 0;
let mut special_char_count = 0;
let mut whitespace_count = 0;
for token in vocab.keys() {
let len = token.chars().count();
total_length += len;
min_length = min_length.min(len);
max_length = max_length.max(len);
if token.chars().all(|c| c.is_alphabetic()) {
alphabetic_count += 1;
} else if token.chars().all(|c| c.is_numeric()) {
numeric_count += 1;
} else if token.chars().any(|c| c.is_alphabetic())
&& token.chars().any(|c| c.is_numeric())
{
mixed_count += 1;
} else if token.chars().all(|c| c.is_whitespace()) {
whitespace_count += 1;
} else {
special_char_count += 1;
}
}
let avg_token_length =
if total_tokens > 0 { total_length as f64 / total_tokens as f64 } else { 0.0 };
VocabBasicStats {
total_tokens,
unique_tokens,
avg_token_length,
min_token_length: if min_length == usize::MAX { 0 } else { min_length },
max_token_length: max_length,
alphabetic_tokens: alphabetic_count,
numeric_tokens: numeric_count,
mixed_tokens: mixed_count,
special_char_tokens: special_char_count,
whitespace_tokens: whitespace_count,
}
}
fn analyze_frequency(&self, vocab: &HashMap<String, u32>) -> FrequencyAnalysis {
let mut token_freq: Vec<(String, u32)> = vocab
.iter()
.map(|(token, &_id)| {
let base_freq = self.estimate_token_frequency(token);
(token.clone(), base_freq)
})
.collect();
token_freq.sort_by_key(|item| std::cmp::Reverse(item.1));
let most_frequent = token_freq.iter().take(20).cloned().collect();
let least_frequent = token_freq.iter().rev().take(20).cloned().collect();
        // With `estimate_token_frequency` scores never drop this low, so this
        // list only fills up when real corpus counts are substituted for the
        // heuristic estimates.
        let singleton_tokens = token_freq
            .iter()
            .filter(|(_, freq)| *freq == 1)
            .map(|(token, _)| token.clone())
            .collect();
let mut frequency_histogram = BTreeMap::new();
for (_, freq) in &token_freq {
*frequency_histogram.entry(*freq).or_insert(0) += 1;
}
FrequencyAnalysis {
most_frequent,
least_frequent,
singleton_tokens,
frequency_histogram,
}
}
fn detect_duplicate_tokens(&self, vocab: &HashMap<String, u32>) -> Result<Vec<VocabIssue>> {
let mut id_to_tokens: HashMap<u32, Vec<String>> = HashMap::new();
for (token, &id) in vocab {
id_to_tokens.entry(id).or_default().push(token.clone());
}
let mut issues = Vec::new();
for (id, tokens) in id_to_tokens {
if tokens.len() > 1 {
issues.push(VocabIssue {
issue_type: VocabIssueType::DuplicateTokens,
severity: IssueSeverity::High,
description: format!("Multiple tokens share ID {}: {:?}", id, tokens),
affected_tokens: tokens,
suggestion: Some("Ensure each token has a unique ID".to_string()),
});
}
}
Ok(issues)
}
    /// Heuristic frequency estimate based on surface features of the token;
    /// no corpus statistics are consulted.
    fn estimate_token_frequency(&self, token: &str) -> u32 {
        let mut score = 1000u32;
        if token.chars().all(|c| c.is_ascii_alphabetic()) {
            score += 500;
        }
        // Short tokens tend to be frequent; very long ones are penalized.
        match token.chars().count() {
            1..=3 => score += 1000,
            4..=6 => score += 500,
            7..=10 => score += 100,
            _ => score /= 2,
        }
        if token.starts_with('<') && token.ends_with('>') {
            // Special tokens such as <pad> or <unk> appear in most sequences.
            score += 800;
        } else if token.contains("##") {
            // WordPiece-style continuation subwords.
            score += 300;
        } else if token.chars().all(|c| c.is_ascii_punctuation()) {
            score += 200;
        }
        let common_chars = ['e', 't', 'a', 'o', 'i', 'n', 's', 'h', 'r'];
        if token.chars().any(|c| common_chars.contains(&c.to_ascii_lowercase())) {
            score += 200;
        }
        // Deterministic per-token jitter so equal-scoring tokens still order stably.
        let hash_value =
            token.chars().fold(0u32, |acc, c| acc.wrapping_mul(31).wrapping_add(c as u32));
        score += hash_value % 200;
        score.max(1)
    }
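    /// Flags potentially rare tokens. Note that with the heuristic estimator
    /// above (scores start at 1000) the frequency test never fires, so in
    /// practice this keys off token length until real corpus counts are used.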
fn detect_rare_tokens(&self, vocab: &HashMap<String, u32>) -> Result<Vec<VocabIssue>> {
let rare_tokens: Vec<String> = vocab
.keys()
.filter(|token| {
let estimated_freq = self.estimate_token_frequency(token);
estimated_freq < 100 || token.len() > 20
})
.take(100)
.cloned()
.collect();
if !rare_tokens.is_empty() {
Ok(vec![VocabIssue {
issue_type: VocabIssueType::RareTokens,
severity: IssueSeverity::Low,
description: format!("Found {} potentially rare tokens", rare_tokens.len()),
affected_tokens: rare_tokens,
suggestion: Some(
"Consider removing very rare tokens to reduce vocabulary size".to_string(),
),
}])
} else {
Ok(Vec::new())
}
}
fn detect_long_tokens(&self, vocab: &HashMap<String, u32>) -> Result<Vec<VocabIssue>> {
let long_tokens: Vec<String> = vocab
.keys()
.filter(|token| token.chars().count() > self.config.max_token_length)
.cloned()
.collect();
if !long_tokens.is_empty() {
Ok(vec![VocabIssue {
issue_type: VocabIssueType::LongTokens,
severity: IssueSeverity::Medium,
description: format!(
"Found {} tokens exceeding maximum length of {}",
long_tokens.len(),
self.config.max_token_length
),
affected_tokens: long_tokens,
suggestion: Some("Consider truncating or removing very long tokens".to_string()),
}])
} else {
Ok(Vec::new())
}
}
fn detect_encoding_issues(&self, vocab: &HashMap<String, u32>) -> Result<Vec<VocabIssue>> {
let mut issues = Vec::new();
let mut invalid_utf8_tokens = Vec::new();
let mut mojibake_tokens = Vec::new();
        for token in vocab.keys() {
            // A Rust `String` is always valid UTF-8 (a `char` can never exceed
            // U+10FFFF), so the practical signal of a decoding problem is
            // U+FFFD, the replacement character inserted by lossy conversions.
            if token.contains('\u{FFFD}') {
                invalid_utf8_tokens.push(token.clone());
            }
            // "Ã", "â", and "Â" are common artifacts of UTF-8 bytes decoded as Latin-1.
            if token.contains('Ã') || token.contains('â') || token.contains('Â') {
                mojibake_tokens.push(token.clone());
            }
        }
if !invalid_utf8_tokens.is_empty() {
issues.push(VocabIssue {
issue_type: VocabIssueType::InvalidUtf8,
severity: IssueSeverity::Critical,
description: "Found tokens with invalid UTF-8 sequences".to_string(),
affected_tokens: invalid_utf8_tokens,
suggestion: Some("Fix encoding issues before tokenization".to_string()),
});
}
if !mojibake_tokens.is_empty() {
issues.push(VocabIssue {
issue_type: VocabIssueType::EncodingIssues,
severity: IssueSeverity::High,
description: "Found tokens with potential mojibake patterns".to_string(),
affected_tokens: mojibake_tokens,
suggestion: Some("Check for encoding issues in source data".to_string()),
});
}
Ok(issues)
}
    /// Flags token pairs whose normalized edit similarity exceeds 0.9.
    /// Pairwise comparison is O(n²) in vocabulary size, so this is intended
    /// for debugging runs rather than hot paths.
    fn detect_near_duplicates(&self, vocab: &HashMap<String, u32>) -> Result<Vec<VocabIssue>> {
        let mut near_duplicates = Vec::new();
        let tokens: Vec<&String> = vocab.keys().collect();
        for i in 0..tokens.len() {
            for j in (i + 1)..tokens.len() {
let similarity = self.calculate_similarity(tokens[i], tokens[j]);
if similarity > 0.9 && similarity < 1.0 {
near_duplicates.push(vec![tokens[i].clone(), tokens[j].clone()]);
}
}
}
if !near_duplicates.is_empty() {
let affected_tokens: Vec<String> = near_duplicates.iter().flatten().cloned().collect();
Ok(vec![VocabIssue {
issue_type: VocabIssueType::NearDuplicates,
severity: IssueSeverity::Medium,
description: format!(
"Found {} pairs of near-duplicate tokens",
near_duplicates.len()
),
affected_tokens,
suggestion: Some(
"Review near-duplicate tokens and consider merging or removing".to_string(),
),
}])
} else {
Ok(Vec::new())
}
}
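    /// Normalized similarity in [0, 1]: one minus the edit distance divided
    /// by the longer length.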
fn calculate_similarity(&self, s1: &str, s2: &str) -> f64 {
let len1 = s1.chars().count();
let len2 = s2.chars().count();
if len1 == 0 && len2 == 0 {
return 1.0;
}
let max_len = len1.max(len2);
let distance = self.levenshtein_distance(s1, s2);
1.0 - (distance as f64 / max_len as f64)
}
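    /// Classic dynamic-programming edit distance over Unicode scalar values,
    /// O(len1 * len2) in time and space.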
fn levenshtein_distance(&self, s1: &str, s2: &str) -> usize {
let chars1: Vec<char> = s1.chars().collect();
let chars2: Vec<char> = s2.chars().collect();
let len1 = chars1.len();
let len2 = chars2.len();
let mut matrix = vec![vec![0; len2 + 1]; len1 + 1];
for (i, row) in matrix.iter_mut().enumerate().take(len1 + 1) {
row[0] = i;
}
for (j, val) in matrix[0].iter_mut().enumerate().take(len2 + 1) {
*val = j;
}
for i in 1..=len1 {
for j in 1..=len2 {
let cost = if chars1[i - 1] == chars2[j - 1] { 0 } else { 1 };
matrix[i][j] = (matrix[i - 1][j] + 1)
.min(matrix[i][j - 1] + 1)
.min(matrix[i - 1][j - 1] + cost);
}
}
matrix[len1][len2]
}
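    /// Buckets every token into one surface-shape class, keeping up to ten
    /// example tokens per class.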
fn analyze_character_patterns(
&self,
vocab: &HashMap<String, u32>,
) -> Result<Vec<CharacterPattern>> {
let mut patterns = HashMap::new();
for token in vocab.keys() {
let pattern_type = if token.chars().all(|c| c.is_alphabetic()) {
"alphabetic"
} else if token.chars().all(|c| c.is_numeric()) {
"numeric"
} else if token.chars().all(|c| c.is_alphanumeric()) {
"alphanumeric"
} else if token.starts_with('#') {
"hashtag"
} else if token.starts_with('@') {
"mention"
} else if token.contains('_') {
"underscore"
} else if token.contains('-') {
"hyphenated"
} else {
"mixed"
};
let entry = patterns.entry(pattern_type.to_string()).or_insert_with(|| (0, Vec::new()));
entry.0 += 1;
if entry.1.len() < 10 {
entry.1.push(token.clone());
}
}
let total_tokens = vocab.len() as f64;
let mut result = Vec::new();
for (pattern, (count, examples)) in patterns {
result.push(CharacterPattern {
pattern,
count,
examples,
frequency: count as f64 / total_tokens,
});
}
result.sort_by_key(|item| std::cmp::Reverse(item.count));
Ok(result)
}
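    /// Counts all 2-4 character substrings across the vocabulary and keeps
    /// the 50 most frequent ones that occur at least three times.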
fn analyze_subword_patterns(
&self,
vocab: &HashMap<String, u32>,
) -> Result<Vec<SubwordPattern>> {
let mut subword_counts: HashMap<String, (usize, Vec<String>, HashMap<String, usize>)> =
HashMap::new();
        for token in vocab.keys() {
            let token_len = token.chars().count();
            // Enumerate every 2- to 4-character substring of the token.
            for len in 2..=4.min(token_len) {
                for start in 0..=(token_len.saturating_sub(len)) {
                    let subword: String = token.chars().skip(start).take(len).collect();
                    let position_type = if start == 0 {
                        "prefix"
                    } else if start + len == token_len {
                        "suffix"
                    } else {
                        "infix"
                    };
let entry = subword_counts
.entry(subword)
.or_insert_with(|| (0, Vec::new(), HashMap::new()));
entry.0 += 1;
if entry.1.len() < 5 {
entry.1.push(token.clone());
}
*entry.2.entry(position_type.to_string()).or_insert(0) += 1;
}
}
}
        let mut result: Vec<SubwordPattern> = subword_counts
            .into_iter()
            .filter(|(_, (count, _, _))| *count >= 3)
            .map(|(pattern, (count, tokens, positions))| SubwordPattern {
                pattern,
                count,
                tokens,
                positions,
            })
            .collect();
        result.sort_by_key(|item| std::cmp::Reverse(item.count));
        result.truncate(50);
        Ok(result)
}
fn calculate_length_distribution(
&self,
vocab: &HashMap<String, u32>,
) -> BTreeMap<usize, usize> {
let mut distribution = BTreeMap::new();
for token in vocab.keys() {
let length = token.chars().count();
*distribution.entry(length).or_insert(0) += 1;
}
distribution
}
fn detect_language_distribution(
&self,
vocab: &HashMap<String, u32>,
) -> Result<Vec<LanguageDistribution>> {
let mut language_counts = HashMap::new();
for token in vocab.keys() {
let language = self.detect_token_language(token);
*language_counts.entry(language).or_insert(0) += 1;
}
let total_tokens = vocab.len() as f64;
let mut distribution: Vec<LanguageDistribution> = language_counts
.into_iter()
.map(|(language, count)| {
let confidence = self.calculate_language_confidence(&language, count, total_tokens);
LanguageDistribution {
language,
token_count: count,
percentage: (count as f64 / total_tokens) * 100.0,
confidence,
}
})
.collect();
distribution.sort_by_key(|item| std::cmp::Reverse(item.token_count));
Ok(distribution)
}
    /// First-match script heuristic: the first character belonging to a
    /// recognized script decides the language, so mixed-script tokens resolve
    /// to whichever script appears first.
    fn detect_token_language(&self, token: &str) -> String {
        for ch in token.chars() {
            match ch {
'a'..='z' | 'A'..='Z' => return "en".to_string(),
'α'..='ω' | 'Α'..='Ω' => return "el".to_string(),
'а'..='я' | 'А'..='Я' => return "ru".to_string(),
'一'..='龯' => return "zh".to_string(),
'ひ'..='ゖ' | 'ア'..='ヶ' => return "ja".to_string(),
'가'..='힣' => return "ko".to_string(),
'ا'..='ي' => return "ar".to_string(),
_ => continue,
}
}
"unknown".to_string()
}
fn calculate_language_confidence(
&self,
language: &str,
count: usize,
total_tokens: f64,
) -> f64 {
let percentage = (count as f64 / total_tokens) * 100.0;
let mut confidence: f64 = match percentage {
p if p >= 50.0 => 0.95,
p if p >= 20.0 => 0.85,
p if p >= 10.0 => 0.75,
p if p >= 5.0 => 0.65,
p if p >= 1.0 => 0.55,
_ => 0.45,
};
        match language {
            "unknown" => confidence *= 0.3,
            "en" => confidence *= 1.1,
            "zh" | "ja" | "ko" | "ar" | "hi" | "th" => confidence *= 1.2,
            _ => {},
        }
confidence.clamp(0.1, 1.0)
}
fn analyze_oov_patterns(&self, oov_tokens: &[String]) -> Vec<String> {
let mut pattern_counts = HashMap::new();
for token in oov_tokens {
if token.chars().all(|c| c.is_numeric()) {
*pattern_counts.entry("all_numeric".to_string()).or_insert(0) += 1;
} else if token.contains('@') {
*pattern_counts.entry("email_like".to_string()).or_insert(0) += 1;
} else if token.starts_with("http") {
*pattern_counts.entry("url_like".to_string()).or_insert(0) += 1;
} else if !token.is_ascii() {
*pattern_counts.entry("non_ascii".to_string()).or_insert(0) += 1;
} else if token.len() > 15 {
*pattern_counts.entry("very_long".to_string()).or_insert(0) += 1;
} else {
*pattern_counts.entry("other".to_string()).or_insert(0) += 1;
}
}
let mut patterns: Vec<(String, usize)> = pattern_counts.into_iter().collect();
patterns.sort_by_key(|item| std::cmp::Reverse(item.1));
patterns.into_iter().map(|(pattern, _)| pattern).collect()
}
fn generate_recommendations(&self, analysis: &VocabAnalysisResult) -> Vec<String> {
let mut recommendations = Vec::new();
if analysis.basic_stats.total_tokens > 100000 {
recommendations
.push("Consider reducing vocabulary size for better efficiency".to_string());
}
for issue in &analysis.issues {
match issue.severity {
IssueSeverity::Critical | IssueSeverity::High | IssueSeverity::Medium => {
if let Some(ref suggestion) = issue.suggestion {
recommendations.push(suggestion.clone());
}
},
_ => {},
}
}
if analysis.basic_stats.avg_token_length > 10.0 {
recommendations.push(
"Average token length is high; consider more aggressive subword tokenization"
.to_string(),
);
}
if analysis.frequency_analysis.singleton_tokens.len()
> analysis.basic_stats.total_tokens / 10
{
recommendations.push(
"Many singleton tokens detected; consider increasing minimum frequency threshold"
.to_string(),
);
}
if analysis.language_distribution.len() > 5 {
recommendations.push(
"Multiple languages detected; consider language-specific vocabularies".to_string(),
);
}
recommendations
}
}
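/// Stateless helpers for interactive vocabulary debugging.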
pub struct VocabDebugUtils;
impl VocabDebugUtils {
pub fn find_similar_tokens(
target: &str,
vocab: &HashMap<String, u32>,
threshold: f64,
) -> Vec<(String, f64)> {
let analyzer = VocabAnalyzer::default();
let mut similar = Vec::new();
for token in vocab.keys() {
let similarity = analyzer.calculate_similarity(target, token);
if similarity >= threshold && token != target {
similar.push((token.clone(), similarity));
}
}
similar.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(Ordering::Equal));
similar
}
pub fn find_tokens_with_pattern(pattern: &str, vocab: &HashMap<String, u32>) -> Vec<String> {
vocab.keys().filter(|token| token.contains(pattern)).cloned().collect()
}
pub fn generate_summary_report(analysis: &VocabAnalysisResult) -> String {
let mut report = String::new();
report.push_str("=== VOCABULARY ANALYSIS SUMMARY ===\n\n");
report.push_str(&format!(
"Total tokens: {}\n",
analysis.basic_stats.total_tokens
));
report.push_str(&format!(
"Average token length: {:.2}\n",
analysis.basic_stats.avg_token_length
));
report.push_str(&format!(
"Token length range: {} - {}\n",
analysis.basic_stats.min_token_length, analysis.basic_stats.max_token_length
));
let critical_issues =
analysis.issues.iter().filter(|i| i.severity == IssueSeverity::Critical).count();
let high_issues =
analysis.issues.iter().filter(|i| i.severity == IssueSeverity::High).count();
let medium_issues =
analysis.issues.iter().filter(|i| i.severity == IssueSeverity::Medium).count();
report.push_str(&format!(
"\nIssues found: {} critical, {} high, {} medium\n",
critical_issues, high_issues, medium_issues
));
if !analysis.character_patterns.is_empty() {
report.push_str("\nTop character patterns:\n");
for pattern in analysis.character_patterns.iter().take(3) {
report.push_str(&format!(
" {}: {} tokens ({:.1}%)\n",
pattern.pattern,
pattern.count,
pattern.frequency * 100.0
));
}
}
if !analysis.recommendations.is_empty() {
report.push_str("\nRecommendations:\n");
for rec in analysis.recommendations.iter().take(5) {
report.push_str(&format!(" • {}\n", rec));
}
}
report
}
}
#[cfg(test)]
mod tests {
use super::*;
fn create_test_vocab() -> HashMap<String, u32> {
let mut vocab = HashMap::new();
vocab.insert("hello".to_string(), 1);
vocab.insert("world".to_string(), 2);
vocab.insert("test".to_string(), 3);
vocab.insert("very_long_token_that_exceeds_normal_length".to_string(), 4);
vocab.insert("123".to_string(), 5);
vocab.insert("hello_world".to_string(), 6);
vocab.insert("test123".to_string(), 7);
vocab.insert("@mention".to_string(), 8);
vocab.insert("#hashtag".to_string(), 9);
vocab.insert("helo".to_string(), 10); vocab
}
#[test]
fn test_vocab_analyzer_creation() {
let config = VocabAnalysisConfig::default();
let analyzer = VocabAnalyzer::new(config);
assert!(analyzer.config.analyze_character_patterns);
}
#[test]
fn test_basic_stats_calculation() {
let vocab = create_test_vocab();
let analyzer = VocabAnalyzer::default();
let stats = analyzer.calculate_basic_stats(&vocab);
assert_eq!(stats.total_tokens, 10);
assert_eq!(stats.unique_tokens, 10);
assert!(stats.avg_token_length > 0.0);
assert!(stats.alphabetic_tokens > 0);
assert!(stats.numeric_tokens > 0);
}
#[test]
fn test_vocabulary_analysis() {
let vocab = create_test_vocab();
let analyzer = VocabAnalyzer::default();
let result = analyzer.analyze_vocabulary(&vocab).expect("Operation failed in test");
assert_eq!(result.basic_stats.total_tokens, 10);
assert!(!result.character_patterns.is_empty());
assert!(!result.length_distribution.is_empty());
assert!(!result.language_distribution.is_empty());
}
#[test]
fn test_long_token_detection() {
let vocab = create_test_vocab();
let config = VocabAnalysisConfig {
max_token_length: 10,
..Default::default()
};
let analyzer = VocabAnalyzer::new(config);
let issues = analyzer.detect_long_tokens(&vocab).expect("Operation failed in test");
assert!(!issues.is_empty());
assert_eq!(issues[0].issue_type, VocabIssueType::LongTokens);
}
#[test]
fn test_similarity_calculation() {
let analyzer = VocabAnalyzer::default();
assert_eq!(analyzer.calculate_similarity("hello", "hello"), 1.0);
assert!(analyzer.calculate_similarity("hello", "helo") >= 0.8);
assert!(analyzer.calculate_similarity("hello", "world") < 0.5);
}
#[test]
fn test_character_pattern_analysis() {
let vocab = create_test_vocab();
let analyzer = VocabAnalyzer::default();
let patterns =
analyzer.analyze_character_patterns(&vocab).expect("Operation failed in test");
assert!(!patterns.is_empty());
assert!(patterns.iter().any(|p| p.pattern == "alphabetic"));
assert!(patterns.iter().any(|p| p.pattern == "numeric"));
}
#[test]
fn test_language_detection() {
let analyzer = VocabAnalyzer::default();
assert_eq!(analyzer.detect_token_language("hello"), "en");
assert_eq!(analyzer.detect_token_language("123"), "unknown");
assert_eq!(analyzer.detect_token_language("привет"), "ru");
}
#[test]
fn test_subword_pattern_analysis() {
let vocab = create_test_vocab();
let analyzer = VocabAnalyzer::default();
let patterns = analyzer.analyze_subword_patterns(&vocab).expect("Operation failed in test");
assert!(!patterns.is_empty());
}
#[test]
fn test_debug_utils() {
let vocab = create_test_vocab();
let similar = VocabDebugUtils::find_similar_tokens("hello", &vocab, 0.8);
assert!(!similar.is_empty());
assert!(similar.iter().any(|(token, _)| token == "helo"));
let pattern_tokens = VocabDebugUtils::find_tokens_with_pattern("test", &vocab);
assert!(pattern_tokens.contains(&"test".to_string()));
assert!(pattern_tokens.contains(&"test123".to_string()));
}
#[test]
fn test_frequency_analysis() {
let vocab = create_test_vocab();
let analyzer = VocabAnalyzer::default();
let freq_analysis = analyzer.analyze_frequency(&vocab);
assert!(!freq_analysis.most_frequent.is_empty());
assert!(!freq_analysis.least_frequent.is_empty());
assert!(!freq_analysis.frequency_histogram.is_empty());
}
#[test]
fn test_recommendations_generation() {
let mut vocab = HashMap::new();
vocab.insert("hello".to_string(), 1);
vocab.insert("world".to_string(), 2);
vocab.insert("this_is_a_very_long_token_that_definitely_exceeds_the_default_maximum_token_length_of_one_hundred_characters_and_should_trigger_a_recommendation".to_string(), 3);
for i in 4..20 {
vocab.insert(format!("singleton_token_{}", i), i);
}
let analyzer = VocabAnalyzer::default();
let result = analyzer.analyze_vocabulary(&vocab).expect("Operation failed in test");
assert!(!result.recommendations.is_empty());
assert!(result.recommendations.iter().any(|rec| rec.contains("long tokens")));
}
#[test]
fn test_summary_report() {
let vocab = create_test_vocab();
let analyzer = VocabAnalyzer::default();
let result = analyzer.analyze_vocabulary(&vocab).expect("Operation failed in test");
let report = VocabDebugUtils::generate_summary_report(&result);
assert!(report.contains("VOCABULARY ANALYSIS SUMMARY"));
assert!(report.contains("Total tokens"));
}
}