Skip to main content

trustformers_tokenizers/
tokenization_debugger.rs

1use crate::tokenizer::TokenizerWrapper;
2use crate::visualization::{TokenVisualizer, VisualizationConfig};
3use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5use trustformers_core::errors::{Result, TrustformersError};
6use trustformers_core::traits::Tokenizer;
7
/// Comprehensive tokenization debugger for analyzing tokenization behavior
pub struct TokenizationDebugger {
    /// Registered tokenizers, keyed by a caller-chosen display name.
    tokenizers: HashMap<String, TokenizerWrapper>,
    /// Past debug sessions, oldest first; capped at `config.max_history_size`.
    history: Vec<DebugSession>,
    /// Tunable behavior for debugging runs.
    config: DebuggerConfig,
}
14
/// Configuration for the tokenization debugger
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DebuggerConfig {
    /// Maximum number of sessions to keep in history
    pub max_history_size: usize,

    /// Whether to automatically analyze common issues
    pub auto_analyze_issues: bool,

    /// Whether to show detailed character-level information
    pub show_character_details: bool,

    /// Whether to compare with reference tokenizers
    pub enable_comparison: bool,

    /// Maximum text length to debug (for performance)
    /// Note: compared against `text.len()`, so this is a UTF-8 byte count.
    pub max_text_length: usize,
}
33
impl Default for DebuggerConfig {
    /// Defaults: keep 100 sessions, run all analyses, and cap input at
    /// 10,000 bytes.
    fn default() -> Self {
        Self {
            max_history_size: 100,
            auto_analyze_issues: true,
            show_character_details: true,
            enable_comparison: true,
            max_text_length: 10000,
        }
    }
}
45
/// A debugging session containing input text and analysis results
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DebugSession {
    /// Session id, formatted as `debug_<unix-seconds>`.
    pub id: String,
    /// The exact text that was tokenized.
    pub input_text: String,
    /// Unix timestamp (seconds) of when the session was created.
    pub timestamp: u64,
    /// Per-tokenizer results, keyed by the tokenizer's registered name.
    pub tokenizer_results: HashMap<String, TokenizationResult>,
    /// Cross-tokenizer statistics derived from the results.
    pub analysis: DebugAnalysis,
    /// Issues found by the automatic heuristics (empty when auto-analysis
    /// is disabled).
    pub issues: Vec<DetectedIssue>,
}
56
/// Results from tokenizing with a specific tokenizer
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TokenizationResult {
    /// Name the tokenizer was registered under.
    pub tokenizer_name: String,
    /// Decoded token strings (ids with no known string are skipped).
    pub tokens: Vec<String>,
    /// Raw token ids produced by the tokenizer.
    pub token_ids: Vec<u32>,
    /// Number of token ids produced.
    pub token_count: usize,
    /// Input length as reported by `text.len()` (UTF-8 bytes).
    pub character_count: usize,
    /// Tokens per input byte; lower means better compression.
    pub compression_ratio: f64,
    /// Wall-clock encoding time in milliseconds.
    pub processing_time_ms: f64,
    /// Per-token character offsets; currently always `None` (the encode
    /// result does not carry offsets).
    pub character_offsets: Option<Vec<(usize, usize)>>,
    /// Tokens that look like unknown-token markers (heuristic).
    pub oov_tokens: Vec<String>,
    /// Tokens that look like special/marker tokens (heuristic).
    pub special_tokens: Vec<String>,
}
71
/// Analysis of tokenization behavior across tokenizers
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DebugAnalysis {
    /// Number of tokenizers that participated in the session.
    pub total_tokenizers: usize,
    /// Tokens emitted by at least ~70% of tokenizers.
    pub consensus_tokens: Vec<String>,
    /// Tokens emitted by fewer than ~70% of tokenizers.
    pub disagreement_tokens: Vec<String>,
    /// Aggregate compression-ratio statistics.
    pub compression_stats: CompressionStats,
    /// Aggregate timing statistics.
    pub performance_stats: PerformanceStats,
    /// Character-level breakdown of the input text.
    pub character_analysis: CharacterAnalysis,
    /// Token shape/affix pattern summary.
    pub pattern_analysis: PatternAnalysis,
}
83
/// Statistics about compression ratios across tokenizers
///
/// Ratios are tokens per input byte, so lower values indicate better
/// compression.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompressionStats {
    /// Smallest (best) observed ratio.
    pub min_ratio: f64,
    /// Largest (worst) observed ratio.
    pub max_ratio: f64,
    /// Mean ratio across tokenizers.
    pub avg_ratio: f64,
    /// Population standard deviation of the ratios.
    pub std_deviation: f64,
    /// Tokenizer with the lowest ratio.
    pub best_tokenizer: String,
    /// Tokenizer with the highest ratio.
    pub worst_tokenizer: String,
}
94
/// Performance statistics across tokenizers
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceStats {
    /// Fastest observed encode time, in milliseconds.
    pub min_time_ms: f64,
    /// Slowest observed encode time, in milliseconds.
    pub max_time_ms: f64,
    /// Mean encode time, in milliseconds.
    pub avg_time_ms: f64,
    /// Tokenizer with the lowest processing time.
    pub fastest_tokenizer: String,
    /// Tokenizer with the highest processing time.
    pub slowest_tokenizer: String,
}
104
/// Analysis of character-level behavior
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CharacterAnalysis {
    /// Total character count of the analyzed text.
    pub total_characters: usize,
    /// Number of distinct characters seen.
    pub unique_characters: usize,
    /// Occurrences of each character.
    pub character_frequency: HashMap<char, usize>,
    /// Characters flagged as potentially problematic (control characters
    /// and code points beyond the Basic Multilingual Plane), deduplicated.
    pub problematic_characters: Vec<char>,
    /// Counts per coarse Unicode category bucket (e.g. "ASCII Letter").
    pub unicode_categories: HashMap<String, usize>,
}
114
/// Analysis of tokenization patterns
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PatternAnalysis {
    /// Most frequent (up to 10) two-character token prefixes with counts.
    pub common_prefixes: Vec<(String, usize)>,
    /// Most frequent (up to 10) two-character token suffixes with counts.
    pub common_suffixes: Vec<(String, usize)>,
    /// Map from token length to number of tokens of that length.
    pub token_length_distribution: HashMap<usize, usize>,
    /// Most frequent (up to 20) subword-marker tokens (`##…`, `▁…`, `…@@`)
    /// with counts.
    pub subword_patterns: Vec<(String, usize)>,
}
123
/// Types of issues that can be detected during tokenization
///
/// Note: `detect_issues` currently emits only the variance, compression,
/// performance, and Unicode variants; the remaining variants are defined
/// for additional checks.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum IssueType {
    /// High variance in token count across tokenizers
    HighTokenCountVariance,

    /// Many OOV (out-of-vocabulary) tokens
    HighOOVRate,

    /// Poor compression ratio
    PoorCompression,

    /// Slow tokenization performance
    SlowPerformance,

    /// Inconsistent tokenization across similar texts
    InconsistentTokenization,

    /// Problematic Unicode handling
    UnicodeIssues,

    /// Unexpected special token behavior
    SpecialTokenIssues,

    /// Token boundary issues
    BoundaryIssues,
}
151
/// A detected issue with suggested solutions
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DetectedIssue {
    /// Which heuristic fired.
    pub issue_type: IssueType,
    /// How serious the issue is considered to be.
    pub severity: IssueSeverity,
    /// Human-readable summary of the finding.
    pub description: String,
    /// Names of the tokenizers the issue applies to.
    pub affected_tokenizers: Vec<String>,
    /// Actionable remediation hints.
    pub suggestions: Vec<String>,
    /// Example values illustrating the issue (may be empty).
    pub examples: Vec<String>,
}
162
/// Severity levels for detected issues, from least (`Low`) to most
/// (`Critical`) severe.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum IssueSeverity {
    Low,
    Medium,
    High,
    Critical,
}
171
impl Default for TokenizationDebugger {
    /// Equivalent to [`TokenizationDebugger::new`]: no tokenizers, empty
    /// history, default configuration.
    fn default() -> Self {
        Self::new()
    }
}
177
178impl TokenizationDebugger {
179    /// Create a new tokenization debugger
180    pub fn new() -> Self {
181        Self {
182            tokenizers: HashMap::new(),
183            history: Vec::new(),
184            config: DebuggerConfig::default(),
185        }
186    }
187
188    /// Create debugger with custom configuration
189    pub fn with_config(config: DebuggerConfig) -> Self {
190        Self {
191            tokenizers: HashMap::new(),
192            history: Vec::new(),
193            config,
194        }
195    }
196
    /// Add a tokenizer to the debugger
    ///
    /// Registers `tokenizer` under `name`; an existing tokenizer with the
    /// same name is silently replaced.
    pub fn add_tokenizer(&mut self, name: String, tokenizer: TokenizerWrapper) {
        self.tokenizers.insert(name, tokenizer);
    }
201
    /// Remove a tokenizer from the debugger
    ///
    /// Returns the removed tokenizer, or `None` if nothing was registered
    /// under `name`.
    pub fn remove_tokenizer(&mut self, name: &str) -> Option<TokenizerWrapper> {
        self.tokenizers.remove(name)
    }
206
207    /// List all available tokenizers
208    pub fn list_tokenizers(&self) -> Vec<String> {
209        self.tokenizers.keys().cloned().collect()
210    }
211
212    /// Debug tokenization of input text with all registered tokenizers
213    pub fn debug_text(&mut self, text: &str) -> Result<DebugSession> {
214        if text.len() > self.config.max_text_length {
215            return Err(TrustformersError::invalid_input(format!(
216                "Text too long: {} characters (max: {})",
217                text.len(),
218                self.config.max_text_length
219            )));
220        }
221
222        let session_id = format!("debug_{}", chrono::Utc::now().timestamp());
223        let mut tokenizer_results = HashMap::new();
224
225        // Tokenize with each registered tokenizer
226        for (name, tokenizer) in &self.tokenizers {
227            let start_time = std::time::Instant::now();
228
229            match tokenizer.encode(text) {
230                Ok(result) => {
231                    let processing_time = start_time.elapsed().as_secs_f64() * 1000.0;
232
233                    let compression_ratio = if !text.is_empty() {
234                        result.input_ids.len() as f64 / text.len() as f64
235                    } else {
236                        0.0
237                    };
238
239                    // Analyze OOV tokens (simplified - would need tokenizer vocab access)
240                    let tokens: Vec<String> = result
241                        .input_ids
242                        .iter()
243                        .filter_map(|&id| tokenizer.id_to_token(id))
244                        .collect();
245                    let oov_tokens = self.find_oov_tokens(&tokens, tokenizer);
246                    let special_tokens = self.find_special_tokens(&tokens, tokenizer);
247
248                    let tokenization_result = TokenizationResult {
249                        tokenizer_name: name.clone(),
250                        tokens: result
251                            .input_ids
252                            .iter()
253                            .filter_map(|&id| tokenizer.id_to_token(id))
254                            .collect(),
255                        token_ids: result.input_ids.clone(),
256                        token_count: result.input_ids.len(),
257                        character_count: text.len(),
258                        compression_ratio,
259                        processing_time_ms: processing_time,
260                        character_offsets: None, // TokenizedInput doesn't have offsets
261                        oov_tokens,
262                        special_tokens,
263                    };
264
265                    tokenizer_results.insert(name.clone(), tokenization_result);
266                },
267                Err(e) => {
268                    // Create error result
269                    let tokenization_result = TokenizationResult {
270                        tokenizer_name: name.clone(),
271                        tokens: vec![format!("ERROR: {}", e)],
272                        token_ids: vec![],
273                        token_count: 0,
274                        character_count: text.len(),
275                        compression_ratio: 0.0,
276                        processing_time_ms: start_time.elapsed().as_secs_f64() * 1000.0,
277                        character_offsets: None,
278                        oov_tokens: vec![],
279                        special_tokens: vec![],
280                    };
281
282                    tokenizer_results.insert(name.clone(), tokenization_result);
283                },
284            }
285        }
286
287        // Perform analysis
288        let analysis = self.analyze_results(&tokenizer_results, text);
289        let issues = if self.config.auto_analyze_issues {
290            self.detect_issues(&tokenizer_results, &analysis)
291        } else {
292            vec![]
293        };
294
295        let session = DebugSession {
296            id: session_id,
297            input_text: text.to_string(),
298            timestamp: chrono::Utc::now().timestamp() as u64,
299            tokenizer_results,
300            analysis,
301            issues,
302        };
303
304        // Add to history
305        self.history.push(session.clone());
306        if self.history.len() > self.config.max_history_size {
307            self.history.remove(0);
308        }
309
310        Ok(session)
311    }
312
313    /// Compare tokenization across multiple texts
314    pub fn compare_texts(&mut self, texts: &[String]) -> Result<Vec<DebugSession>> {
315        let mut sessions = Vec::new();
316
317        for text in texts {
318            let session = self.debug_text(text)?;
319            sessions.push(session);
320        }
321
322        Ok(sessions)
323    }
324
    /// Get debugging history
    ///
    /// Sessions are ordered oldest-first and capped at
    /// `config.max_history_size`.
    pub fn get_history(&self) -> &[DebugSession] {
        &self.history
    }
329
330    /// Get a specific session by ID
331    pub fn get_session(&self, session_id: &str) -> Option<&DebugSession> {
332        self.history.iter().find(|s| s.id == session_id)
333    }
334
335    /// Generate a detailed debug report
336    pub fn generate_report(&self, session_id: &str) -> Result<String> {
337        let session = self.get_session(session_id).ok_or_else(|| {
338            TrustformersError::invalid_input(format!("Session not found: {}", session_id))
339        })?;
340
341        let mut report = String::new();
342
343        report.push_str("# Tokenization Debug Report\n");
344        report.push_str(&format!("Session ID: {}\n", session.id));
345        report.push_str(&format!("Timestamp: {}\n", session.timestamp));
346        report.push_str(&format!(
347            "Input Length: {} characters\n\n",
348            session.input_text.len()
349        ));
350
351        report.push_str("## Input Text\n");
352        report.push_str(&format!("```\n{}\n```\n\n", session.input_text));
353
354        report.push_str("## Tokenizer Results\n");
355        for (name, result) in &session.tokenizer_results {
356            report.push_str(&format!("### {}\n", name));
357            report.push_str(&format!("- Tokens: {}\n", result.token_count));
358            report.push_str(&format!("- Compression: {:.3}\n", result.compression_ratio));
359            report.push_str(&format!("- Time: {:.2}ms\n", result.processing_time_ms));
360            report.push_str(&format!("- OOV Tokens: {}\n", result.oov_tokens.len()));
361            report.push_str(&format!(
362                "- Special Tokens: {}\n",
363                result.special_tokens.len()
364            ));
365            report.push('\n');
366        }
367
368        report.push_str("## Analysis\n");
369        let analysis = &session.analysis;
370        report.push_str(&format!(
371            "- Total Tokenizers: {}\n",
372            analysis.total_tokenizers
373        ));
374        report.push_str(&format!(
375            "- Consensus Tokens: {}\n",
376            analysis.consensus_tokens.len()
377        ));
378        report.push_str(&format!(
379            "- Disagreement Tokens: {}\n",
380            analysis.disagreement_tokens.len()
381        ));
382        report.push_str(&format!(
383            "- Best Compression: {} ({:.3})\n",
384            analysis.compression_stats.best_tokenizer, analysis.compression_stats.min_ratio
385        ));
386        report.push_str(&format!(
387            "- Fastest: {} ({:.2}ms)\n",
388            analysis.performance_stats.fastest_tokenizer, analysis.performance_stats.min_time_ms
389        ));
390        report.push('\n');
391
392        if !session.issues.is_empty() {
393            report.push_str("## Detected Issues\n");
394            for (i, issue) in session.issues.iter().enumerate() {
395                report.push_str(&format!("### Issue {}: {:?}\n", i + 1, issue.issue_type));
396                report.push_str(&format!("**Severity:** {:?}\n", issue.severity));
397                report.push_str(&format!("**Description:** {}\n", issue.description));
398                report.push_str(&format!(
399                    "**Affected Tokenizers:** {}\n",
400                    issue.affected_tokenizers.join(", ")
401                ));
402                report.push_str("**Suggestions:**\n");
403                for suggestion in &issue.suggestions {
404                    report.push_str(&format!("- {}\n", suggestion));
405                }
406                report.push('\n');
407            }
408        }
409
410        Ok(report)
411    }
412
413    /// Generate HTML visualization of tokenization
414    pub fn generate_html_visualization(&self, session_id: &str) -> Result<String> {
415        let session = self.get_session(session_id).ok_or_else(|| {
416            TrustformersError::invalid_input(format!("Session not found: {}", session_id))
417        })?;
418
419        // Use the existing visualization module
420        let config = VisualizationConfig::default();
421        let _visualizer = TokenVisualizer::new(config);
422
423        // Generate visualization for each tokenizer
424        let mut html = String::new();
425        html.push_str("<!DOCTYPE html><html><head><title>Tokenization Debug</title>");
426        html.push_str("<style>body{font-family:Arial,sans-serif;margin:20px;}");
427        html.push_str(".tokenizer{margin-bottom:30px;border:1px solid #ccc;padding:15px;}");
428        html.push_str(".token{display:inline-block;margin:2px;padding:4px 8px;border:1px solid #999;background:#f0f0f0;}");
429        html.push_str("</style></head><body>");
430
431        html.push_str("<h1>Tokenization Debug Report</h1>");
432        html.push_str(&format!(
433            "<p><strong>Input:</strong> {}</p>",
434            session.input_text
435        ));
436
437        for (name, result) in &session.tokenizer_results {
438            html.push_str(&format!("<div class='tokenizer'><h2>{}</h2>", name));
439            html.push_str(&format!(
440                "<p>Tokens: {} | Compression: {:.3} | Time: {:.2}ms</p>",
441                result.token_count, result.compression_ratio, result.processing_time_ms
442            ));
443
444            html.push_str("<div>");
445            for token in &result.tokens {
446                html.push_str(&format!(
447                    "<span class='token'>{}</span>",
448                    html_escape(token)
449                ));
450            }
451            html.push_str("</div></div>");
452        }
453
454        html.push_str("</body></html>");
455        Ok(html)
456    }
457
458    /// Analyze tokenization results and generate statistics
459    fn analyze_results(
460        &self,
461        results: &HashMap<String, TokenizationResult>,
462        text: &str,
463    ) -> DebugAnalysis {
464        let total_tokenizers = results.len();
465
466        // Find consensus and disagreement tokens
467        let mut token_agreement = HashMap::new();
468        for result in results.values() {
469            for token in &result.tokens {
470                *token_agreement.entry(token.clone()).or_insert(0) += 1;
471            }
472        }
473
474        let consensus_threshold = (total_tokenizers as f64 * 0.7) as usize;
475        let consensus_tokens: Vec<String> = token_agreement
476            .iter()
477            .filter(|(_, &count)| count >= consensus_threshold)
478            .map(|(token, _)| token.clone())
479            .collect();
480
481        let disagreement_tokens: Vec<String> = token_agreement
482            .iter()
483            .filter(|(_, &count)| count < consensus_threshold)
484            .map(|(token, _)| token.clone())
485            .collect();
486
487        // Compression statistics
488        let compression_ratios: Vec<f64> = results.values().map(|r| r.compression_ratio).collect();
489        let compression_stats = self.calculate_compression_stats(&compression_ratios, results);
490
491        // Performance statistics
492        let performance_times: Vec<f64> = results.values().map(|r| r.processing_time_ms).collect();
493        let performance_stats = self.calculate_performance_stats(&performance_times, results);
494
495        // Character analysis
496        let character_analysis = self.analyze_characters(text);
497
498        // Pattern analysis
499        let pattern_analysis = self.analyze_patterns(results);
500
501        DebugAnalysis {
502            total_tokenizers,
503            consensus_tokens,
504            disagreement_tokens,
505            compression_stats,
506            performance_stats,
507            character_analysis,
508            pattern_analysis,
509        }
510    }
511
512    fn calculate_compression_stats(
513        &self,
514        ratios: &[f64],
515        results: &HashMap<String, TokenizationResult>,
516    ) -> CompressionStats {
517        if ratios.is_empty() {
518            return CompressionStats {
519                min_ratio: 0.0,
520                max_ratio: 0.0,
521                avg_ratio: 0.0,
522                std_deviation: 0.0,
523                best_tokenizer: "None".to_string(),
524                worst_tokenizer: "None".to_string(),
525            };
526        }
527
528        let min_ratio = ratios.iter().cloned().fold(f64::INFINITY, f64::min);
529        let max_ratio = ratios.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
530        let avg_ratio = ratios.iter().sum::<f64>() / ratios.len() as f64;
531
532        let variance =
533            ratios.iter().map(|r| (r - avg_ratio).powi(2)).sum::<f64>() / ratios.len() as f64;
534        let std_deviation = variance.sqrt();
535
536        let best_tokenizer = results
537            .iter()
538            .min_by(|a, b| {
539                a.1.compression_ratio
540                    .partial_cmp(&b.1.compression_ratio)
541                    .unwrap_or(std::cmp::Ordering::Equal)
542            })
543            .map(|(name, _)| name.clone())
544            .unwrap_or_else(|| "Unknown".to_string());
545
546        let worst_tokenizer = results
547            .iter()
548            .max_by(|a, b| {
549                a.1.compression_ratio
550                    .partial_cmp(&b.1.compression_ratio)
551                    .unwrap_or(std::cmp::Ordering::Equal)
552            })
553            .map(|(name, _)| name.clone())
554            .unwrap_or_else(|| "Unknown".to_string());
555
556        CompressionStats {
557            min_ratio,
558            max_ratio,
559            avg_ratio,
560            std_deviation,
561            best_tokenizer,
562            worst_tokenizer,
563        }
564    }
565
566    fn calculate_performance_stats(
567        &self,
568        times: &[f64],
569        results: &HashMap<String, TokenizationResult>,
570    ) -> PerformanceStats {
571        if times.is_empty() {
572            return PerformanceStats {
573                min_time_ms: 0.0,
574                max_time_ms: 0.0,
575                avg_time_ms: 0.0,
576                fastest_tokenizer: "None".to_string(),
577                slowest_tokenizer: "None".to_string(),
578            };
579        }
580
581        let min_time_ms = times.iter().cloned().fold(f64::INFINITY, f64::min);
582        let max_time_ms = times.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
583        let avg_time_ms = times.iter().sum::<f64>() / times.len() as f64;
584
585        let fastest_tokenizer = results
586            .iter()
587            .min_by(|a, b| {
588                a.1.processing_time_ms
589                    .partial_cmp(&b.1.processing_time_ms)
590                    .unwrap_or(std::cmp::Ordering::Equal)
591            })
592            .map(|(name, _)| name.clone())
593            .unwrap_or_else(|| "Unknown".to_string());
594
595        let slowest_tokenizer = results
596            .iter()
597            .max_by(|a, b| {
598                a.1.processing_time_ms
599                    .partial_cmp(&b.1.processing_time_ms)
600                    .unwrap_or(std::cmp::Ordering::Equal)
601            })
602            .map(|(name, _)| name.clone())
603            .unwrap_or_else(|| "Unknown".to_string());
604
605        PerformanceStats {
606            min_time_ms,
607            max_time_ms,
608            avg_time_ms,
609            fastest_tokenizer,
610            slowest_tokenizer,
611        }
612    }
613
614    fn analyze_characters(&self, text: &str) -> CharacterAnalysis {
615        let mut character_frequency = HashMap::new();
616        let mut unicode_categories = HashMap::new();
617        let mut problematic_characters = Vec::new();
618
619        for ch in text.chars() {
620            *character_frequency.entry(ch).or_insert(0) += 1;
621
622            // Categorize by Unicode general category
623            let category = match ch {
624                c if c.is_ascii_alphabetic() => "ASCII Letter",
625                c if c.is_ascii_digit() => "ASCII Digit",
626                c if c.is_ascii_punctuation() => "ASCII Punctuation",
627                c if c.is_ascii_whitespace() => "ASCII Whitespace",
628                c if c.is_alphabetic() => "Unicode Letter",
629                c if c.is_numeric() => "Unicode Number",
630                c if c.is_whitespace() => "Unicode Whitespace",
631                _ => "Other",
632            };
633            *unicode_categories.entry(category.to_string()).or_insert(0) += 1;
634
635            // Detect potentially problematic characters
636            if (ch.is_control() || (ch as u32) > 0x10000) && !problematic_characters.contains(&ch) {
637                problematic_characters.push(ch);
638            }
639        }
640
641        CharacterAnalysis {
642            total_characters: text.len(),
643            unique_characters: character_frequency.len(),
644            character_frequency,
645            problematic_characters,
646            unicode_categories,
647        }
648    }
649
650    fn analyze_patterns(&self, results: &HashMap<String, TokenizationResult>) -> PatternAnalysis {
651        let mut all_tokens = Vec::new();
652        for result in results.values() {
653            all_tokens.extend(result.tokens.iter().cloned());
654        }
655
656        // Analyze prefixes and suffixes
657        let mut prefix_counts = HashMap::new();
658        let mut suffix_counts = HashMap::new();
659        let mut length_distribution = HashMap::new();
660
661        for token in &all_tokens {
662            *length_distribution.entry(token.len()).or_insert(0) += 1;
663
664            if token.len() >= 2 {
665                let prefix = &token[..2];
666                let suffix = &token[token.len() - 2..];
667                *prefix_counts.entry(prefix.to_string()).or_insert(0) += 1;
668                *suffix_counts.entry(suffix.to_string()).or_insert(0) += 1;
669            }
670        }
671
672        let mut common_prefixes: Vec<_> = prefix_counts.into_iter().collect();
673        common_prefixes.sort_by_key(|item| std::cmp::Reverse(item.1));
674        common_prefixes.truncate(10);
675
676        let mut common_suffixes: Vec<_> = suffix_counts.into_iter().collect();
677        common_suffixes.sort_by_key(|item| std::cmp::Reverse(item.1));
678        common_suffixes.truncate(10);
679
680        // Simple subword pattern detection
681        let mut subword_patterns = HashMap::new();
682        for token in &all_tokens {
683            if token.starts_with("##") || token.starts_with("▁") || token.ends_with("@@") {
684                *subword_patterns.entry(token.clone()).or_insert(0) += 1;
685            }
686        }
687
688        let mut subword_patterns: Vec<_> = subword_patterns.into_iter().collect();
689        subword_patterns.sort_by_key(|item| std::cmp::Reverse(item.1));
690        subword_patterns.truncate(20);
691
692        PatternAnalysis {
693            common_prefixes,
694            common_suffixes,
695            token_length_distribution: length_distribution,
696            subword_patterns,
697        }
698    }
699
700    fn detect_issues(
701        &self,
702        results: &HashMap<String, TokenizationResult>,
703        analysis: &DebugAnalysis,
704    ) -> Vec<DetectedIssue> {
705        let mut issues = Vec::new();
706
707        // Check for high token count variance
708        let token_counts: Vec<usize> = results.values().map(|r| r.token_count).collect();
709        if let (Some(&min_tokens), Some(&max_tokens)) =
710            (token_counts.iter().min(), token_counts.iter().max())
711        {
712            let variance_ratio = max_tokens as f64 / min_tokens.max(1) as f64;
713            if variance_ratio > 2.0 {
714                issues.push(DetectedIssue {
715                    issue_type: IssueType::HighTokenCountVariance,
716                    severity: IssueSeverity::Medium,
717                    description: format!(
718                        "High variance in token count: {} to {} tokens",
719                        min_tokens, max_tokens
720                    ),
721                    affected_tokenizers: results.keys().cloned().collect(),
722                    suggestions: vec![
723                        "Consider using tokenizers with similar vocabularies".to_string(),
724                        "Check if different tokenizers are appropriate for the same use case"
725                            .to_string(),
726                    ],
727                    examples: vec![],
728                });
729            }
730        }
731
732        // Check for poor compression
733        if analysis.compression_stats.avg_ratio > 0.8 {
734            issues.push(DetectedIssue {
735                issue_type: IssueType::PoorCompression,
736                severity: IssueSeverity::Medium,
737                description: format!(
738                    "Poor compression ratio: {:.3} (higher is worse)",
739                    analysis.compression_stats.avg_ratio
740                ),
741                affected_tokenizers: results.keys().cloned().collect(),
742                suggestions: vec![
743                    "Consider using subword tokenizers (BPE, WordPiece, Unigram)".to_string(),
744                    "Increase vocabulary size if using limited vocabularies".to_string(),
745                    "Check if the text domain matches the tokenizer training data".to_string(),
746                ],
747                examples: vec![],
748            });
749        }
750
751        // Check for slow performance
752        if analysis.performance_stats.avg_time_ms > 100.0 {
753            issues.push(DetectedIssue {
754                issue_type: IssueType::SlowPerformance,
755                severity: IssueSeverity::Low,
756                description: format!(
757                    "Slow tokenization: {:.2}ms average",
758                    analysis.performance_stats.avg_time_ms
759                ),
760                affected_tokenizers: vec![analysis.performance_stats.slowest_tokenizer.clone()],
761                suggestions: vec![
762                    "Consider using faster tokenizers for real-time applications".to_string(),
763                    "Check if vocabulary loading can be optimized".to_string(),
764                    "Consider caching tokenization results".to_string(),
765                ],
766                examples: vec![],
767            });
768        }
769
770        // Check for Unicode issues
771        if !analysis.character_analysis.problematic_characters.is_empty() {
772            issues.push(DetectedIssue {
773                issue_type: IssueType::UnicodeIssues,
774                severity: IssueSeverity::High,
775                description: format!(
776                    "Found {} potentially problematic Unicode characters",
777                    analysis.character_analysis.problematic_characters.len()
778                ),
779                affected_tokenizers: results.keys().cloned().collect(),
780                suggestions: vec![
781                    "Ensure tokenizers properly handle Unicode normalization".to_string(),
782                    "Consider preprocessing to handle control characters".to_string(),
783                    "Verify tokenizer training data included diverse Unicode content".to_string(),
784                ],
785                examples: analysis
786                    .character_analysis
787                    .problematic_characters
788                    .iter()
789                    .take(5)
790                    .map(|c| format!("'{}'", c))
791                    .collect(),
792            });
793        }
794
795        issues
796    }
797
798    fn find_oov_tokens(&self, tokens: &[String], _tokenizer: &TokenizerWrapper) -> Vec<String> {
799        // Simplified OOV detection - would need access to tokenizer vocabulary
800        // For now, detect common patterns that might indicate OOV tokens
801        tokens
802            .iter()
803            .filter(|token| {
804                token.contains("[UNK]") || token.contains("<unk>") || token.contains("�")
805            })
806            .cloned()
807            .collect()
808    }
809
810    fn find_special_tokens(&self, tokens: &[String], _tokenizer: &TokenizerWrapper) -> Vec<String> {
811        // Detect common special token patterns
812        tokens
813            .iter()
814            .filter(|token| {
815                token.starts_with('[') && token.ends_with(']')
816                    || token.starts_with('<') && token.ends_with('>')
817                    || token.starts_with("▁")
818                    || token.starts_with("##")
819            })
820            .cloned()
821            .collect()
822    }
823}
824
/// Escape the five HTML-significant characters (`& < > " '`) in `text`.
///
/// Suitable for interpolating untrusted text into HTML element content or
/// attribute values.
fn html_escape(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#x27;"),
            other => escaped.push(other),
        }
    }
    escaped
}
833
#[cfg(test)]
mod tests {
    use super::*;

    // A fresh debugger starts with no tokenizers and no history.
    #[test]
    fn test_debugger_creation() {
        let debugger = TokenizationDebugger::new();
        assert_eq!(debugger.list_tokenizers().len(), 0);
        assert_eq!(debugger.get_history().len(), 0);
    }

    // Default configuration matches the values set in `Default::default`.
    #[test]
    fn test_config_default() {
        let config = DebuggerConfig::default();
        assert_eq!(config.max_history_size, 100);
        assert!(config.auto_analyze_issues);
        assert!(config.show_character_details);
    }

    // `html_escape` escapes markup characters and double-escapes
    // pre-existing entities.
    #[test]
    fn test_html_escape() {
        assert_eq!(html_escape("<test>"), "&lt;test&gt;");
        assert_eq!(html_escape("&amp;"), "&amp;amp;");
        assert_eq!(html_escape("\"quote\""), "&quot;quote&quot;");
    }
}
859}