use crate::tokenizer::TokenizerWrapper;
use crate::visualization::{TokenVisualizer, VisualizationConfig};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use trustformers_core::errors::{Result, TrustformersError};
use trustformers_core::traits::Tokenizer;

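/// Debugger that runs input text through a set of registered tokenizers and
/// compares their outputs, keeping a bounded history of debug sessions.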
pub struct TokenizationDebugger {
    tokenizers: HashMap<String, TokenizerWrapper>,
    history: Vec<DebugSession>,
    config: DebuggerConfig,
}

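/// Configuration options for [`TokenizationDebugger`].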
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DebuggerConfig {
    /// Maximum number of debug sessions retained in history.
    pub max_history_size: usize,
    /// Automatically run issue detection after each session.
    pub auto_analyze_issues: bool,
    /// Include per-character details in analysis output.
    pub show_character_details: bool,
    /// Enable side-by-side comparison across tokenizers.
    pub enable_comparison: bool,
    /// Maximum accepted input length, in characters.
    pub max_text_length: usize,
}

impl Default for DebuggerConfig {
    fn default() -> Self {
        Self {
            max_history_size: 100,
            auto_analyze_issues: true,
            show_character_details: true,
            enable_comparison: true,
            max_text_length: 10000,
        }
    }
}

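/// A single debugging run: the input text together with per-tokenizer
/// results, aggregate analysis, and any detected issues.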
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DebugSession {
    pub id: String,
    pub input_text: String,
    pub timestamp: u64,
    pub tokenizer_results: HashMap<String, TokenizationResult>,
    pub analysis: DebugAnalysis,
    pub issues: Vec<DetectedIssue>,
}

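/// The outcome of running one tokenizer over the session's input text.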
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TokenizationResult {
    pub tokenizer_name: String,
    pub tokens: Vec<String>,
    pub token_ids: Vec<u32>,
    pub token_count: usize,
    pub character_count: usize,
    pub compression_ratio: f64,
    pub processing_time_ms: f64,
    pub character_offsets: Option<Vec<(usize, usize)>>,
    pub oov_tokens: Vec<String>,
    pub special_tokens: Vec<String>,
}

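/// Aggregate analysis computed across all tokenizer results in a session.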
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DebugAnalysis {
    pub total_tokenizers: usize,
    pub consensus_tokens: Vec<String>,
    pub disagreement_tokens: Vec<String>,
    pub compression_stats: CompressionStats,
    pub performance_stats: PerformanceStats,
    pub character_analysis: CharacterAnalysis,
    pub pattern_analysis: PatternAnalysis,
}

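/// Summary statistics for the tokens-per-character compression ratios.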
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompressionStats {
    pub min_ratio: f64,
    pub max_ratio: f64,
    pub avg_ratio: f64,
    pub std_deviation: f64,
    pub best_tokenizer: String,
    pub worst_tokenizer: String,
}

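/// Summary statistics for per-tokenizer processing times.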
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceStats {
    pub min_time_ms: f64,
    pub max_time_ms: f64,
    pub avg_time_ms: f64,
    pub fastest_tokenizer: String,
    pub slowest_tokenizer: String,
}

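/// Character-level statistics for the input text.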
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CharacterAnalysis {
    pub total_characters: usize,
    pub unique_characters: usize,
    pub character_frequency: HashMap<char, usize>,
    pub problematic_characters: Vec<char>,
    pub unicode_categories: HashMap<String, usize>,
}

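/// Token shape statistics: common prefixes and suffixes, token lengths, and
/// subword markers.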
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PatternAnalysis {
    pub common_prefixes: Vec<(String, usize)>,
    pub common_suffixes: Vec<(String, usize)>,
    pub token_length_distribution: HashMap<usize, usize>,
    pub subword_patterns: Vec<(String, usize)>,
}

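/// Categories of problems the debugger can flag.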
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum IssueType {
    /// Token counts differ widely across tokenizers.
    HighTokenCountVariance,
    /// Many out-of-vocabulary (unknown) tokens.
    HighOOVRate,
    /// High tokens-per-character ratio.
    PoorCompression,
    /// Tokenization is slow on average.
    SlowPerformance,
    /// Tokenizers disagree on how to split the text.
    InconsistentTokenization,
    /// Problematic Unicode content such as control characters.
    UnicodeIssues,
    /// Problems with special-token handling.
    SpecialTokenIssues,
    /// Problems at word or sentence boundaries.
    BoundaryIssues,
}

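/// A problem flagged during automatic analysis, with suggested remedies.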
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DetectedIssue {
    pub issue_type: IssueType,
    pub severity: IssueSeverity,
    pub description: String,
    pub affected_tokenizers: Vec<String>,
    pub suggestions: Vec<String>,
    pub examples: Vec<String>,
}

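/// How serious a detected issue is.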
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum IssueSeverity {
    Low,
    Medium,
    High,
    Critical,
}

impl Default for TokenizationDebugger {
    fn default() -> Self {
        Self::new()
    }
}

impl TokenizationDebugger {
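    /// Creates a debugger with the default configuration and no tokenizers.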
    pub fn new() -> Self {
        Self {
            tokenizers: HashMap::new(),
            history: Vec::new(),
            config: DebuggerConfig::default(),
        }
    }

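    /// Creates a debugger with the given configuration.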
    pub fn with_config(config: DebuggerConfig) -> Self {
        Self {
            tokenizers: HashMap::new(),
            history: Vec::new(),
            config,
        }
    }

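    /// Registers a tokenizer under `name`; it is included in all subsequent
    /// debug sessions.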
    pub fn add_tokenizer(&mut self, name: String, tokenizer: TokenizerWrapper) {
        self.tokenizers.insert(name, tokenizer);
    }

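    /// Removes and returns the tokenizer registered under `name`, if any.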
    pub fn remove_tokenizer(&mut self, name: &str) -> Option<TokenizerWrapper> {
        self.tokenizers.remove(name)
    }

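    /// Returns the names of all registered tokenizers.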
    pub fn list_tokenizers(&self) -> Vec<String> {
        self.tokenizers.keys().cloned().collect()
    }

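    /// Runs `text` through every registered tokenizer and records a
    /// [`DebugSession`] containing per-tokenizer results, aggregate analysis,
    /// and (when `auto_analyze_issues` is enabled) detected issues.
    ///
    /// Fails if `text` is longer than `max_text_length` characters.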
    pub fn debug_text(&mut self, text: &str) -> Result<DebugSession> {
        let char_count = text.chars().count();
        if char_count > self.config.max_text_length {
            return Err(TrustformersError::invalid_input(format!(
                "Text too long: {} characters (max: {})",
                char_count, self.config.max_text_length
            )));
        }

        // Millisecond resolution keeps IDs unique across rapid successive calls.
        let session_id = format!("debug_{}", chrono::Utc::now().timestamp_millis());
        let mut tokenizer_results = HashMap::new();

        for (name, tokenizer) in &self.tokenizers {
            let start_time = std::time::Instant::now();

            match tokenizer.encode(text) {
                Ok(result) => {
                    let processing_time = start_time.elapsed().as_secs_f64() * 1000.0;

                    // Tokens per character; lower means better compression.
                    let compression_ratio = if char_count > 0 {
                        result.input_ids.len() as f64 / char_count as f64
                    } else {
                        0.0
                    };

                    let tokens: Vec<String> = result
                        .input_ids
                        .iter()
                        .filter_map(|&id| tokenizer.id_to_token(id))
                        .collect();
                    let oov_tokens = self.find_oov_tokens(&tokens, tokenizer);
                    let special_tokens = self.find_special_tokens(&tokens, tokenizer);

                    let tokenization_result = TokenizationResult {
                        tokenizer_name: name.clone(),
                        tokens,
                        token_ids: result.input_ids.clone(),
                        token_count: result.input_ids.len(),
                        character_count: char_count,
                        compression_ratio,
                        processing_time_ms: processing_time,
                        character_offsets: None,
                        oov_tokens,
                        special_tokens,
                    };

                    tokenizer_results.insert(name.clone(), tokenization_result);
                },
                Err(e) => {
                    // Record the failure as a result so the session still
                    // covers every registered tokenizer.
                    let tokenization_result = TokenizationResult {
                        tokenizer_name: name.clone(),
                        tokens: vec![format!("ERROR: {}", e)],
                        token_ids: vec![],
                        token_count: 0,
                        character_count: char_count,
                        compression_ratio: 0.0,
                        processing_time_ms: start_time.elapsed().as_secs_f64() * 1000.0,
                        character_offsets: None,
                        oov_tokens: vec![],
                        special_tokens: vec![],
                    };

                    tokenizer_results.insert(name.clone(), tokenization_result);
                },
            }
        }

        let analysis = self.analyze_results(&tokenizer_results, text);
        let issues = if self.config.auto_analyze_issues {
            self.detect_issues(&tokenizer_results, &analysis)
        } else {
            vec![]
        };

        let session = DebugSession {
            id: session_id,
            input_text: text.to_string(),
            timestamp: chrono::Utc::now().timestamp() as u64,
            tokenizer_results,
            analysis,
            issues,
        };

        self.history.push(session.clone());
        if self.history.len() > self.config.max_history_size {
            self.history.remove(0);
        }

        Ok(session)
    }

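    /// Debugs each input in turn, producing one [`DebugSession`] per text.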
    pub fn compare_texts(&mut self, texts: &[String]) -> Result<Vec<DebugSession>> {
        let mut sessions = Vec::new();

        for text in texts {
            let session = self.debug_text(text)?;
            sessions.push(session);
        }

        Ok(sessions)
    }

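    /// Returns all retained debug sessions, oldest first.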
    pub fn get_history(&self) -> &[DebugSession] {
        &self.history
    }

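    /// Looks up a retained session by its ID.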
    pub fn get_session(&self, session_id: &str) -> Option<&DebugSession> {
        self.history.iter().find(|s| s.id == session_id)
    }

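    /// Renders a Markdown report summarizing the given session's results,
    /// analysis, and detected issues.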
    pub fn generate_report(&self, session_id: &str) -> Result<String> {
        let session = self.get_session(session_id).ok_or_else(|| {
            TrustformersError::invalid_input(format!("Session not found: {}", session_id))
        })?;

        let mut report = String::new();

        report.push_str("# Tokenization Debug Report\n");
        report.push_str(&format!("Session ID: {}\n", session.id));
        report.push_str(&format!("Timestamp: {}\n", session.timestamp));
        report.push_str(&format!(
            "Input Length: {} characters\n\n",
            session.input_text.chars().count()
        ));

        report.push_str("## Input Text\n");
        report.push_str(&format!("```\n{}\n```\n\n", session.input_text));

        report.push_str("## Tokenizer Results\n");
        for (name, result) in &session.tokenizer_results {
            report.push_str(&format!("### {}\n", name));
            report.push_str(&format!("- Tokens: {}\n", result.token_count));
            report.push_str(&format!("- Compression: {:.3}\n", result.compression_ratio));
            report.push_str(&format!("- Time: {:.2}ms\n", result.processing_time_ms));
            report.push_str(&format!("- OOV Tokens: {}\n", result.oov_tokens.len()));
            report.push_str(&format!(
                "- Special Tokens: {}\n",
                result.special_tokens.len()
            ));
            report.push('\n');
        }

        report.push_str("## Analysis\n");
        let analysis = &session.analysis;
        report.push_str(&format!(
            "- Total Tokenizers: {}\n",
            analysis.total_tokenizers
        ));
        report.push_str(&format!(
            "- Consensus Tokens: {}\n",
            analysis.consensus_tokens.len()
        ));
        report.push_str(&format!(
            "- Disagreement Tokens: {}\n",
            analysis.disagreement_tokens.len()
        ));
        report.push_str(&format!(
            "- Best Compression: {} ({:.3})\n",
            analysis.compression_stats.best_tokenizer, analysis.compression_stats.min_ratio
        ));
        report.push_str(&format!(
            "- Fastest: {} ({:.2}ms)\n",
            analysis.performance_stats.fastest_tokenizer, analysis.performance_stats.min_time_ms
        ));
        report.push('\n');

        if !session.issues.is_empty() {
            report.push_str("## Detected Issues\n");
            for (i, issue) in session.issues.iter().enumerate() {
                report.push_str(&format!("### Issue {}: {:?}\n", i + 1, issue.issue_type));
                report.push_str(&format!("**Severity:** {:?}\n", issue.severity));
                report.push_str(&format!("**Description:** {}\n", issue.description));
                report.push_str(&format!(
                    "**Affected Tokenizers:** {}\n",
                    issue.affected_tokenizers.join(", ")
                ));
                report.push_str("**Suggestions:**\n");
                for suggestion in &issue.suggestions {
                    report.push_str(&format!("- {}\n", suggestion));
                }
                report.push('\n');
            }
        }

        Ok(report)
    }

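    /// Renders a self-contained HTML page visualizing each tokenizer's token
    /// sequence for the given session.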
    pub fn generate_html_visualization(&self, session_id: &str) -> Result<String> {
        let session = self.get_session(session_id).ok_or_else(|| {
            TrustformersError::invalid_input(format!("Session not found: {}", session_id))
        })?;

        // Constructed but not yet referenced by the HTML generation below.
        let config = VisualizationConfig::default();
        let _visualizer = TokenVisualizer::new(config);

        let mut html = String::new();
        html.push_str("<!DOCTYPE html><html><head><title>Tokenization Debug</title>");
        html.push_str("<style>body{font-family:Arial,sans-serif;margin:20px;}");
        html.push_str(".tokenizer{margin-bottom:30px;border:1px solid #ccc;padding:15px;}");
        html.push_str(".token{display:inline-block;margin:2px;padding:4px 8px;border:1px solid #999;background:#f0f0f0;}");
        html.push_str("</style></head><body>");

        html.push_str("<h1>Tokenization Debug Report</h1>");
        html.push_str(&format!(
            "<p><strong>Input:</strong> {}</p>",
            html_escape(&session.input_text)
        ));

        for (name, result) in &session.tokenizer_results {
            html.push_str(&format!(
                "<div class='tokenizer'><h2>{}</h2>",
                html_escape(name)
            ));
            html.push_str(&format!(
                "<p>Tokens: {} | Compression: {:.3} | Time: {:.2}ms</p>",
                result.token_count, result.compression_ratio, result.processing_time_ms
            ));

            html.push_str("<div>");
            for token in &result.tokens {
                html.push_str(&format!(
                    "<span class='token'>{}</span>",
                    html_escape(token)
                ));
            }
            html.push_str("</div></div>");
        }

        html.push_str("</body></html>");
        Ok(html)
    }

    fn analyze_results(
        &self,
        results: &HashMap<String, TokenizationResult>,
        text: &str,
    ) -> DebugAnalysis {
        let total_tokenizers = results.len();

        // Count, for each token, how many tokenizers produced it; duplicates
        // within a single tokenizer are counted once so the counts stay
        // comparable to `total_tokenizers`.
        let mut token_agreement: HashMap<String, usize> = HashMap::new();
        for result in results.values() {
            let unique: std::collections::HashSet<&String> = result.tokens.iter().collect();
            for token in unique {
                *token_agreement.entry(token.clone()).or_insert(0) += 1;
            }
        }

        // A token is "consensus" if at least 70% of tokenizers produced it.
        let consensus_threshold = (total_tokenizers as f64 * 0.7) as usize;
        let consensus_tokens: Vec<String> = token_agreement
            .iter()
            .filter(|(_, &count)| count >= consensus_threshold)
            .map(|(token, _)| token.clone())
            .collect();

        let disagreement_tokens: Vec<String> = token_agreement
            .iter()
            .filter(|(_, &count)| count < consensus_threshold)
            .map(|(token, _)| token.clone())
            .collect();

        let compression_ratios: Vec<f64> = results.values().map(|r| r.compression_ratio).collect();
        let compression_stats = self.calculate_compression_stats(&compression_ratios, results);

        let performance_times: Vec<f64> = results.values().map(|r| r.processing_time_ms).collect();
        let performance_stats = self.calculate_performance_stats(&performance_times, results);

        let character_analysis = self.analyze_characters(text);

        let pattern_analysis = self.analyze_patterns(results);

        DebugAnalysis {
            total_tokenizers,
            consensus_tokens,
            disagreement_tokens,
            compression_stats,
            performance_stats,
            character_analysis,
            pattern_analysis,
        }
    }

    fn calculate_compression_stats(
        &self,
        ratios: &[f64],
        results: &HashMap<String, TokenizationResult>,
    ) -> CompressionStats {
        if ratios.is_empty() {
            return CompressionStats {
                min_ratio: 0.0,
                max_ratio: 0.0,
                avg_ratio: 0.0,
                std_deviation: 0.0,
                best_tokenizer: "None".to_string(),
                worst_tokenizer: "None".to_string(),
            };
        }

        let min_ratio = ratios.iter().cloned().fold(f64::INFINITY, f64::min);
        let max_ratio = ratios.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
        let avg_ratio = ratios.iter().sum::<f64>() / ratios.len() as f64;

        let variance =
            ratios.iter().map(|r| (r - avg_ratio).powi(2)).sum::<f64>() / ratios.len() as f64;
        let std_deviation = variance.sqrt();

        let best_tokenizer = results
            .iter()
            .min_by(|a, b| {
                a.1.compression_ratio
                    .partial_cmp(&b.1.compression_ratio)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
            .map(|(name, _)| name.clone())
            .unwrap_or_else(|| "Unknown".to_string());

        let worst_tokenizer = results
            .iter()
            .max_by(|a, b| {
                a.1.compression_ratio
                    .partial_cmp(&b.1.compression_ratio)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
            .map(|(name, _)| name.clone())
            .unwrap_or_else(|| "Unknown".to_string());

        CompressionStats {
            min_ratio,
            max_ratio,
            avg_ratio,
            std_deviation,
            best_tokenizer,
            worst_tokenizer,
        }
    }

    fn calculate_performance_stats(
        &self,
        times: &[f64],
        results: &HashMap<String, TokenizationResult>,
    ) -> PerformanceStats {
        if times.is_empty() {
            return PerformanceStats {
                min_time_ms: 0.0,
                max_time_ms: 0.0,
                avg_time_ms: 0.0,
                fastest_tokenizer: "None".to_string(),
                slowest_tokenizer: "None".to_string(),
            };
        }

        let min_time_ms = times.iter().cloned().fold(f64::INFINITY, f64::min);
        let max_time_ms = times.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
        let avg_time_ms = times.iter().sum::<f64>() / times.len() as f64;

        let fastest_tokenizer = results
            .iter()
            .min_by(|a, b| {
                a.1.processing_time_ms
                    .partial_cmp(&b.1.processing_time_ms)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
            .map(|(name, _)| name.clone())
            .unwrap_or_else(|| "Unknown".to_string());

        let slowest_tokenizer = results
            .iter()
            .max_by(|a, b| {
                a.1.processing_time_ms
                    .partial_cmp(&b.1.processing_time_ms)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
            .map(|(name, _)| name.clone())
            .unwrap_or_else(|| "Unknown".to_string());

        PerformanceStats {
            min_time_ms,
            max_time_ms,
            avg_time_ms,
            fastest_tokenizer,
            slowest_tokenizer,
        }
    }

    fn analyze_characters(&self, text: &str) -> CharacterAnalysis {
        let mut character_frequency = HashMap::new();
        let mut unicode_categories = HashMap::new();
        let mut problematic_characters = Vec::new();

        for ch in text.chars() {
            *character_frequency.entry(ch).or_insert(0) += 1;

            let category = match ch {
                c if c.is_ascii_alphabetic() => "ASCII Letter",
                c if c.is_ascii_digit() => "ASCII Digit",
                c if c.is_ascii_punctuation() => "ASCII Punctuation",
                c if c.is_ascii_whitespace() => "ASCII Whitespace",
                c if c.is_alphabetic() => "Unicode Letter",
                c if c.is_numeric() => "Unicode Number",
                c if c.is_whitespace() => "Unicode Whitespace",
                _ => "Other",
            };
            *unicode_categories.entry(category.to_string()).or_insert(0) += 1;

            // Flag control characters and anything outside the Basic
            // Multilingual Plane (above U+FFFF) as potentially problematic.
            if (ch.is_control() || (ch as u32) > 0xFFFF) && !problematic_characters.contains(&ch) {
                problematic_characters.push(ch);
            }
        }

        CharacterAnalysis {
            total_characters: text.chars().count(),
            unique_characters: character_frequency.len(),
            character_frequency,
            problematic_characters,
            unicode_categories,
        }
    }

    fn analyze_patterns(&self, results: &HashMap<String, TokenizationResult>) -> PatternAnalysis {
        let mut all_tokens = Vec::new();
        for result in results.values() {
            all_tokens.extend(result.tokens.iter().cloned());
        }

        let mut prefix_counts = HashMap::new();
        let mut suffix_counts = HashMap::new();
        let mut length_distribution = HashMap::new();

        for token in &all_tokens {
            *length_distribution.entry(token.chars().count()).or_insert(0) += 1;

            // Checked slicing avoids panics when a two-byte cut would fall
            // inside a multi-byte UTF-8 character.
            if token.len() >= 2 {
                if let (Some(prefix), Some(suffix)) =
                    (token.get(..2), token.get(token.len() - 2..))
                {
                    *prefix_counts.entry(prefix.to_string()).or_insert(0) += 1;
                    *suffix_counts.entry(suffix.to_string()).or_insert(0) += 1;
                }
            }
        }

        let mut common_prefixes: Vec<_> = prefix_counts.into_iter().collect();
        common_prefixes.sort_by_key(|item| std::cmp::Reverse(item.1));
        common_prefixes.truncate(10);

        let mut common_suffixes: Vec<_> = suffix_counts.into_iter().collect();
        common_suffixes.sort_by_key(|item| std::cmp::Reverse(item.1));
        common_suffixes.truncate(10);

        // Collect markers used by common subword schemes: leading "##"
        // (WordPiece), leading "▁" (SentencePiece), trailing "@@" (BPE).
        let mut subword_patterns = HashMap::new();
        for token in &all_tokens {
            if token.starts_with("##") || token.starts_with("▁") || token.ends_with("@@") {
                *subword_patterns.entry(token.clone()).or_insert(0) += 1;
            }
        }

        let mut subword_patterns: Vec<_> = subword_patterns.into_iter().collect();
        subword_patterns.sort_by_key(|item| std::cmp::Reverse(item.1));
        subword_patterns.truncate(20);

        PatternAnalysis {
            common_prefixes,
            common_suffixes,
            token_length_distribution: length_distribution,
            subword_patterns,
        }
    }

    fn detect_issues(
        &self,
        results: &HashMap<String, TokenizationResult>,
        analysis: &DebugAnalysis,
    ) -> Vec<DetectedIssue> {
        let mut issues = Vec::new();

        // Flag token counts that vary by more than 2x across tokenizers.
        let token_counts: Vec<usize> = results.values().map(|r| r.token_count).collect();
        if let (Some(&min_tokens), Some(&max_tokens)) =
            (token_counts.iter().min(), token_counts.iter().max())
        {
            let variance_ratio = max_tokens as f64 / min_tokens.max(1) as f64;
            if variance_ratio > 2.0 {
                issues.push(DetectedIssue {
                    issue_type: IssueType::HighTokenCountVariance,
                    severity: IssueSeverity::Medium,
                    description: format!(
                        "High variance in token count: {} to {} tokens",
                        min_tokens, max_tokens
                    ),
                    affected_tokenizers: results.keys().cloned().collect(),
                    suggestions: vec![
                        "Consider using tokenizers with similar vocabularies".to_string(),
                        "Check if different tokenizers are appropriate for the same use case"
                            .to_string(),
                    ],
                    examples: vec![],
                });
            }
        }

        // Flag a high average tokens-per-character ratio.
        if analysis.compression_stats.avg_ratio > 0.8 {
            issues.push(DetectedIssue {
                issue_type: IssueType::PoorCompression,
                severity: IssueSeverity::Medium,
                description: format!(
                    "Poor compression ratio: {:.3} (higher is worse)",
                    analysis.compression_stats.avg_ratio
                ),
                affected_tokenizers: results.keys().cloned().collect(),
                suggestions: vec![
                    "Consider using subword tokenizers (BPE, WordPiece, Unigram)".to_string(),
                    "Increase vocabulary size if using limited vocabularies".to_string(),
                    "Check if the text domain matches the tokenizer training data".to_string(),
                ],
                examples: vec![],
            });
        }

        // Flag slow average tokenization times.
        if analysis.performance_stats.avg_time_ms > 100.0 {
            issues.push(DetectedIssue {
                issue_type: IssueType::SlowPerformance,
                severity: IssueSeverity::Low,
                description: format!(
                    "Slow tokenization: {:.2}ms average",
                    analysis.performance_stats.avg_time_ms
                ),
                affected_tokenizers: vec![analysis.performance_stats.slowest_tokenizer.clone()],
                suggestions: vec![
                    "Consider using faster tokenizers for real-time applications".to_string(),
                    "Check if vocabulary loading can be optimized".to_string(),
                    "Consider caching tokenization results".to_string(),
                ],
                examples: vec![],
            });
        }

        // Flag problematic Unicode content found during character analysis.
        if !analysis.character_analysis.problematic_characters.is_empty() {
            issues.push(DetectedIssue {
                issue_type: IssueType::UnicodeIssues,
                severity: IssueSeverity::High,
                description: format!(
                    "Found {} potentially problematic Unicode characters",
                    analysis.character_analysis.problematic_characters.len()
                ),
                affected_tokenizers: results.keys().cloned().collect(),
                suggestions: vec![
                    "Ensure tokenizers properly handle Unicode normalization".to_string(),
                    "Consider preprocessing to handle control characters".to_string(),
                    "Verify tokenizer training data included diverse Unicode content".to_string(),
                ],
                examples: analysis
                    .character_analysis
                    .problematic_characters
                    .iter()
                    .take(5)
                    .map(|c| format!("'{}'", c))
                    .collect(),
            });
        }

        issues
    }

    fn find_oov_tokens(&self, tokens: &[String], _tokenizer: &TokenizerWrapper) -> Vec<String> {
        // Heuristic: look for common unknown-token markers in the output.
        tokens
            .iter()
            .filter(|token| {
                token.contains("[UNK]") || token.contains("<unk>") || token.contains('�')
            })
            .cloned()
            .collect()
    }

    fn find_special_tokens(&self, tokens: &[String], _tokenizer: &TokenizerWrapper) -> Vec<String> {
        // Heuristic: bracketed tokens ([CLS], <s>, ...) and subword markers.
        tokens
            .iter()
            .filter(|token| {
                token.starts_with('[') && token.ends_with(']')
                    || token.starts_with('<') && token.ends_with('>')
                    || token.starts_with("▁")
                    || token.starts_with("##")
            })
            .cloned()
            .collect()
    }
}

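/// Escapes the characters with special meaning in HTML so arbitrary token
/// text can be embedded safely in the generated page.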
fn html_escape(text: &str) -> String {
    text.replace('&', "&amp;")
        .replace('<', "&lt;")
        .replace('>', "&gt;")
        .replace('"', "&quot;")
        .replace('\'', "&#39;")
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_debugger_creation() {
        let debugger = TokenizationDebugger::new();
        assert_eq!(debugger.list_tokenizers().len(), 0);
        assert_eq!(debugger.get_history().len(), 0);
    }

    #[test]
    fn test_config_default() {
        let config = DebuggerConfig::default();
        assert_eq!(config.max_history_size, 100);
        assert!(config.auto_analyze_issues);
        assert!(config.show_character_details);
    }

    #[test]
    fn test_html_escape() {
        assert_eq!(html_escape("<test>"), "&lt;test&gt;");
        assert_eq!(html_escape("&"), "&amp;");
        assert_eq!(html_escape("\"quote\""), "&quot;quote&quot;");
    }
}