//! Tokenizer coverage analysis and reporting.
//!
//! [`CoverageAnalyzer`] accumulates token and character statistics from sample
//! inputs; [`CoverageReportExporter`] serializes the resulting
//! [`CoverageReport`] to JSON, YAML, HTML, Markdown, or CSV.

use crate::{TokenizedInput, Tokenizer};
use anyhow::{anyhow, Result};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::time::{Duration, Instant};

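/// Configuration controlling what a coverage analysis collects and reports.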
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CoverageConfig {
    /// Minimum frequency for a token to appear in least-frequent listings.
    pub min_token_frequency: usize,
    /// Maximum number of per-input examples to retain in the report.
    pub max_examples: usize,
    /// Whether to include per-token analysis.
    pub include_token_analysis: bool,
    /// Whether to include vocabulary statistics.
    pub include_vocab_stats: bool,
    /// Whether to include performance metrics.
    pub include_performance_metrics: bool,
    /// Whether to include quality metrics.
    pub include_quality_metrics: bool,
    /// Output format for exported reports.
    pub output_format: ReportFormat,
    /// Thresholds that trigger coverage warnings.
    pub coverage_thresholds: CoverageThresholds,
}

impl Default for CoverageConfig {
    fn default() -> Self {
        Self {
            min_token_frequency: 1,
            max_examples: 100,
            include_token_analysis: true,
            include_vocab_stats: true,
            include_performance_metrics: true,
            include_quality_metrics: true,
            output_format: ReportFormat::Json,
            coverage_thresholds: CoverageThresholds::default(),
        }
    }
}

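/// Threshold values that trigger [`CoverageWarning`]s. Coverage and OOV
/// thresholds are fractions in `[0.0, 1.0]`; token lengths are in characters.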
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CoverageThresholds {
    /// Minimum acceptable vocabulary coverage, as a fraction.
    pub min_vocab_coverage: f64,
    /// Minimum acceptable character coverage, as a fraction.
    pub min_char_coverage: f64,
    /// Maximum acceptable out-of-vocabulary rate, as a fraction.
    pub max_oov_rate: f64,
    /// Minimum acceptable average token length, in characters.
    pub min_avg_token_length: f64,
    /// Maximum acceptable average token length, in characters.
    pub max_avg_token_length: f64,
    /// Minimum acceptable throughput, in tokens per second.
    pub min_tokens_per_second: f64,
}

impl Default for CoverageThresholds {
    fn default() -> Self {
        Self {
            min_vocab_coverage: 0.95,
            min_char_coverage: 0.99,
            max_oov_rate: 0.05,
            min_avg_token_length: 2.0,
            max_avg_token_length: 10.0,
            min_tokens_per_second: 1000.0,
        }
    }
}

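/// Supported serialization formats for coverage reports.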
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ReportFormat {
    Json,
    Html,
    Markdown,
    Csv,
    Yaml,
}

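/// Complete coverage report assembled by [`CoverageAnalyzer::generate_report`].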
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CoverageReport {
    pub metadata: ReportMetadata,
    pub vocabulary_coverage: VocabularyCoverage,
    pub character_coverage: CharacterCoverage,
    pub token_distribution: TokenDistribution,
    pub performance_metrics: PerformanceMetrics,
    pub quality_metrics: QualityMetrics,
    pub warnings: Vec<CoverageWarning>,
    pub examples: Vec<CoverageExample>,
}

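/// Metadata describing how and when a report was generated.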
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReportMetadata {
    pub timestamp: String,
    pub tokenizer_name: String,
    pub sample_count: usize,
    pub processing_time: Duration,
    pub config: CoverageConfig,
}

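/// Vocabulary usage statistics gathered from the analyzed samples.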
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VocabularyCoverage {
    pub total_vocab_size: usize,
    pub used_tokens: usize,
    pub coverage_percentage: f64,
    pub most_frequent_tokens: Vec<(String, usize)>,
    pub least_frequent_tokens: Vec<(String, usize)>,
    pub unused_tokens: Vec<String>,
    pub frequency_distribution: HashMap<usize, usize>,
}

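/// Character-level coverage statistics.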
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CharacterCoverage {
    pub total_characters: usize,
    pub covered_characters: usize,
    pub coverage_percentage: f64,
    pub uncovered_characters: Vec<char>,
    pub character_frequencies: HashMap<char, usize>,
    pub unicode_categories: HashMap<String, usize>,
}

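/// Token length, compression, and pattern statistics.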
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TokenDistribution {
    pub avg_tokens_per_input: f64,
    pub token_length_distribution: HashMap<usize, usize>,
    pub avg_token_length: f64,
    pub compression_ratio: f64,
    pub oov_rate: f64,
    pub common_patterns: Vec<(String, usize)>,
}

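/// Tokenization throughput and latency statistics.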
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceMetrics {
    pub total_time: Duration,
    pub avg_time_per_input: Duration,
    pub tokens_per_second: f64,
    pub characters_per_second: f64,
    pub memory_usage: MemoryUsageStats,
    pub throughput_percentiles: HashMap<String, f64>,
}

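/// Estimated memory usage. Values are heuristic approximations, not
/// measurements.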
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryUsageStats {
    pub peak_memory: usize,
    pub avg_memory: usize,
    pub memory_per_token: f64,
    pub vocab_memory: usize,
}

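/// Heuristic quality scores, each normalized to `[0.0, 1.0]`.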
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityMetrics {
    pub vocab_efficiency: f64,
    pub consistency_score: f64,
    pub information_density: f64,
    pub subword_quality: f64,
    pub language_coverage: f64,
    pub overall_score: f64,
}

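/// A warning raised when a metric violates its configured threshold.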
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CoverageWarning {
    pub warning_type: WarningType,
    pub severity: WarningSeverity,
    pub message: String,
    pub recommendation: String,
    pub examples: Vec<String>,
}

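/// Categories of coverage warnings.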
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum WarningType {
    LowVocabCoverage,
    LowCharCoverage,
    HighOOVRate,
    PoorPerformance,
    MemoryIssue,
    InconsistentTokenization,
    VocabularyWaste,
    QualityIssue,
}

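/// Severity levels for coverage warnings.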
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum WarningSeverity {
    Info,
    Warning,
    Error,
    Critical,
}

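/// A tokenized sample input retained for inspection in the report.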
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CoverageExample {
    pub input: String,
    pub tokens: Vec<String>,
    pub token_ids: Vec<u32>,
    pub alignment: Vec<(usize, usize)>,
    pub issues: Vec<String>,
    pub quality_score: f64,
}

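/// Streaming coverage analyzer: feed it sample inputs, then ask for a
/// [`CoverageReport`].
///
/// Example (a sketch; assumes some [`Tokenizer`] implementation such as the
/// crate's `CharTokenizer`):
///
/// ```ignore
/// let mut analyzer = CoverageAnalyzer::from_tokenizer(tokenizer);
/// analyzer.analyze_input("hello world")?;
/// let report = analyzer.generate_report()?;
/// let json = CoverageReportExporter::export_to_string(&report, &ReportFormat::Json)?;
/// ```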
pub struct CoverageAnalyzer<T: Tokenizer> {
    tokenizer: T,
    config: CoverageConfig,
    token_frequencies: HashMap<String, usize>,
    character_frequencies: HashMap<char, usize>,
    processing_times: Vec<Duration>,
    examples: Vec<CoverageExample>,
}

impl<T: Tokenizer> CoverageAnalyzer<T> {
    /// Creates an analyzer with the given configuration.
    pub fn new(tokenizer: T, config: CoverageConfig) -> Self {
        Self {
            tokenizer,
            config,
            token_frequencies: HashMap::new(),
            character_frequencies: HashMap::new(),
            processing_times: Vec::new(),
            examples: Vec::new(),
        }
    }

    /// Creates an analyzer with [`CoverageConfig::default`].
    pub fn from_tokenizer(tokenizer: T) -> Self {
        Self::new(tokenizer, CoverageConfig::default())
    }

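    /// Tokenizes `text`, recording timing and token/character frequencies,
    /// and retains the input as a detailed example while capacity remains.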
    pub fn analyze_input(&mut self, text: &str) -> Result<()> {
        let start_time = Instant::now();

        let tokenized = self.tokenizer.encode(text)?;
        let processing_time = start_time.elapsed();
        self.processing_times.push(processing_time);

        // Round-trip decode to surface decoder errors early; the output itself
        // is not needed for the statistics below.
        let _decoded = self.tokenizer.decode(&tokenized.input_ids)?;
        let token_strings = self.extract_token_strings(&tokenized, text)?;

        // Update token frequency counts.
        for token in &token_strings {
            *self.token_frequencies.entry(token.clone()).or_insert(0) += 1;
        }

        // Update character frequency counts.
        for ch in text.chars() {
            *self.character_frequencies.entry(ch).or_insert(0) += 1;
        }

        // Retain a bounded number of detailed examples for the report.
        if self.examples.len() < self.config.max_examples {
            let example = self.create_coverage_example(text, &tokenized, &token_strings)?;
            self.examples.push(example);
        }

        Ok(())
    }

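    /// Runs [`Self::analyze_input`] over every text in the batch.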
    pub fn analyze_batch(&mut self, texts: &[String]) -> Result<()> {
        for text in texts {
            self.analyze_input(text)?;
        }
        Ok(())
    }

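    /// Builds a [`CoverageReport`] from all statistics gathered so far.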
    pub fn generate_report(&self) -> Result<CoverageReport> {
        let _start_time = Instant::now();

        let metadata = self.create_report_metadata();
        let vocabulary_coverage = self.analyze_vocabulary_coverage()?;
        let character_coverage = self.analyze_character_coverage();
        let token_distribution = self.analyze_token_distribution();
        let performance_metrics = self.analyze_performance_metrics();
        let quality_metrics = self.calculate_quality_metrics(
            &vocabulary_coverage,
            &character_coverage,
            &token_distribution,
        );
        let warnings = self.generate_warnings(
            &vocabulary_coverage,
            &character_coverage,
            &token_distribution,
            &performance_metrics,
        );

        let report = CoverageReport {
            metadata,
            vocabulary_coverage,
            character_coverage,
            token_distribution,
            performance_metrics,
            quality_metrics,
            warnings,
            examples: self.examples.clone(),
        };

        Ok(report)
    }

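    /// Recovers one string per token id by decoding ids individually; ids
    /// that fail to decode become `<id>` placeholders, which downstream
    /// metrics treat as out-of-vocabulary tokens.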
    fn extract_token_strings(
        &self,
        tokenized: &TokenizedInput,
        original_text: &str,
    ) -> Result<Vec<String>> {
        let mut token_strings = Vec::new();

        // Decode each token id on its own; fall back to a placeholder for ids
        // the decoder cannot handle.
        for &token_id in &tokenized.input_ids {
            if let Ok(token_str) = self.tokenizer.decode(&[token_id]) {
                token_strings.push(token_str);
            } else {
                token_strings.push(format!("<{}>", token_id));
            }
        }

        // Defensive fallback: if the counts ever diverge, approximate token
        // strings by slicing the input into equally sized spans.
        if token_strings.len() != tokenized.input_ids.len() {
            let chars_per_token = original_text.len() as f64 / tokenized.input_ids.len() as f64;
            token_strings.clear();

            for i in 0..tokenized.input_ids.len() {
                let start = (i as f64 * chars_per_token) as usize;
                let end =
                    ((i + 1) as f64 * chars_per_token).min(original_text.len() as f64) as usize;

                // `get` avoids a panic when the estimated byte offsets do not
                // fall on UTF-8 character boundaries.
                if let Some(token) = original_text.get(start..end) {
                    token_strings.push(token.to_string());
                }
            }
        }

        Ok(token_strings)
    }

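    /// Builds a per-input example with an approximate character alignment
    /// and a list of detected tokenization issues.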
    fn create_coverage_example(
        &self,
        text: &str,
        tokenized: &TokenizedInput,
        tokens: &[String],
    ) -> Result<CoverageExample> {
        // Approximate alignment: assumes tokens cover the input contiguously
        // in character order.
        let mut alignment = Vec::new();
        let mut char_pos = 0;

        for token in tokens.iter() {
            let start_char = char_pos;
            char_pos += token.chars().count();
            alignment.push((start_char, char_pos.min(text.chars().count())));
        }

        // Flag suspicious tokens; lengths are measured in characters so
        // multi-byte UTF-8 tokens are handled correctly.
        let mut issues = Vec::new();

        for token in tokens {
            let token_chars = token.chars().count();
            if token_chars == 1 && token.chars().all(|c| c.is_alphabetic()) {
                issues.push(format!("Single character token: '{}'", token));
            } else if token_chars > 20 {
                issues.push(format!("Very long token: '{}'", token));
            }
        }

        if tokens.iter().any(|t| t.starts_with('<') && t.ends_with('>')) {
            issues.push("Contains unknown tokens".to_string());
        }

        let quality_score = self.calculate_example_quality_score(text, tokens);

        Ok(CoverageExample {
            input: text.to_string(),
            tokens: tokens.to_vec(),
            token_ids: tokenized.input_ids.clone(),
            alignment,
            issues,
            quality_score,
        })
    }

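    /// Heuristic quality score in `[0.0, 1.0]`: penalizes over- and
    /// under-fragmentation, atypical average token lengths, and
    /// unknown-token placeholders.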
    fn calculate_example_quality_score(&self, text: &str, tokens: &[String]) -> f64 {
        let mut score = 1.0;

        let char_count = text.chars().count();
        let token_count = tokens.len();
        if char_count == 0 || token_count == 0 {
            // Avoid division by zero on empty inputs or empty tokenizations.
            return 0.0;
        }
        let tokens_per_char = token_count as f64 / char_count as f64;

        // Penalize over-fragmentation (close to one token per character) and
        // suspiciously coarse tokenization.
        if tokens_per_char > 0.8 {
            score *= 0.7;
        } else if tokens_per_char < 0.1 {
            score *= 0.8;
        }

        // Penalize averages outside a typical subword length range.
        let avg_token_length =
            tokens.iter().map(|t| t.len()).sum::<usize>() as f64 / tokens.len() as f64;
        if !(2.0..=15.0).contains(&avg_token_length) {
            score *= 0.9;
        }

        // Each unknown-token placeholder halves the score.
        let unknown_tokens =
            tokens.iter().filter(|t| t.starts_with('<') && t.ends_with('>')).count();
        if unknown_tokens > 0 {
            score *= 0.5_f64.powi(unknown_tokens as i32);
        }

        score
    }

    fn create_report_metadata(&self) -> ReportMetadata {
        ReportMetadata {
            timestamp: chrono::Utc::now().to_rfc3339(),
            tokenizer_name: "TrustformeRS Tokenizer".to_string(),
            sample_count: self.processing_times.len(),
            processing_time: self.processing_times.iter().sum(),
            config: self.config.clone(),
        }
    }

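    /// Summarizes vocabulary usage. Without access to the tokenizer's full
    /// vocabulary, the total size (and hence the coverage percentage) is a
    /// rough estimate.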
    fn analyze_vocabulary_coverage(&self) -> Result<VocabularyCoverage> {
        let used_tokens = self.token_frequencies.len();
        // Rough estimate: the total vocabulary size is approximated as twice
        // the number of observed tokens.
        let total_vocab_size = used_tokens * 2;
        let coverage_percentage = if total_vocab_size > 0 {
            used_tokens as f64 / total_vocab_size as f64 * 100.0
        } else {
            0.0
        };

        let mut sorted_tokens: Vec<_> = self
            .token_frequencies
            .iter()
            .map(|(token, &freq)| (token.clone(), freq))
            .collect();
        sorted_tokens.sort_by_key(|item| std::cmp::Reverse(item.1));

        let most_frequent_tokens = sorted_tokens.iter().take(20).cloned().collect();

        let least_frequent_tokens = sorted_tokens
            .iter()
            .filter(|(_, freq)| *freq >= self.config.min_token_frequency)
            .rev()
            .take(20)
            .cloned()
            .collect();

        // Histogram: frequency value -> number of distinct tokens with it.
        let mut frequency_distribution = HashMap::new();
        for &freq in self.token_frequencies.values() {
            *frequency_distribution.entry(freq).or_insert(0) += 1;
        }

        Ok(VocabularyCoverage {
            total_vocab_size,
            used_tokens,
            coverage_percentage,
            most_frequent_tokens,
            least_frequent_tokens,
            // Unused tokens cannot be enumerated without vocabulary access.
            unused_tokens: Vec::new(),
            frequency_distribution,
        })
    }

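    /// Summarizes character usage. Only observed characters are tracked, so
    /// coverage over the sample set is complete by construction.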
    fn analyze_character_coverage(&self) -> CharacterCoverage {
        let total_characters = self.character_frequencies.len();
        // Every observed character was tokenized, so coverage is complete by
        // construction.
        let covered_characters = self.character_frequencies.len();
        let coverage_percentage = 100.0;

        // Bucket characters into coarse Unicode-style categories.
        let mut unicode_categories = HashMap::new();
        for &ch in self.character_frequencies.keys() {
            let category = if ch.is_alphabetic() {
                "Letter".to_string()
            } else if ch.is_numeric() {
                "Number".to_string()
            } else if ch.is_whitespace() {
                "Separator".to_string()
            } else {
                "Other".to_string()
            };
            *unicode_categories.entry(category).or_insert(0) += 1;
        }

        CharacterCoverage {
            total_characters,
            covered_characters,
            coverage_percentage,
            uncovered_characters: Vec::new(),
            character_frequencies: self.character_frequencies.clone(),
            unicode_categories,
        }
    }

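    /// Computes token length, compression, OOV-rate, and prefix/suffix
    /// pattern statistics from the accumulated frequencies.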
    fn analyze_token_distribution(&self) -> TokenDistribution {
        let total_inputs = self.processing_times.len();
        let total_tokens: usize = self.token_frequencies.values().sum();

        let avg_tokens_per_input =
            if total_inputs > 0 { total_tokens as f64 / total_inputs as f64 } else { 0.0 };

        // Length histogram weighted by token frequency.
        let mut token_length_distribution = HashMap::new();
        let mut total_length = 0;

        for (token, &freq) in &self.token_frequencies {
            let length = token.chars().count();
            *token_length_distribution.entry(length).or_insert(0) += freq;
            total_length += length * freq;
        }

        let avg_token_length =
            if total_tokens > 0 { total_length as f64 / total_tokens as f64 } else { 0.0 };

        // Characters per token: higher means stronger compression.
        let total_chars: usize = self.character_frequencies.values().sum();
        let compression_ratio =
            if total_tokens > 0 { total_chars as f64 / total_tokens as f64 } else { 0.0 };

        // Placeholder tokens of the form `<id>` mark out-of-vocabulary ids.
        let oov_tokens = self
            .token_frequencies
            .iter()
            .filter(|(token, _)| token.starts_with('<') && token.ends_with('>'))
            .map(|(_, &freq)| freq)
            .sum::<usize>();

        let oov_rate = if total_tokens > 0 { oov_tokens as f64 / total_tokens as f64 } else { 0.0 };

        // Count two-character prefixes and suffixes; iterating over chars
        // keeps multi-byte UTF-8 tokens from causing a slicing panic.
        let mut pattern_counts = HashMap::new();
        for token in self.token_frequencies.keys() {
            let chars: Vec<char> = token.chars().collect();
            if chars.len() >= 3 {
                let prefix: String = chars[..2].iter().collect();
                let suffix: String = chars[chars.len() - 2..].iter().collect();
                *pattern_counts.entry(format!("prefix:{}", prefix)).or_insert(0) += 1;
                *pattern_counts.entry(format!("suffix:{}", suffix)).or_insert(0) += 1;
            }
        }

        let mut common_patterns: Vec<_> = pattern_counts.into_iter().collect();
        common_patterns.sort_by_key(|item| std::cmp::Reverse(item.1));
        common_patterns.truncate(20);

        TokenDistribution {
            avg_tokens_per_input,
            token_length_distribution,
            avg_token_length,
            compression_ratio,
            oov_rate,
            common_patterns,
        }
    }

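    /// Computes throughput and latency-percentile statistics. The memory
    /// figures are rough estimates derived from the distinct-token count.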
    fn analyze_performance_metrics(&self) -> PerformanceMetrics {
        let total_time: Duration = self.processing_times.iter().sum();
        let avg_time_per_input = if !self.processing_times.is_empty() {
            total_time / self.processing_times.len() as u32
        } else {
            Duration::from_secs(0)
        };

        let total_tokens: usize = self.token_frequencies.values().sum();
        let total_chars: usize = self.character_frequencies.values().sum();

        let tokens_per_second = if total_time.as_secs_f64() > 0.0 {
            total_tokens as f64 / total_time.as_secs_f64()
        } else {
            0.0
        };

        let characters_per_second = if total_time.as_secs_f64() > 0.0 {
            total_chars as f64 / total_time.as_secs_f64()
        } else {
            0.0
        };

        // Latency percentiles over per-input processing times, in seconds.
        let mut sorted_times = self.processing_times.clone();
        sorted_times.sort();

        let mut throughput_percentiles = HashMap::new();
        if !sorted_times.is_empty() {
            let p50_idx = sorted_times.len() / 2;
            let p90_idx = (sorted_times.len() as f64 * 0.9) as usize;
            let p99_idx = (sorted_times.len() as f64 * 0.99) as usize;

            throughput_percentiles.insert("p50".to_string(), sorted_times[p50_idx].as_secs_f64());
            throughput_percentiles.insert("p90".to_string(), sorted_times[p90_idx].as_secs_f64());
            throughput_percentiles.insert("p99".to_string(), sorted_times[p99_idx].as_secs_f64());
        }

        // Rough estimate: ~64 bytes of bookkeeping per distinct token.
        let vocab_memory = self.token_frequencies.len() * 64;
        let memory_usage = MemoryUsageStats {
            peak_memory: vocab_memory * 2,
            avg_memory: vocab_memory,
            memory_per_token: if total_tokens > 0 {
                vocab_memory as f64 / total_tokens as f64
            } else {
                0.0
            },
            vocab_memory,
        };

        PerformanceMetrics {
            total_time,
            avg_time_per_input,
            tokens_per_second,
            characters_per_second,
            memory_usage,
            throughput_percentiles,
        }
    }

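    /// Combines coverage and distribution statistics into five component
    /// scores and an equal-weight overall score.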
    fn calculate_quality_metrics(
        &self,
        vocab_coverage: &VocabularyCoverage,
        char_coverage: &CharacterCoverage,
        token_dist: &TokenDistribution,
    ) -> QualityMetrics {
        let vocab_efficiency = vocab_coverage.coverage_percentage / 100.0;

        // Peaks at an average token length of 4 and decays with distance.
        let consistency_score = if token_dist.avg_token_length > 0.0 {
            1.0 / (1.0 + (token_dist.avg_token_length - 4.0).abs() / 4.0)
        } else {
            0.0
        };

        // Compression ratio normalized against a nominal 5 chars/token ceiling.
        let information_density = (token_dist.compression_ratio / 5.0).min(1.0);

        let subword_quality = (1.0 - token_dist.oov_rate).max(0.0);

        let language_coverage = char_coverage.coverage_percentage / 100.0;

        // Equal-weight average of the five component scores.
        let overall_score = vocab_efficiency * 0.2
            + consistency_score * 0.2
            + information_density * 0.2
            + subword_quality * 0.2
            + language_coverage * 0.2;

        QualityMetrics {
            vocab_efficiency,
            consistency_score,
            information_density,
            subword_quality,
            language_coverage,
            overall_score,
        }
    }

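    /// Checks the collected metrics against the configured thresholds and
    /// emits one warning per violation.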
    fn generate_warnings(
        &self,
        vocab_coverage: &VocabularyCoverage,
        char_coverage: &CharacterCoverage,
        token_dist: &TokenDistribution,
        performance: &PerformanceMetrics,
    ) -> Vec<CoverageWarning> {
        let mut warnings = Vec::new();
        let thresholds = &self.config.coverage_thresholds;

        if vocab_coverage.coverage_percentage < thresholds.min_vocab_coverage * 100.0 {
            warnings.push(CoverageWarning {
                warning_type: WarningType::LowVocabCoverage,
                severity: WarningSeverity::Warning,
                message: format!(
                    "Low vocabulary coverage: {:.1}%",
                    vocab_coverage.coverage_percentage
                ),
                recommendation: "Consider using a larger or more diverse training corpus"
                    .to_string(),
                examples: vec![],
            });
        }

        if char_coverage.coverage_percentage < thresholds.min_char_coverage * 100.0 {
            warnings.push(CoverageWarning {
                warning_type: WarningType::LowCharCoverage,
                severity: WarningSeverity::Error,
                message: format!(
                    "Low character coverage: {:.1}%",
                    char_coverage.coverage_percentage
                ),
                recommendation: "Review tokenizer configuration and vocabulary".to_string(),
                examples: char_coverage
                    .uncovered_characters
                    .iter()
                    .take(10)
                    .map(|c| c.to_string())
                    .collect(),
            });
        }

        if token_dist.oov_rate > thresholds.max_oov_rate {
            warnings.push(CoverageWarning {
                warning_type: WarningType::HighOOVRate,
                severity: WarningSeverity::Error,
                message: format!("High OOV rate: {:.1}%", token_dist.oov_rate * 100.0),
                recommendation: "Expand vocabulary or improve tokenization algorithm".to_string(),
                examples: vec![],
            });
        }

        if token_dist.avg_token_length < thresholds.min_avg_token_length
            || token_dist.avg_token_length > thresholds.max_avg_token_length
        {
            warnings.push(CoverageWarning {
                warning_type: WarningType::QualityIssue,
                severity: WarningSeverity::Warning,
                message: format!(
                    "Suboptimal average token length: {:.1}",
                    token_dist.avg_token_length
                ),
                recommendation: "Adjust tokenization parameters for better token granularity"
                    .to_string(),
                examples: vec![],
            });
        }

        if performance.tokens_per_second < thresholds.min_tokens_per_second {
            warnings.push(CoverageWarning {
                warning_type: WarningType::PoorPerformance,
                severity: WarningSeverity::Warning,
                message: format!(
                    "Low throughput: {:.0} tokens/sec",
                    performance.tokens_per_second
                ),
                recommendation: "Consider performance optimizations or hardware upgrades"
                    .to_string(),
                examples: vec![],
            });
        }

        warnings
    }
}

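/// Serializes a [`CoverageReport`] into any of the [`ReportFormat`] variants.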
pub struct CoverageReportExporter;

impl CoverageReportExporter {
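    /// Renders `report` in the requested `format` and returns it as a string.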
    pub fn export_to_string(report: &CoverageReport, format: &ReportFormat) -> Result<String> {
        match format {
            ReportFormat::Json => serde_json::to_string_pretty(report)
                .map_err(|e| anyhow!("Failed to serialize JSON: {}", e)),
            ReportFormat::Yaml => serde_yaml::to_string(report)
                .map_err(|e| anyhow!("Failed to serialize YAML: {}", e)),
            ReportFormat::Html => Self::export_to_html(report),
            ReportFormat::Markdown => Self::export_to_markdown(report),
            ReportFormat::Csv => Self::export_to_csv(report),
        }
    }

    fn export_to_html(report: &CoverageReport) -> Result<String> {
        let mut html = String::new();
        html.push_str("<!DOCTYPE html>\n<html>\n<head>\n");
        html.push_str("<title>Tokenizer Coverage Report</title>\n");
        html.push_str("<style>body{font-family:Arial,sans-serif;margin:40px;}table{border-collapse:collapse;width:100%;}th,td{border:1px solid #ddd;padding:8px;text-align:left;}</style>\n");
        html.push_str("</head>\n<body>\n");

        html.push_str("<h1>Tokenizer Coverage Report</h1>\n");
        html.push_str(&format!(
            "<p>Generated: {}</p>\n",
            report.metadata.timestamp
        ));
        html.push_str(&format!(
            "<p>Samples: {}</p>\n",
            report.metadata.sample_count
        ));

        html.push_str("<h2>Vocabulary Coverage</h2>\n");
        html.push_str(&format!(
            "<p>Coverage: {:.1}%</p>\n",
            report.vocabulary_coverage.coverage_percentage
        ));
        html.push_str(&format!(
            "<p>Used Tokens: {}/{}</p>\n",
            report.vocabulary_coverage.used_tokens, report.vocabulary_coverage.total_vocab_size
        ));

        html.push_str("<h2>Performance Metrics</h2>\n");
        html.push_str(&format!(
            "<p>Tokens/sec: {:.0}</p>\n",
            report.performance_metrics.tokens_per_second
        ));
        html.push_str(&format!(
            "<p>Characters/sec: {:.0}</p>\n",
            report.performance_metrics.characters_per_second
        ));

        if !report.warnings.is_empty() {
            html.push_str("<h2>Warnings</h2>\n<ul>\n");
            for warning in &report.warnings {
                html.push_str(&format!(
                    "<li><strong>{:?}</strong>: {}</li>\n",
                    warning.severity, warning.message
                ));
            }
            html.push_str("</ul>\n");
        }

        html.push_str("</body>\n</html>");
        Ok(html)
    }

    fn export_to_markdown(report: &CoverageReport) -> Result<String> {
        let mut md = String::new();

        md.push_str("# Tokenizer Coverage Report\n\n");
        md.push_str(&format!("**Generated:** {}\n", report.metadata.timestamp));
        md.push_str(&format!(
            "**Samples:** {}\n\n",
            report.metadata.sample_count
        ));

        md.push_str("## Vocabulary Coverage\n\n");
        md.push_str(&format!(
            "- **Coverage:** {:.1}%\n",
            report.vocabulary_coverage.coverage_percentage
        ));
        md.push_str(&format!(
            "- **Used Tokens:** {}/{}\n\n",
            report.vocabulary_coverage.used_tokens, report.vocabulary_coverage.total_vocab_size
        ));

        md.push_str("## Performance Metrics\n\n");
        md.push_str(&format!(
            "- **Tokens/sec:** {:.0}\n",
            report.performance_metrics.tokens_per_second
        ));
        md.push_str(&format!(
            "- **Characters/sec:** {:.0}\n\n",
            report.performance_metrics.characters_per_second
        ));

        if !report.warnings.is_empty() {
            md.push_str("## Warnings\n\n");
            for warning in &report.warnings {
                md.push_str(&format!(
                    "- **{:?}:** {}\n",
                    warning.severity, warning.message
                ));
            }
        }

        Ok(md)
    }

    fn export_to_csv(report: &CoverageReport) -> Result<String> {
        let mut csv = String::new();

        csv.push_str("Metric,Value\n");
        csv.push_str(&format!(
            "Vocabulary Coverage,{:.1}%\n",
            report.vocabulary_coverage.coverage_percentage
        ));
        csv.push_str(&format!(
            "Character Coverage,{:.1}%\n",
            report.character_coverage.coverage_percentage
        ));
        csv.push_str(&format!(
            "Average Token Length,{:.2}\n",
            report.token_distribution.avg_token_length
        ));
        csv.push_str(&format!(
            "OOV Rate,{:.2}%\n",
            report.token_distribution.oov_rate * 100.0
        ));
        csv.push_str(&format!(
            "Tokens per Second,{:.0}\n",
            report.performance_metrics.tokens_per_second
        ));
        csv.push_str(&format!(
            "Overall Quality Score,{:.2}\n",
            report.quality_metrics.overall_score
        ));

        Ok(csv)
    }

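    /// Renders `report` in the requested `format` and writes it to `path`.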
    pub fn save_to_file(report: &CoverageReport, path: &str, format: &ReportFormat) -> Result<()> {
        let content = Self::export_to_string(report, format)?;
        std::fs::write(path, content).map_err(|e| anyhow!("Failed to write report to file: {}", e))
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::char::CharTokenizer;
    use std::collections::HashMap;

    fn create_test_char_tokenizer() -> CharTokenizer {
        let mut vocab = HashMap::new();
        vocab.insert("[PAD]".to_string(), 0);
        vocab.insert("[UNK]".to_string(), 1);
        vocab.insert("[CLS]".to_string(), 2);
        vocab.insert("[SEP]".to_string(), 3);
        vocab.insert("h".to_string(), 4);
        vocab.insert("e".to_string(), 5);
        vocab.insert("l".to_string(), 6);
        vocab.insert("o".to_string(), 7);
        vocab.insert("w".to_string(), 8);
        vocab.insert("r".to_string(), 9);
        vocab.insert("d".to_string(), 10);
        vocab.insert(" ".to_string(), 11);
        vocab.insert("t".to_string(), 12);
        vocab.insert("s".to_string(), 13);
        CharTokenizer::new(vocab)
    }

    #[test]
    fn test_coverage_config() {
        let config = CoverageConfig::default();
        assert_eq!(config.min_token_frequency, 1);
        assert_eq!(config.max_examples, 100);
        assert!(config.include_token_analysis);
    }

    #[test]
    fn test_coverage_analyzer_creation() {
        let tokenizer = create_test_char_tokenizer();
        let analyzer = CoverageAnalyzer::from_tokenizer(tokenizer);
        assert_eq!(analyzer.token_frequencies.len(), 0);
        assert_eq!(analyzer.character_frequencies.len(), 0);
    }

    #[test]
    fn test_analyze_input() {
        let tokenizer = create_test_char_tokenizer();
        let mut analyzer = CoverageAnalyzer::from_tokenizer(tokenizer);

        let result = analyzer.analyze_input("hello world");
        assert!(result.is_ok());
        assert!(!analyzer.token_frequencies.is_empty());
        assert!(!analyzer.character_frequencies.is_empty());
    }

    #[test]
    fn test_generate_report() {
        let tokenizer = create_test_char_tokenizer();
        let mut analyzer = CoverageAnalyzer::from_tokenizer(tokenizer);

        analyzer.analyze_input("hello").expect("Operation failed in test");
        analyzer.analyze_input("world").expect("Operation failed in test");

        let report = analyzer.generate_report();
        assert!(report.is_ok());
        let report = report.expect("Operation failed in test");
        assert_eq!(report.metadata.sample_count, 2);
        assert!(report.vocabulary_coverage.used_tokens > 0);
    }

    #[test]
    fn test_report_export_json() {
        let tokenizer = create_test_char_tokenizer();
        let mut analyzer = CoverageAnalyzer::from_tokenizer(tokenizer);

        analyzer.analyze_input("test").expect("Operation failed in test");
        let report = analyzer.generate_report().expect("Operation failed in test");

        let json_result = CoverageReportExporter::export_to_string(&report, &ReportFormat::Json);
        assert!(json_result.is_ok());
        let json = json_result.expect("Operation failed in test");
        assert!(json.contains("vocabulary_coverage"));
    }

    #[test]
    fn test_report_export_markdown() {
        let tokenizer = create_test_char_tokenizer();
        let mut analyzer = CoverageAnalyzer::from_tokenizer(tokenizer);

        analyzer.analyze_input("test").expect("Operation failed in test");
        let report = analyzer.generate_report().expect("Operation failed in test");

        let md_result = CoverageReportExporter::export_to_string(&report, &ReportFormat::Markdown);
        assert!(md_result.is_ok());
        let md = md_result.expect("Operation failed in test");
        assert!(md.contains("# Tokenizer Coverage Report"));
    }

    #[test]
    fn test_coverage_thresholds() {
        let thresholds = CoverageThresholds::default();
        assert_eq!(thresholds.min_vocab_coverage, 0.95);
        assert_eq!(thresholds.max_oov_rate, 0.05);
    }

    #[test]
    fn test_quality_score_calculation() {
        let tokenizer = create_test_char_tokenizer();
        let analyzer = CoverageAnalyzer::from_tokenizer(tokenizer);

        let tokens = vec!["hello".to_string(), "world".to_string()];
        let score = analyzer.calculate_example_quality_score("hello world", &tokens);
        assert!(score > 0.0 && score <= 1.0);
    }

    #[test]
    fn test_warning_generation() {
        let tokenizer = create_test_char_tokenizer();
        let mut analyzer = CoverageAnalyzer::from_tokenizer(tokenizer);

        // Short, mostly out-of-vocabulary tokens should violate at least one
        // coverage or token-length threshold.
        analyzer.analyze_input("a b c d e f g").expect("Operation failed in test");
        let report = analyzer.generate_report().expect("Operation failed in test");

        assert!(!report.warnings.is_empty());
    }

    #[test]
    fn test_batch_analysis() {
        let tokenizer = create_test_char_tokenizer();
        let mut analyzer = CoverageAnalyzer::from_tokenizer(tokenizer);

        let texts = vec![
            "hello world".to_string(),
            "goodbye world".to_string(),
            "test text".to_string(),
        ];

        let result = analyzer.analyze_batch(&texts);
        assert!(result.is_ok());
        assert_eq!(analyzer.processing_times.len(), 3);
    }
}