1use serde::{Deserialize, Serialize};
10use std::collections::HashMap;
11
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CoverageAnalysis {
    /// Fraction (0.0..=1.0) of corpus characters covered by the vocabulary;
    /// rendered as a percentage by `summary()`.
    pub char_coverage_rate: f64,
    /// Fraction (0.0..=1.0) of corpus words covered by the vocabulary.
    pub word_coverage_rate: f64,
    /// Compression ratio reported in `summary()`; lower feeds a higher
    /// `efficiency_score()`. NOTE(review): the exact numerator/denominator is
    /// defined by the producer of this struct (not visible here) — confirm.
    pub compression_ratio: f64,
    /// Total characters in the analyzed corpus.
    pub total_chars: usize,
    /// Total words in the analyzed corpus.
    pub total_words: usize,
    /// Total tokens produced for the corpus.
    pub total_tokens: usize,
    /// Corpus characters covered by the vocabulary.
    pub covered_chars: usize,
    /// Corpus words covered by the vocabulary.
    pub covered_words: usize,
    /// Token length (in chars) -> number of tokens with that length.
    pub length_distribution: HashMap<usize, u32>,
    /// Tokens that fell outside the vocabulary (out-of-vocabulary).
    pub oov_tokens: Vec<String>,
    /// Number of entries in the vocabulary.
    pub vocab_size: usize,
}
38
39impl CoverageAnalysis {
40 pub fn summary(&self) -> String {
42 format!(
43 "Coverage Analysis Summary:\n\
44 - Character Coverage: {:.2}% ({}/{})\n\
45 - Word Coverage: {:.2}% ({}/{})\n\
46 - Compression Ratio: {:.3}\n\
47 - Vocabulary Size: {}\n\
48 - OOV Tokens: {}\n\
49 - Average Token Length: {:.2}",
50 self.char_coverage_rate * 100.0,
51 self.covered_chars,
52 self.total_chars,
53 self.word_coverage_rate * 100.0,
54 self.covered_words,
55 self.total_words,
56 self.compression_ratio,
57 self.vocab_size,
58 self.oov_tokens.len(),
59 self.average_token_length()
60 )
61 }
62
63 pub fn average_token_length(&self) -> f64 {
65 let total_tokens: u32 = self.length_distribution.values().sum();
66 if total_tokens == 0 {
67 return 0.0;
68 }
69
70 let weighted_sum: u32 = self
71 .length_distribution
72 .iter()
73 .map(|(&length, &count)| length as u32 * count)
74 .sum();
75
76 weighted_sum as f64 / total_tokens as f64
77 }
78
79 pub fn top_token_lengths(&self, n: usize) -> Vec<(usize, u32)> {
81 let mut lengths: Vec<_> = self.length_distribution.iter().collect();
82 lengths.sort_by(|a, b| b.1.cmp(a.1));
83 lengths.into_iter().take(n).map(|(&len, &count)| (len, count)).collect()
84 }
85
86 pub fn efficiency_score(&self) -> f64 {
88 let coverage_score = 0.6 * self.char_coverage_rate + 0.4 * self.word_coverage_rate;
90 let compression_score = 1.0 / (1.0 + self.compression_ratio);
91 0.7 * coverage_score + 0.3 * compression_score
92 }
93}
94
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageDetector {
    /// language code -> (character -> expected frequency, in percent).
    language_profiles: HashMap<String, HashMap<char, f64>>,
    /// language code -> (n-gram string -> expected frequency, in percent).
    ngram_profiles: HashMap<String, HashMap<String, f64>>,
    /// Language codes with registered profiles, in registration order.
    supported_languages: Vec<String>,
}
105
106impl LanguageDetector {
107 pub fn new() -> Self {
109 let mut detector = Self {
110 language_profiles: HashMap::new(),
111 ngram_profiles: HashMap::new(),
112 supported_languages: Vec::new(),
113 };
114
115 detector.initialize_built_in_profiles();
116 detector
117 }
118
119 fn initialize_built_in_profiles(&mut self) {
121 let mut english_chars = HashMap::new();
123 english_chars.insert('e', 12.7);
124 english_chars.insert('t', 9.1);
125 english_chars.insert('a', 8.2);
126 english_chars.insert('o', 7.5);
127 english_chars.insert('i', 7.0);
128 english_chars.insert('n', 6.7);
129 english_chars.insert('s', 6.3);
130 english_chars.insert('h', 6.1);
131 english_chars.insert('r', 6.0);
132
133 let mut english_ngrams = HashMap::new();
134 english_ngrams.insert("th".to_string(), 2.7);
135 english_ngrams.insert("he".to_string(), 2.3);
136 english_ngrams.insert("in".to_string(), 2.0);
137 english_ngrams.insert("er".to_string(), 1.8);
138 english_ngrams.insert("an".to_string(), 1.6);
139
140 self.language_profiles.insert("en".to_string(), english_chars);
141 self.ngram_profiles.insert("en".to_string(), english_ngrams);
142
143 let mut spanish_chars = HashMap::new();
145 spanish_chars.insert('e', 13.7);
146 spanish_chars.insert('a', 11.5);
147 spanish_chars.insert('o', 8.7);
148 spanish_chars.insert('s', 8.0);
149 spanish_chars.insert('r', 6.9);
150 spanish_chars.insert('n', 6.7);
151 spanish_chars.insert('i', 6.2);
152 spanish_chars.insert('d', 5.9);
153 spanish_chars.insert('l', 5.0);
154
155 let mut spanish_ngrams = HashMap::new();
156 spanish_ngrams.insert("de".to_string(), 2.8);
157 spanish_ngrams.insert("la".to_string(), 2.5);
158 spanish_ngrams.insert("es".to_string(), 2.1);
159 spanish_ngrams.insert("en".to_string(), 1.9);
160 spanish_ngrams.insert("el".to_string(), 1.7);
161
162 self.language_profiles.insert("es".to_string(), spanish_chars);
163 self.ngram_profiles.insert("es".to_string(), spanish_ngrams);
164
165 let mut french_chars = HashMap::new();
167 french_chars.insert('e', 14.7);
168 french_chars.insert('s', 7.9);
169 french_chars.insert('a', 7.6);
170 french_chars.insert('i', 7.5);
171 french_chars.insert('t', 7.2);
172 french_chars.insert('n', 7.1);
173 french_chars.insert('r', 6.6);
174 french_chars.insert('u', 6.3);
175 french_chars.insert('l', 5.5);
176
177 let mut french_ngrams = HashMap::new();
178 french_ngrams.insert("de".to_string(), 3.0);
179 french_ngrams.insert("le".to_string(), 2.4);
180 french_ngrams.insert("es".to_string(), 2.1);
181 french_ngrams.insert("re".to_string(), 1.8);
182 french_ngrams.insert("nt".to_string(), 1.6);
183
184 self.language_profiles.insert("fr".to_string(), french_chars);
185 self.ngram_profiles.insert("fr".to_string(), french_ngrams);
186
187 let mut german_chars = HashMap::new();
189 german_chars.insert('e', 17.4);
190 german_chars.insert('n', 9.8);
191 german_chars.insert('i', 7.6);
192 german_chars.insert('s', 7.3);
193 german_chars.insert('r', 7.0);
194 german_chars.insert('a', 6.5);
195 german_chars.insert('t', 6.2);
196 german_chars.insert('d', 5.1);
197 german_chars.insert('h', 4.8);
198
199 let mut german_ngrams = HashMap::new();
200 german_ngrams.insert("er".to_string(), 3.9);
201 german_ngrams.insert("en".to_string(), 3.6);
202 german_ngrams.insert("ch".to_string(), 2.4);
203 german_ngrams.insert("de".to_string(), 2.1);
204 german_ngrams.insert("ei".to_string(), 1.8);
205
206 self.language_profiles.insert("de".to_string(), german_chars);
207 self.ngram_profiles.insert("de".to_string(), german_ngrams);
208
209 self.supported_languages = vec![
210 "en".to_string(),
211 "es".to_string(),
212 "fr".to_string(),
213 "de".to_string(),
214 ];
215 }
216
217 pub fn detect_language(&self, text: &str) -> LanguageDetectionResult {
222 if text.trim().is_empty() {
223 return LanguageDetectionResult {
224 detected_language: "unknown".to_string(),
225 confidence: 0.0,
226 scores: HashMap::new(),
227 };
228 }
229
230 let text_lower = text.to_lowercase();
231 let char_freq = self.calculate_char_frequency(&text_lower);
232 let ngram_freq = self.calculate_ngram_frequency(&text_lower, 2);
233
234 let mut scores = HashMap::new();
235
236 for lang in &self.supported_languages {
237 let char_score = self.calculate_char_similarity(&char_freq, lang);
238 let ngram_score = self.calculate_ngram_similarity(&ngram_freq, lang);
239
240 let combined_score = 0.6 * char_score + 0.4 * ngram_score;
242 scores.insert(lang.clone(), combined_score);
243 }
244
245 let (detected_language, confidence) = scores
246 .iter()
247 .max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
248 .map(|(lang, score)| (lang.clone(), *score))
249 .unwrap_or(("unknown".to_string(), 0.0));
250
251 LanguageDetectionResult {
252 detected_language,
253 confidence,
254 scores,
255 }
256 }
257
258 fn calculate_char_frequency(&self, text: &str) -> HashMap<char, f64> {
260 let mut freq = HashMap::new();
261 let total_chars = text.chars().filter(|c| c.is_alphabetic()).count() as f64;
262
263 if total_chars == 0.0 {
264 return freq;
265 }
266
267 for ch in text.chars() {
268 if ch.is_alphabetic() {
269 *freq.entry(ch).or_insert(0.0) += 1.0;
270 }
271 }
272
273 for value in freq.values_mut() {
275 *value = (*value / total_chars) * 100.0;
276 }
277
278 freq
279 }
280
281 fn calculate_ngram_frequency(&self, text: &str, n: usize) -> HashMap<String, f64> {
283 let mut freq = HashMap::new();
284 let chars: Vec<char> = text.chars().filter(|c| c.is_alphabetic()).collect();
285 let total_ngrams = chars.len().saturating_sub(n - 1) as f64;
286
287 if total_ngrams == 0.0 {
288 return freq;
289 }
290
291 for window in chars.windows(n) {
292 let ngram: String = window.iter().collect();
293 *freq.entry(ngram).or_insert(0.0) += 1.0;
294 }
295
296 for value in freq.values_mut() {
298 *value = (*value / total_ngrams) * 100.0;
299 }
300
301 freq
302 }
303
304 fn calculate_char_similarity(&self, text_freq: &HashMap<char, f64>, language: &str) -> f64 {
306 let profile = match self.language_profiles.get(language) {
307 Some(p) => p,
308 None => return 0.0,
309 };
310
311 let mut similarity = 0.0;
312 let mut total_chars = 0;
313
314 for (ch, expected_freq) in profile {
315 let actual_freq = text_freq.get(ch).unwrap_or(&0.0);
316 similarity += 1.0 / (1.0 + (expected_freq - actual_freq).abs());
317 total_chars += 1;
318 }
319
320 if total_chars > 0 {
321 similarity / total_chars as f64
322 } else {
323 0.0
324 }
325 }
326
327 fn calculate_ngram_similarity(&self, text_freq: &HashMap<String, f64>, language: &str) -> f64 {
329 let profile = match self.ngram_profiles.get(language) {
330 Some(p) => p,
331 None => return 0.0,
332 };
333
334 let mut similarity = 0.0;
335 let mut total_ngrams = 0;
336
337 for (ngram, expected_freq) in profile {
338 let actual_freq = text_freq.get(ngram).unwrap_or(&0.0);
339 similarity += 1.0 / (1.0 + (expected_freq - actual_freq).abs());
340 total_ngrams += 1;
341 }
342
343 if total_ngrams > 0 {
344 similarity / total_ngrams as f64
345 } else {
346 0.0
347 }
348 }
349
350 pub fn supported_languages(&self) -> &[String] {
352 &self.supported_languages
353 }
354
355 pub fn add_language_profile(
357 &mut self,
358 language: String,
359 char_profile: HashMap<char, f64>,
360 ngram_profile: HashMap<String, f64>,
361 ) {
362 self.language_profiles.insert(language.clone(), char_profile);
363 self.ngram_profiles.insert(language.clone(), ngram_profile);
364 if !self.supported_languages.contains(&language) {
365 self.supported_languages.push(language);
366 }
367 }
368}
369
370impl Default for LanguageDetector {
371 fn default() -> Self {
372 Self::new()
373 }
374}
375
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageDetectionResult {
    /// Best-scoring language code, or "unknown" for empty input.
    pub detected_language: String,
    /// Score of the detected language (higher means more confident).
    pub confidence: f64,
    /// Combined similarity score per candidate language.
    pub scores: HashMap<String, f64>,
}
386
387impl LanguageDetectionResult {
388 pub fn top_candidates(&self, n: usize) -> Vec<(String, f64)> {
390 let mut sorted_scores: Vec<_> = self.scores.iter().collect();
391 sorted_scores.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap_or(std::cmp::Ordering::Equal));
392 sorted_scores
393 .into_iter()
394 .take(n)
395 .map(|(lang, score)| (lang.clone(), *score))
396 .collect()
397 }
398
399 pub fn is_confident(&self, threshold: f64) -> bool {
401 self.confidence >= threshold
402 }
403}
404
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TokenDistributionAnalyzer {
    /// Ascending token-length bin edges; a token counts toward the first
    /// edge its length does not exceed.
    length_bins: Vec<usize>,
    /// Ascending token-frequency bin edges, same bucketing rule.
    frequency_bins: Vec<usize>,
}
413
414impl TokenDistributionAnalyzer {
415 pub fn new() -> Self {
417 Self {
418 length_bins: vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 50],
419 frequency_bins: vec![1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000],
420 }
421 }
422
423 pub fn with_bins(length_bins: Vec<usize>, frequency_bins: Vec<usize>) -> Self {
425 Self {
426 length_bins,
427 frequency_bins,
428 }
429 }
430
431 pub fn analyze_distribution(&self, vocab: &HashMap<String, u32>) -> TokenDistributionResult {
433 let mut length_distribution = HashMap::new();
434 let mut frequency_distribution = HashMap::new();
435 let mut character_usage = HashMap::new();
436 let mut prefix_analysis = HashMap::new();
437 let mut suffix_analysis = HashMap::new();
438
439 for &bin in &self.length_bins {
441 length_distribution.insert(bin, 0);
442 }
443 for &bin in &self.frequency_bins {
444 frequency_distribution.insert(bin, 0);
445 }
446
447 let mut total_tokens = 0;
448 let mut total_length = 0;
449 let mut max_length = 0;
450 let mut min_length = usize::MAX;
451
452 for (token, &freq) in vocab {
453 let length = token.chars().count();
454 total_tokens += 1;
455 total_length += length;
456 max_length = max_length.max(length);
457 min_length = min_length.min(length);
458
459 for &bin in &self.length_bins {
461 if length <= bin {
462 *length_distribution.entry(bin).or_insert(0) += 1;
463 break;
464 }
465 }
466
467 let token_freq = freq as usize;
469 for &bin in &self.frequency_bins {
470 if token_freq <= bin {
471 *frequency_distribution.entry(bin).or_insert(0) += 1;
472 break;
473 }
474 }
475
476 for ch in token.chars() {
478 *character_usage.entry(ch).or_insert(0) += 1;
479 }
480
481 if length >= 2 {
483 let prefix2: String = token.chars().take(2).collect();
484 *prefix_analysis.entry(prefix2).or_insert(0) += 1;
485
486 let suffix2: String =
487 token.chars().rev().take(2).collect::<String>().chars().rev().collect();
488 *suffix_analysis.entry(suffix2).or_insert(0) += 1;
489 }
490 if length >= 3 {
491 let prefix3: String = token.chars().take(3).collect();
492 *prefix_analysis.entry(prefix3).or_insert(0) += 1;
493
494 let suffix3: String =
495 token.chars().rev().take(3).collect::<String>().chars().rev().collect();
496 *suffix_analysis.entry(suffix3).or_insert(0) += 1;
497 }
498 }
499
500 let average_length =
501 if total_tokens > 0 { total_length as f64 / total_tokens as f64 } else { 0.0 };
502
503 let mut char_frequency: Vec<_> = character_usage.into_iter().collect();
505 char_frequency.sort_by_key(|item| std::cmp::Reverse(item.1));
506
507 let mut prefix_frequency: Vec<_> = prefix_analysis.into_iter().collect();
509 prefix_frequency.sort_by_key(|item| std::cmp::Reverse(item.1));
510 prefix_frequency.truncate(20); let mut suffix_frequency: Vec<_> = suffix_analysis.into_iter().collect();
513 suffix_frequency.sort_by_key(|item| std::cmp::Reverse(item.1));
514 suffix_frequency.truncate(20); TokenDistributionResult {
517 total_tokens,
518 average_length,
519 max_length,
520 min_length: if min_length == usize::MAX { 0 } else { min_length },
521 length_distribution,
522 frequency_distribution,
523 character_frequency: char_frequency.into_iter().collect(),
524 prefix_frequency: prefix_frequency.into_iter().collect(),
525 suffix_frequency: suffix_frequency.into_iter().collect(),
526 }
527 }
528}
529
530impl Default for TokenDistributionAnalyzer {
531 fn default() -> Self {
532 Self::new()
533 }
534}
535
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TokenDistributionResult {
    /// Number of distinct tokens analyzed.
    pub total_tokens: usize,
    /// Mean token length in characters (0.0 for an empty vocabulary).
    pub average_length: f64,
    /// Longest token length in chars; 0 when the vocabulary was empty.
    pub max_length: usize,
    /// Shortest token length in chars; 0 when the vocabulary was empty.
    pub min_length: usize,
    /// Length bin edge -> count of tokens bucketed at that edge.
    pub length_distribution: HashMap<usize, usize>,
    /// Frequency bin edge -> count of tokens bucketed at that edge.
    pub frequency_distribution: HashMap<usize, usize>,
    /// Character -> total occurrences across all tokens.
    pub character_frequency: HashMap<char, usize>,
    /// 2-3 char prefix -> token count (at most the top 20 entries).
    pub prefix_frequency: HashMap<String, usize>,
    /// 2-3 char suffix -> token count (at most the top 20 entries).
    pub suffix_frequency: HashMap<String, usize>,
}
558
559impl TokenDistributionResult {
560 pub fn generate_report(&self) -> String {
562 format!(
563 "Token Distribution Analysis Report\n\
564 ===================================\n\
565 Total Tokens: {}\n\
566 Average Token Length: {:.2}\n\
567 Min/Max Length: {}/{}\n\
568 \n\
569 Length Distribution:\n\
570 {}\n\
571 \n\
572 Top 10 Characters by Frequency:\n\
573 {}\n\
574 \n\
575 Top 10 Prefixes:\n\
576 {}\n\
577 \n\
578 Top 10 Suffixes:\n\
579 {}",
580 self.total_tokens,
581 self.average_length,
582 self.min_length,
583 self.max_length,
584 self.format_length_distribution(),
585 self.format_character_frequency(),
586 self.format_prefix_frequency(),
587 self.format_suffix_frequency()
588 )
589 }
590
591 fn format_length_distribution(&self) -> String {
593 let mut sorted: Vec<_> = self.length_distribution.iter().collect();
594 sorted.sort_by_key(|(len, _)| *len);
595
596 sorted
597 .iter()
598 .map(|(len, count)| format!(" Length ≤{}: {} tokens", len, count))
599 .collect::<Vec<_>>()
600 .join("\n")
601 }
602
603 fn format_character_frequency(&self) -> String {
605 self.character_frequency
606 .iter()
607 .take(10)
608 .map(|(ch, count)| format!(" '{}': {} occurrences", ch, count))
609 .collect::<Vec<_>>()
610 .join("\n")
611 }
612
613 fn format_prefix_frequency(&self) -> String {
615 self.prefix_frequency
616 .iter()
617 .take(10)
618 .map(|(prefix, count)| format!(" '{}': {} tokens", prefix, count))
619 .collect::<Vec<_>>()
620 .join("\n")
621 }
622
623 fn format_suffix_frequency(&self) -> String {
625 self.suffix_frequency
626 .iter()
627 .take(10)
628 .map(|(suffix, count)| format!(" '{}': {} tokens", suffix, count))
629 .collect::<Vec<_>>()
630 .join("\n")
631 }
632
633 pub fn diversity_score(&self) -> f64 {
635 if self.total_tokens == 0 {
636 return 0.0;
637 }
638
639 let mut entropy = 0.0;
641 let total_chars: usize = self.character_frequency.values().sum();
642
643 if total_chars > 0 {
644 for &freq in self.character_frequency.values() {
645 let prob = freq as f64 / total_chars as f64;
646 if prob > 0.0 {
647 entropy -= prob * prob.log2();
648 }
649 }
650 }
651
652 entropy
653 }
654
655 pub fn optimal_length_range(&self) -> (usize, usize) {
657 let mut cumulative = 0;
658 let target_coverage = (self.total_tokens as f64 * 0.8) as usize; let mut sorted: Vec<_> = self.length_distribution.iter().collect();
661 sorted.sort_by_key(|(len, _)| *len);
662
663 let min_optimal = self.min_length;
664 let mut max_optimal = self.max_length;
665
666 for (len, count) in sorted {
667 cumulative += count;
668 if cumulative >= target_coverage {
669 max_optimal = *len;
670 break;
671 }
672 }
673
674 (min_optimal, max_optimal)
675 }
676}
677
#[cfg(test)]
mod tests {
    use super::*;

    // Exercises summary formatting, derived metrics, and top-length ranking
    // on a hand-built CoverageAnalysis.
    #[test]
    fn test_coverage_analysis() {
        let analysis = CoverageAnalysis {
            char_coverage_rate: 0.95,
            word_coverage_rate: 0.88,
            compression_ratio: 0.6,
            total_chars: 1000,
            total_words: 200,
            total_tokens: 600,
            covered_chars: 950,
            covered_words: 176,
            length_distribution: {
                let mut dist = HashMap::new();
                dist.insert(1, 100);
                dist.insert(2, 200);
                dist.insert(3, 150);
                dist.insert(4, 100);
                dist.insert(5, 50);
                dist
            },
            oov_tokens: vec!["[UNK]".to_string()],
            vocab_size: 5000,
        };

        // Weighted mean: (1*100 + 2*200 + 3*150 + 4*100 + 5*50) / 600 = 8/3.
        assert!((analysis.average_token_length() - 2.6666666666666665).abs() < 1e-10);
        assert!(analysis.efficiency_score() > 0.0);

        let summary = analysis.summary();
        assert!(summary.contains("95.00%"));
        assert!(summary.contains("88.00%"));

        // Length 2 has the unique highest count (200), so it must rank first.
        let top_lengths = analysis.top_token_lengths(3);
        assert_eq!(top_lengths[0], (2, 200));
    }

    // Built-in profiles should recognize plainly English text.
    #[test]
    fn test_language_detector() {
        let detector = LanguageDetector::new();

        assert_eq!(detector.supported_languages().len(), 4);
        assert!(detector.supported_languages().contains(&"en".to_string()));

        let result = detector.detect_language("Hello world, this is a test in English");
        assert_eq!(result.detected_language, "en");
        assert!(result.confidence > 0.0);

        let top_candidates = result.top_candidates(2);
        assert_eq!(top_candidates[0].0, "en");

        assert!(result.is_confident(0.1));
    }

    // Empty input must short-circuit to "unknown" with zero confidence.
    #[test]
    fn test_language_detector_empty_text() {
        let detector = LanguageDetector::new();
        let result = detector.detect_language("");
        assert_eq!(result.detected_language, "unknown");
        assert_eq!(result.confidence, 0.0);
    }

    // End-to-end distribution analysis on a tiny vocabulary.
    #[test]
    fn test_token_distribution_analyzer() {
        let analyzer = TokenDistributionAnalyzer::new();

        let mut vocab = HashMap::new();
        vocab.insert("a".to_string(), 1);
        vocab.insert("the".to_string(), 2);
        vocab.insert("hello".to_string(), 3);
        vocab.insert("world".to_string(), 4);
        vocab.insert("test".to_string(), 5);

        let result = analyzer.analyze_distribution(&vocab);

        assert_eq!(result.total_tokens, 5);
        assert!(result.average_length > 0.0);
        assert_eq!(result.min_length, 1);
        assert_eq!(result.max_length, 5);

        let report = result.generate_report();
        assert!(report.contains("Token Distribution Analysis Report"));
        assert!(report.contains("Total Tokens: 5"));

        assert!(result.diversity_score() > 0.0);

        let (min_opt, max_opt) = result.optimal_length_range();
        assert!(min_opt <= max_opt);
    }

    // A user-registered profile should win for text matching it strongly.
    #[test]
    fn test_custom_language_profile() {
        let mut detector = LanguageDetector::new();

        let mut custom_chars = HashMap::new();
        custom_chars.insert('x', 50.0);
        custom_chars.insert('y', 40.0);
        custom_chars.insert('z', 30.0);

        let mut custom_ngrams = HashMap::new();
        custom_ngrams.insert("xy".to_string(), 20.0);
        custom_ngrams.insert("yz".to_string(), 15.0);
        custom_ngrams.insert("xz".to_string(), 10.0);

        detector.add_language_profile("custom".to_string(), custom_chars, custom_ngrams);

        assert!(detector.supported_languages().contains(&"custom".to_string()));

        let result = detector.detect_language("xyxyxyxyxyzyzyzyzxzxzxzxz");
        assert_eq!(result.detected_language, "custom");
    }

    // with_bins should accept arbitrary custom bin edges.
    #[test]
    fn test_distribution_analyzer_custom_bins() {
        let analyzer = TokenDistributionAnalyzer::with_bins(vec![1, 3, 5], vec![1, 10, 100]);

        let mut vocab = HashMap::new();
        vocab.insert("a".to_string(), 1);
        vocab.insert("abc".to_string(), 15);
        vocab.insert("abcde".to_string(), 150);

        let result = analyzer.analyze_distribution(&vocab);
        assert_eq!(result.total_tokens, 3);
    }
}