use crate::compressor::{CompressionResult, OutputFormat};
#[cfg(feature = "image")]
use crate::image_renderer::{ImageRenderer, ImageRendererConfig};
use regex::Regex;
use std::collections::HashMap;
use std::sync::OnceLock;

/// Kinds of text spans that are shielded from compression.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SpanType {
    CodeBlock,
    JsonBlock,
    Path,
    Identifier,
    HashOrNumber,
    Bracket,
}

/// A byte range in the input that must survive compression intact.
#[derive(Debug, Clone)]
struct ProtectedSpan {
    start: usize,
    end: usize,
    _span_type: SpanType,
}

/// Importance score assigned to one whitespace-delimited word.
#[derive(Debug, Clone)]
pub struct WordImportance {
    /// Index of the word in the original token sequence.
    pub position: usize,
    /// The word itself.
    pub text: String,
    /// Combined score; `f64::INFINITY` marks words that must be kept.
    pub score: f64,
}

/// Configuration for [`StatisticalFilter`].
#[derive(Debug, Clone)]
pub struct StatisticalFilterConfig {
    /// Fraction of words to keep (0.0..=1.0); lower is more aggressive.
    pub compression_ratio: f32,

    /// Weight of the inverse-frequency component.
    pub idf_weight: f32,

    /// Weight of the positional component (text edges score higher).
    pub position_weight: f32,

    /// Weight of the part-of-speech heuristic (stopwords score lower).
    pub pos_weight: f32,

    /// Weight of the named-entity heuristic.
    pub entity_weight: f32,

    /// Weight of the local lexical-diversity component.
    pub entropy_weight: f32,

    /// Protect code blocks, JSON, paths, identifiers, hashes, and brackets.
    pub enable_protection_masks: bool,

    /// Keep stopwords whose context makes them load-bearing.
    pub enable_contextual_stopwords: bool,

    /// Always keep negations ("not", "never", ...).
    pub preserve_negations: bool,

    /// Always keep comparison operators ("<=", "!=", ...).
    pub preserve_comparators: bool,

    /// Domain terms that are always kept (matched case-insensitively).
    pub domain_terms: Vec<String>,

    /// Largest allowed gap (in words) between kept critical tokens before a
    /// bridging word is pulled back in.
    pub min_gap_between_critical: usize,
}

impl Default for StatisticalFilterConfig {
    fn default() -> Self {
        Self {
            compression_ratio: 0.5,
            idf_weight: 0.3,
            position_weight: 0.2,
            pos_weight: 0.2,
            entity_weight: 0.2,
            entropy_weight: 0.1,
            enable_protection_masks: true,
            enable_contextual_stopwords: true,
            preserve_negations: true,
            preserve_comparators: true,
            domain_terms: vec![
                "Vectorizer".to_string(),
                "Synap".to_string(),
                "UMICP".to_string(),
                "Graphs".to_string(),
            ],
            min_gap_between_critical: 3,
        }
    }
}

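/// Statistical, training-free text compressor. It scores every word with a
/// weighted mix of frequency, position, part-of-speech, entity, and local
/// lexical-diversity heuristics, then keeps the highest-scoring fraction of
/// words plus everything inside protected spans.
///
/// A minimal usage sketch (marked `ignore` because the crate-level import
/// path is assumed, not confirmed by this file):
///
/// ```ignore
/// let filter = StatisticalFilter::new(StatisticalFilterConfig {
///     compression_ratio: 0.4,
///     ..Default::default()
/// });
/// let short = filter.compress("Explain how to configure the service in src/app.rs");
/// ```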
#[derive(Debug)]
pub struct StatisticalFilter {
    config: StatisticalFilterConfig,
}

impl StatisticalFilter {
    /// Creates a filter with the given configuration.
    pub fn new(config: StatisticalFilterConfig) -> Self {
        Self { config }
    }
}

impl Default for StatisticalFilter {
    fn default() -> Self {
        Self::new(StatisticalFilterConfig::default())
    }
}

impl StatisticalFilter {
    /// Scans `text` for balanced `{...}` / `[...]` regions that look like
    /// JSON and returns their byte ranges. Objects must contain at least one
    /// `:` outside a string to qualify; arrays always qualify. Byte offsets
    /// are used throughout so the results line up with regex match ranges.
    fn detect_json_spans(text: &str) -> Vec<(usize, usize)> {
        let mut spans = Vec::new();
        let chars: Vec<(usize, char)> = text.char_indices().collect();
        let mut i = 0;

        while i < chars.len() {
            if chars[i].1 == '{' || chars[i].1 == '[' {
                let opening = chars[i].1;
                let closing = if opening == '{' { '}' } else { ']' };
                let start = chars[i].0;
                let mut depth = 1;
                let mut in_string = false;
                let mut escape_next = false;
                let mut has_colon = false;
                i += 1;

                while i < chars.len() && depth > 0 {
                    if escape_next {
                        escape_next = false;
                        i += 1;
                        continue;
                    }

                    let (byte_pos, c) = chars[i];
                    match c {
                        '\\' if in_string => escape_next = true,
                        '"' => in_string = !in_string,
                        ':' if !in_string => has_colon = true,
                        c if c == opening && !in_string => depth += 1,
                        c if c == closing && !in_string => {
                            depth -= 1;
                            if depth == 0 && (opening == '[' || has_colon) {
                                spans.push((start, byte_pos + c.len_utf8()));
                            }
                        }
                        _ => {}
                    }
                    i += 1;
                }
            } else {
                i += 1;
            }
        }

        spans
    }

    /// Collects all spans that must survive compression verbatim.
    fn detect_protected_spans(&self, text: &str) -> Vec<ProtectedSpan> {
        if !self.config.enable_protection_masks {
            return Vec::new();
        }

        let mut spans = Vec::new();

        // Fenced code blocks: ``` ... ```
        static CODE_BLOCK_RE: OnceLock<Regex> = OnceLock::new();
        let code_re = CODE_BLOCK_RE.get_or_init(|| Regex::new(r"```[\s\S]*?```").unwrap());
        for mat in code_re.find_iter(text) {
            spans.push(ProtectedSpan {
                start: mat.start(),
                end: mat.end(),
                _span_type: SpanType::CodeBlock,
            });
        }

        // Balanced JSON objects and arrays.
        let json_spans = Self::detect_json_spans(text);
        for (start, end) in json_spans {
            spans.push(ProtectedSpan {
                start,
                end,
                _span_type: SpanType::JsonBlock,
            });
        }

        // URLs and file-system paths.
        static PATH_RE: OnceLock<Regex> = OnceLock::new();
        let path_re = PATH_RE.get_or_init(|| {
            Regex::new(r"(?:[A-Za-z]+:)?//[^\s]+|[/\\][\w/\\.-]+\.[A-Za-z0-9]{1,5}\b").unwrap()
        });
        for mat in path_re.find_iter(text) {
            spans.push(ProtectedSpan {
                start: mat.start(),
                end: mat.end(),
                _span_type: SpanType::Path,
            });
        }

        // CamelCase identifiers.
        static CAMEL_RE: OnceLock<Regex> = OnceLock::new();
        let camel_re =
            CAMEL_RE.get_or_init(|| Regex::new(r"\b[A-Z][a-z0-9]+[A-Z][A-Za-z0-9]+\b").unwrap());
        for mat in camel_re.find_iter(text) {
            spans.push(ProtectedSpan {
                start: mat.start(),
                end: mat.end(),
                _span_type: SpanType::Identifier,
            });
        }

        // snake_case identifiers (must actually contain an underscore).
        static SNAKE_RE: OnceLock<Regex> = OnceLock::new();
        let snake_re = SNAKE_RE.get_or_init(|| Regex::new(r"\b[a-z_][a-z0-9_]{2,}\b").unwrap());
        for mat in snake_re.find_iter(text) {
            if mat.as_str().contains('_') {
                spans.push(ProtectedSpan {
                    start: mat.start(),
                    end: mat.end(),
                    _span_type: SpanType::Identifier,
                });
            }
        }

        // UPPER_SNAKE constants and acronyms.
        static UPPER_SNAKE_RE: OnceLock<Regex> = OnceLock::new();
        let upper_snake_re =
            UPPER_SNAKE_RE.get_or_init(|| Regex::new(r"\b[A-Z][A-Z0-9_]+\b").unwrap());
        for mat in upper_snake_re.find_iter(text) {
            if mat.as_str().len() > 1 {
                spans.push(ProtectedSpan {
                    start: mat.start(),
                    end: mat.end(),
                    _span_type: SpanType::Identifier,
                });
            }
        }

        // Hex hashes (7+ hex chars) and numbers with 3+ digits.
        static HASH_RE: OnceLock<Regex> = OnceLock::new();
        let hash_re = HASH_RE.get_or_init(|| Regex::new(r"\b[0-9a-f]{7,}\b|\b\d{3,}\b").unwrap());
        for mat in hash_re.find_iter(text) {
            spans.push(ProtectedSpan {
                start: mat.start(),
                end: mat.end(),
                _span_type: SpanType::HashOrNumber,
            });
        }

        // Short bracketed groups: {...}, [...], (...).
        static BRACKET_RE: OnceLock<Regex> = OnceLock::new();
        let bracket_re =
            BRACKET_RE.get_or_init(|| Regex::new(r"[\{\[\(][^\}\]\)]*[\}\]\)]").unwrap());
        for mat in bracket_re.find_iter(text) {
            spans.push(ProtectedSpan {
                start: mat.start(),
                end: mat.end(),
                _span_type: SpanType::Bracket,
            });
        }

        spans
    }

    /// Returns true if the word's byte range overlaps any protected span.
    fn is_word_protected(
        &self,
        word_start: usize,
        word_end: usize,
        protected: &[ProtectedSpan],
    ) -> bool {
        protected
            .iter()
            .any(|span| word_start < span.end && word_end > span.start)
    }

    /// Decides whether a stopword is load-bearing in its context and should
    /// survive compression anyway.
    fn should_preserve_stopword(
        &self,
        word: &str,
        context_before: &[&str],
        context_after: &[&str],
    ) -> bool {
        if !self.config.enable_contextual_stopwords {
            return false;
        }

        let word_lower = word.to_lowercase();

        // "to" after "how", "steps", "need", ... introduces an instruction.
        if word_lower == "to" {
            if let Some(&prev) = context_before.last() {
                let prev_lower = prev.to_lowercase();
                if ["how", "steps", "need", "want", "try", "used", "able"]
                    .contains(&prev_lower.as_str())
                {
                    return true;
                }
            }
        }

        // "in"/"on"/"at" before a path, file name, or identifier is locative.
        if ["in", "on", "at"].contains(&word_lower.as_str()) {
            if let Some(&next) = context_after.first() {
                if next.contains('/') || next.contains('\\') || next.contains('.') {
                    return true;
                }
                if next.chars().next().is_some_and(|c| c.is_uppercase()) || next.contains('_') {
                    return true;
                }
            }
        }

        // A copula after a proper noun or identifier usually states a fact
        // worth keeping.
        if ["is", "are", "was", "were", "be"].contains(&word_lower.as_str()) {
            if let Some(&prev) = context_before.last() {
                if prev.chars().next().is_some_and(|c| c.is_uppercase())
                    || prev.len() > 6
                    || prev.contains('_')
                {
                    return true;
                }
            }
        }

        // "and"/"or" joining two important words is itself meaningful.
        if ["and", "or"].contains(&word_lower.as_str()) {
            let prev_important = context_before.last().is_some_and(|&prev| {
                prev.chars().next().is_some_and(|c| c.is_uppercase()) || prev.len() > 6
            });
            let next_important = context_after.first().is_some_and(|&next| {
                next.chars().next().is_some_and(|c| c.is_uppercase()) || next.len() > 6
            });
            if prev_important && next_important {
                return true;
            }
        }

        false
    }

    /// Returns a score override for words that must (or strongly should) be
    /// kept regardless of their statistical score.
    fn is_critical_term(&self, word: &str) -> Option<f64> {
        let word_lower = word.to_lowercase();

        // Configured domain terms are always kept.
        for domain_term in &self.config.domain_terms {
            if word.eq_ignore_ascii_case(domain_term) {
                return Some(f64::INFINITY);
            }
        }

        if self.config.preserve_negations {
            const NEGATIONS: &[&str] = &[
                "not", "no", "never", "don't", "won't", "can't", "couldn't",
                "wouldn't", "shouldn't", "mustn't", "haven't", "hasn't", "hadn't",
                "isn't", "aren't", "wasn't", "weren't", "neither", "nor", "none",
            ];
            if NEGATIONS.contains(&word_lower.as_str()) {
                return Some(10.0);
            }
        }

        if self.config.preserve_comparators {
            const COMPARATORS: &[&str] = &["!=", "!==", "<=", ">=", "<", ">", "==", "===", "!"];
            if COMPARATORS.contains(&word) {
                return Some(10.0);
            }
        }

        // Modal and restrictive words carry a lot of meaning per token.
        const MODALS: &[&str] = &[
            "only", "except", "must", "should", "may", "might", "at", "least", "most",
        ];
        if MODALS.contains(&word_lower.as_str()) {
            return Some(5.0);
        }

        None
    }

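    /// Scores every whitespace-delimited word in `text`. Words inside
    /// protected spans and exact domain terms get `f64::INFINITY`; negations,
    /// comparators, and modals get fixed boosts; everything else gets the
    /// weighted sum of the five heuristic components.
    ///
    /// Shape of the result, as a hedged sketch (`ignore`d since absolute
    /// scores depend on the configured weights):
    ///
    /// ```ignore
    /// let scores = StatisticalFilter::default().score_words("not all heroes");
    /// assert_eq!(scores.len(), 3);
    /// assert!(scores[0].score >= 10.0); // "not" is a preserved negation
    /// ```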
    pub fn score_words(&self, text: &str) -> Vec<WordImportance> {
        let words: Vec<&str> = text.split_whitespace().collect();

        if words.is_empty() {
            return Vec::new();
        }

        let protected_spans = self.detect_protected_spans(text);

        // Byte range of each word, for overlap checks against the byte-based
        // protected spans.
        let word_positions: Vec<(usize, usize)> = {
            let mut positions = Vec::new();
            let mut byte_idx = 0;

            for word in &words {
                // Only whitespace separates words, so the first match at or
                // after `byte_idx` is this word's actual occurrence.
                let start = match text[byte_idx..].find(*word) {
                    Some(rel) => byte_idx + rel,
                    None => byte_idx, // unreachable: every word comes from `text`
                };
                let end = start + word.len();
                positions.push((start, end));
                byte_idx = end;
            }
            positions
        };

        let idf_scores = self.calculate_idf(&words);
        let position_scores = self.calculate_position_importance(&words);
        let pos_scores = self.calculate_pos_importance(&words, &protected_spans, text);
        let entity_scores = self.calculate_entity_importance(&words);
        let entropy_scores = self.calculate_local_entropy(&words);

        words
            .iter()
            .enumerate()
            .map(|(idx, word)| {
                // Critical terms and protected spans short-circuit the
                // weighted scoring.
                let final_score = if let Some(critical_score) = self.is_critical_term(word) {
                    critical_score
                } else {
                    let (start, end) = word_positions[idx];
                    let is_protected = self.is_word_protected(start, end, &protected_spans);

                    if is_protected {
                        f64::INFINITY
                    } else {
                        let idf = idf_scores.get(*word).copied().unwrap_or(0.0);
                        let pos_score = position_scores[idx];
                        let pos_tag_score = pos_scores[idx];
                        let entity_score = entity_scores[idx];
                        let entropy = entropy_scores[idx];

                        idf * self.config.idf_weight as f64
                            + pos_score * self.config.position_weight as f64
                            + pos_tag_score * self.config.pos_weight as f64
                            + entity_score * self.config.entity_weight as f64
                            + entropy * self.config.entropy_weight as f64
                    }
                };

                WordImportance {
                    position: idx,
                    text: word.to_string(),
                    score: final_score,
                }
            })
            .collect()
    }

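    /// Compresses `text` down to roughly `compression_ratio` of its words:
    /// protected words are always kept, the best-scoring regular words fill
    /// the remaining budget, and large gaps between kept critical tokens are
    /// bridged so the output stays readable.
    ///
    /// A hedged sketch of typical behavior (exact output depends on weights):
    ///
    /// ```ignore
    /// let out = StatisticalFilter::default()
    ///     .compress("Please check the file in src/main.rs for more details");
    /// assert!(out.contains("src/main.rs")); // paths are protected spans
    /// ```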
    pub fn compress(&self, text: &str) -> String {
        let importances = self.score_words(text);

        if importances.is_empty() {
            return text.to_string();
        }

        // Words inside protected spans (score == INFINITY) are always kept.
        let protected_indices: Vec<usize> = importances
            .iter()
            .filter(|imp| imp.score.is_infinite())
            .map(|imp| imp.position)
            .collect();

        let mut regular_words: Vec<_> = importances
            .iter()
            .filter(|imp| !imp.score.is_infinite())
            .cloned()
            .collect();

        // Highest-scoring first.
        regular_words.sort_by(|a, b| {
            b.score
                .partial_cmp(&a.score)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        let total_regular = regular_words.len();
        let keep_regular_count = if total_regular > 0 {
            let target_total = (importances.len() as f32 * self.config.compression_ratio) as usize;
            target_total
                .saturating_sub(protected_indices.len())
                .max(1)
                .min(total_regular)
        } else {
            0
        };

        let mut keep_indices: Vec<usize> = regular_words[..keep_regular_count]
            .iter()
            .map(|imp| imp.position)
            .collect();

        keep_indices.extend(&protected_indices);

        // Bridge overly large gaps between kept critical tokens with the
        // best-scoring word inside each gap, so the output stays readable.
        let critical_threshold = 0.8;
        let mut critical_positions: Vec<usize> = regular_words
            .iter()
            .filter(|imp| imp.score > critical_threshold && keep_indices.contains(&imp.position))
            .map(|imp| imp.position)
            .collect();

        critical_positions.extend(&protected_indices);
        critical_positions.sort_unstable();

        for window in critical_positions.windows(2) {
            if window[1] > window[0] {
                let gap_size = window[1] - window[0];
                if gap_size > self.config.min_gap_between_critical {
                    let gap_candidates: Vec<_> = regular_words
                        .iter()
                        .filter(|imp| {
                            imp.position > window[0]
                                && imp.position < window[1]
                                && !keep_indices.contains(&imp.position)
                        })
                        .collect();

                    if let Some(best_gap_token) = gap_candidates.iter().max_by(|a, b| {
                        a.score
                            .partial_cmp(&b.score)
                            .unwrap_or(std::cmp::Ordering::Equal)
                    }) {
                        keep_indices.push(best_gap_token.position);
                    }
                }
            }
        }

        // Re-emit kept words in their original order.
        keep_indices.sort_unstable();

        let words: Vec<&str> = text.split_whitespace().collect();
        keep_indices
            .iter()
            .map(|&idx| words[idx])
            .collect::<Vec<_>>()
            .join(" ")
    }

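    /// Runs [`Self::compress`] and wraps the result in a [`CompressionResult`]
    /// with token accounting. For [`OutputFormat::Image`] the compressed text
    /// is additionally rendered to PNG when the `image` feature is enabled.
    ///
    /// Sketch (`ignore`d; it assumes `OutputFormat` has a plain-text variant,
    /// which this file does not itself confirm):
    ///
    /// ```ignore
    /// let result = StatisticalFilter::default()
    ///     .compress_with_format("some long input text here", OutputFormat::Text)?;
    /// assert!(result.compressed_tokens <= result.original_tokens);
    /// ```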
    pub fn compress_with_format(
        &self,
        text: &str,
        format: OutputFormat,
    ) -> Result<CompressionResult, Box<dyn std::error::Error>> {
        let compressed = self.compress(text);

        let original_tokens = text.split_whitespace().count();
        let compressed_tokens = compressed.split_whitespace().count();
        let compression_ratio = if original_tokens > 0 {
            compressed_tokens as f32 / original_tokens as f32
        } else {
            1.0
        };
        let tokens_removed = original_tokens.saturating_sub(compressed_tokens);

        // Render a PNG only when image output is requested and the `image`
        // feature is compiled in; otherwise there is no image payload.
        let image_data = if format == OutputFormat::Image {
            #[cfg(feature = "image")]
            {
                let renderer = ImageRenderer::new(ImageRendererConfig::default());
                Some(renderer.render_to_png(&compressed)?)
            }
            #[cfg(not(feature = "image"))]
            {
                None
            }
        } else {
            None
        };

        Ok(CompressionResult {
            compressed,
            image_data,
            format,
            original_tokens,
            compressed_tokens,
            compression_ratio,
            tokens_removed,
        })
    }

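    /// Per-word inverse frequency within this single text:
    /// `ln(total_words / count(word))`. Rare words score near `ln(total)`,
    /// repeated words near zero; e.g. in a 100-word text a word used once
    /// scores `ln(100) ≈ 4.61` while a word used 50 times scores `ln(2) ≈ 0.69`.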
    fn calculate_idf<'a>(&self, words: &[&'a str]) -> HashMap<&'a str, f64> {
        let mut freq_map: HashMap<&'a str, usize> = HashMap::new();
        for &word in words {
            *freq_map.entry(word).or_insert(0) += 1;
        }

        let total = words.len() as f64;
        freq_map
            .iter()
            .map(|(&word, &count)| (word, (total / count as f64).ln()))
            .collect()
    }

    /// Position prior: words in the first or last 10% of the text score 1.0,
    /// the next 10% band on either side scores 0.7, and the middle 0.3.
    fn calculate_position_importance(&self, words: &[&str]) -> Vec<f64> {
        let len = words.len();
        (0..len)
            .map(|idx| {
                let normalized = idx as f64 / len as f64;
                if !(0.1..=0.9).contains(&normalized) {
                    1.0
                } else if !(0.2..=0.8).contains(&normalized) {
                    0.7
                } else {
                    0.3
                }
            })
            .collect()
    }

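    /// Crude part-of-speech proxy: multilingual stopwords score 0.1 (or 0.7
    /// when contextually preserved), capitalized words 1.0, words longer than
    /// six characters 0.7, and everything else 0.5.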
    fn calculate_pos_importance(
        &self,
        words: &[&str],
        _protected_spans: &[ProtectedSpan],
        _text: &str,
    ) -> Vec<f64> {
        const STOP_WORDS: &[&str] = &[
            // English
            "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
            "of", "with", "by", "from", "as", "is", "was", "are", "were", "be",
            "been", "being", "have", "has", "had", "do", "does", "did", "will",
            "would", "should", "could", "may", "might", "must", "can", "shall",
            "this", "that", "these", "those", "i", "you", "he", "she", "it",
            "we", "they", "what", "which", "who", "when", "where", "why", "how",
            // Spanish
            "el", "la", "los", "las", "un", "una", "unos", "unas", "y", "o",
            "pero", "en", "de", "del", "al", "para", "por", "con", "sin",
            "sobre", "entre", "hasta", "desde", "es", "son", "está", "están",
            "ser", "estar", "haber", "hacer", "tener", "decir", "ir", "ver",
            "dar", "saber", "querer", "poder", "poner", "este", "ese", "aquel",
            "mi", "tu", "su", "nuestro", "vuestro", "que", "quien", "cual",
            "cuando", "donde", "como",
            // Portuguese
            "o", "a", "os", "as", "um", "uma", "uns", "umas", "e", "ou", "mas",
            "em", "de", "do", "da", "dos", "das", "no", "na", "nos", "nas",
            "ao", "à", "aos", "às", "para", "por", "com", "sem", "sobre",
            "entre", "até", "desde", "é", "são", "está", "estão", "ser",
            "estar", "haver", "ter", "fazer", "dizer", "ir", "ver", "dar",
            "saber", "querer", "poder", "pôr", "este", "esse", "aquele", "meu",
            "teu", "seu", "nosso", "vosso", "que", "quem", "qual", "quando",
            "onde", "como",
            // French
            "le", "la", "les", "un", "une", "des", "et", "ou", "mais", "dans",
            "en", "de", "du", "au", "aux", "pour", "par", "avec", "sans",
            "sur", "sous", "entre", "vers", "chez", "est", "sont", "être",
            "avoir", "faire", "dire", "aller", "voir", "savoir", "pouvoir",
            "vouloir", "venir", "devoir", "prendre", "ce", "cet", "cette",
            "ces", "mon", "ton", "son", "notre", "votre", "leur", "que",
            "qui", "quoi", "dont", "où", "quand", "comment",
            // German
            "der", "die", "das", "den", "dem", "des", "ein", "eine", "einer",
            "eines", "einem", "einen", "und", "oder", "aber", "in", "im",
            "an", "auf", "für", "von", "zu", "mit", "bei", "nach", "über",
            "unter", "ist", "sind", "war", "waren", "sein", "haben", "werden",
            "können", "müssen", "sollen", "wollen", "dieser", "jener", "mein",
            "dein", "sein", "unser", "euer", "ihr", "was", "wer", "wo",
            "wann", "wie", "warum",
            // Italian
            "il", "lo", "l", "i", "gli", "la", "le", "un", "uno", "una", "e",
            "o", "ma", "in", "di", "del", "dello", "della", "dei", "degli",
            "delle", "al", "allo", "alla", "ai", "agli", "alle", "per", "da",
            "dal", "dallo", "dalla", "dai", "dagli", "dalle", "con", "su",
            "sul", "sullo", "sulla", "sui", "sugli", "sulle", "è", "sono",
            "essere", "avere", "fare", "dire", "andare", "vedere", "sapere",
            "potere", "volere", "questo", "quello", "mio", "tuo", "suo",
            "nostro", "vostro", "loro", "che", "chi", "quale", "quando",
            "dove", "come", "perché",
            // Russian (transliterated)
            "i", "v", "ne", "na", "ya", "on", "s", "eto", "kak", "po", "no",
            "oni", "vse", "tak", "ego", "za", "byl", "bylo", "tem", "chto",
            "eto", "esli", "mogu", "mozhet", "by",
            // Chinese
            "的", "了", "和", "是", "在", "我", "有", "他", "这", "中", "大",
            "来", "上", "国", "个", "到", "说", "们", "为", "子", "中", "你",
            "地", "出", "道", "也", "时", "年",
            // Japanese
            "は", "が", "を", "に", "で", "と", "の", "も", "や", "から",
            "まで", "より", "か", "な", "ね", "よ", "わ", "さ", "だ", "です",
            "ます", "ある", "いる", "する", "なる", "これ", "それ", "あれ",
            "この", "その", "あの", "ここ", "そこ", "あそこ",
            // Arabic (transliterated)
            "al", "wa", "fi", "min", "ila", "an", "ma", "la", "li", "bi",
            "qad", "lam", "kan", "fi", "ala", "hatha", "dhalika", "huwa",
            "hiya", "hum",
            // Hindi (transliterated)
            "ka", "ki", "ke", "se", "ne", "ko", "me", "par", "hai", "tha",
            "the", "thi", "aur", "ya", "to", "is", "wo", "ye", "kya",
            "kaise", "kab", "kahan", "kyun",
        ];

        words
            .iter()
            .enumerate()
            .map(|(idx, word)| {
                let lower = word.to_lowercase();

                if STOP_WORDS.contains(&lower.as_str()) {
                    // Look at up to three words on each side before deciding
                    // whether this stopword is actually load-bearing.
                    let context_before: Vec<&str> = if idx > 0 {
                        words[..idx].iter().rev().take(3).rev().copied().collect()
                    } else {
                        Vec::new()
                    };

                    let context_after: Vec<&str> = if idx + 1 < words.len() {
                        words[idx + 1..].iter().take(3).copied().collect()
                    } else {
                        Vec::new()
                    };

                    if self.should_preserve_stopword(word, &context_before, &context_after) {
                        0.7 // contextually meaningful stopword
                    } else {
                        0.1 // ordinary stopword
                    }
                } else if word.chars().next().is_some_and(|c| c.is_uppercase()) {
                    1.0 // likely proper noun or sentence start
                } else if word.len() > 6 {
                    0.7 // longer words tend to be content words
                } else {
                    0.5 // default
                }
            })
            .collect()
    }

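    /// Lightweight named-entity heuristic: capitalization adds 0.3, a
    /// preceding honorific ("mr.", "dr.") 0.5, emails and URLs 0.6, and
    /// all-caps acronyms 0.4, capped at 1.0.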
    fn calculate_entity_importance(&self, words: &[&str]) -> Vec<f64> {
        words
            .iter()
            .enumerate()
            .map(|(idx, word)| {
                let mut score: f64 = 0.0;
                // Capitalized words are likely proper nouns.
                if word.chars().next().is_some_and(|c| c.is_uppercase()) {
                    score += 0.3;
                }
                // Honorific immediately before the word.
                if idx > 0 {
                    let prev = words[idx - 1].to_lowercase();
                    if prev.starts_with("mr.") || prev.starts_with("dr.") {
                        score += 0.5;
                    }
                }
                // Emails and URLs.
                if word.contains('@') || word.starts_with("http") {
                    score += 0.6;
                }
                // All-caps acronyms.
                if word.len() > 1 && word.chars().all(|c| c.is_uppercase()) {
                    score += 0.4;
                }
                score.min(1.0)
            })
            .collect()
    }

    /// Lexical diversity around each word: the fraction of unique words in a
    /// 10-word window, a cheap stand-in for local entropy. For the window
    /// ["a", "b", "a", "c"] every word in it scores 3/4 = 0.75.
    fn calculate_local_entropy(&self, words: &[&str]) -> Vec<f64> {
        const WINDOW: usize = 10;
        (0..words.len())
            .map(|idx| {
                let start = idx.saturating_sub(WINDOW / 2);
                let end = (idx + WINDOW / 2).min(words.len());
                let window = &words[start..end];
                let unique: std::collections::HashSet<_> = window.iter().collect();
                unique.len() as f64 / window.len() as f64
            })
            .collect()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_compression() {
        let config = StatisticalFilterConfig {
            compression_ratio: 0.5,
            ..Default::default()
        };
        let filter = StatisticalFilter::new(config);

        let text = "The quick brown fox jumps over the lazy dog";
        let compressed = filter.compress(text);

        let original_words = text.split_whitespace().count();
        let compressed_words = compressed.split_whitespace().count();

        assert!(compressed_words <= original_words);
        assert!(!compressed.is_empty());
    }

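    // Added sanity check: `score_words` returns one entry per whitespace
    // token, in input order. (Not part of the original suite; absolute
    // scores are config-dependent, so only the structure is asserted.)
    #[test]
    fn test_score_words_one_entry_per_token() {
        let filter = StatisticalFilter::default();
        let scores = filter.score_words("alpha beta gamma");

        assert_eq!(scores.len(), 3);
        assert_eq!(scores[0].text, "alpha");
        assert_eq!(scores[2].position, 2);
    }
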
    #[test]
    fn test_code_block_protection() {
        let config = StatisticalFilterConfig {
            compression_ratio: 0.3,
            ..Default::default()
        };
        let filter = StatisticalFilter::new(config);

        let text = "Here is some code ```rust fn main() { println!(\"Hello\"); }``` that should be preserved";
        let compressed = filter.compress(text);

        assert!(
            compressed.contains("```rust") || compressed.contains("println!"),
            "Expected code block to be preserved, got: {}",
            compressed
        );
    }

    #[test]
    fn test_json_protection() {
        let config = StatisticalFilterConfig {
            compression_ratio: 0.3,
            ..Default::default()
        };
        let filter = StatisticalFilter::new(config);

        let text = "The config is {\"key\": \"value\"} and it should remain intact";
        let compressed = filter.compress(text);

        assert!(
            compressed.contains("{\"key\":") || compressed.contains("\"key\""),
            "Expected JSON to be preserved, got: {}",
            compressed
        );
    }

    #[test]
    fn test_nested_json_protection() {
        let config = StatisticalFilterConfig {
            compression_ratio: 0.2,
            ..Default::default()
        };
        let filter = StatisticalFilter::new(config);

        let text = "Here is a nested JSON {\"user\": {\"name\": \"John\", \"age\": 30}, \"active\": true} that must be kept";
        let compressed = filter.compress(text);

        assert!(
            compressed.contains("{\"user\":")
                || compressed.contains("\"name\"")
                || compressed.contains("John"),
            "Expected nested JSON to be preserved, got: {}",
            compressed
        );
    }

    #[test]
    fn test_multiline_json_protection() {
        let config = StatisticalFilterConfig {
            compression_ratio: 0.2,
            ..Default::default()
        };
        let filter = StatisticalFilter::new(config);

        let text = r#"Configuration:
{
  "host": "localhost",
  "port": 8080,
  "options": {
    "debug": true
  }
}
End of config"#;
        let compressed = filter.compress(text);

        assert!(
            compressed.contains("\"host\"")
                || compressed.contains("localhost")
                || compressed.contains("8080"),
            "Expected multiline JSON to be preserved, got: {}",
            compressed
        );
    }

    #[test]
    fn test_json_array_protection() {
        let config = StatisticalFilterConfig {
            compression_ratio: 0.2,
            ..Default::default()
        };
        let filter = StatisticalFilter::new(config);

        let text = "The list is [1, 2, 3, 4, 5] and should be preserved";
        let compressed = filter.compress(text);

        assert!(
            compressed.contains("[1,") || compressed.contains("2") || compressed.contains("5"),
            "Expected JSON array to be preserved, got: {}",
            compressed
        );
    }

    #[test]
    fn test_path_preservation() {
        let config = StatisticalFilterConfig {
            compression_ratio: 0.4,
            ..Default::default()
        };
        let filter = StatisticalFilter::new(config);

        let text = "Check the file in src/main.rs for the implementation details";
        let compressed = filter.compress(text);

        assert!(
            compressed.contains("src/main.rs")
                || (compressed.contains("src") && compressed.contains("main.rs"))
        );
    }

    #[test]
    fn test_contextual_stopword_to() {
        let config = StatisticalFilterConfig {
            compression_ratio: 0.5,
            ..Default::default()
        };
        let filter = StatisticalFilter::new(config);

        // "to" after "how" introduces an instruction and should survive.
        let text1 = "how to reproduce the bug";
        let compressed1 = filter.compress(text1);
        assert!(compressed1.contains("to") || compressed1.contains("how"));

        // A plain "going to" carries no such signal; just ensure no panic.
        let text2 = "going to the store";
        let _compressed2 = filter.compress(text2);
    }

    #[test]
    fn test_negation_preservation() {
        let config = StatisticalFilterConfig {
            compression_ratio: 0.3,
            ..Default::default()
        };
        let filter = StatisticalFilter::new(config);

        let text = "do not remove this critical information";
        let compressed = filter.compress(text);

        assert!(compressed.contains("not"));
    }

    #[test]
    fn test_comparator_preservation() {
        let config = StatisticalFilterConfig {
            compression_ratio: 0.3,
            ..Default::default()
        };
        let filter = StatisticalFilter::new(config);

        let text = "check if x >= 5 before proceeding";
        let compressed = filter.compress(text);

        assert!(compressed.contains(">=") || compressed.contains("5") || compressed.contains("x"));
    }

    #[test]
    fn test_domain_terms_preservation() {
        let config = StatisticalFilterConfig {
            compression_ratio: 0.3,
            ..Default::default()
        };
        let filter = StatisticalFilter::new(config);

        let text = "use the Vectorizer tool to process data";
        let compressed = filter.compress(text);

        assert!(compressed.contains("Vectorizer"));
    }

    #[test]
    fn test_identifier_protection() {
        let config = StatisticalFilterConfig {
            compression_ratio: 0.3,
            ..Default::default()
        };
        let filter = StatisticalFilter::new(config);

        let text = "call the getUserData function from user_service module";
        let compressed = filter.compress(text);

        assert!(compressed.contains("getUserData") || compressed.contains("user_service"));
    }

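    // Added sanity check: configured domain terms are scored as infinite so
    // they can never be dropped. (Not part of the original suite.)
    #[test]
    fn test_domain_term_scores_infinite() {
        let filter = StatisticalFilter::default();
        let scores = filter.score_words("use Vectorizer to process data");

        let term = scores.iter().find(|w| w.text == "Vectorizer").unwrap();
        assert!(term.score.is_infinite());
    }
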
    #[test]
    fn test_gap_filling_between_critical_tokens() {
        let config = StatisticalFilterConfig {
            compression_ratio: 0.2,
            min_gap_between_critical: 2,
            ..Default::default()
        };
        let filter = StatisticalFilter::new(config);

        let text = "Vectorizer is a critical component that handles data processing for Synap";
        let compressed = filter.compress(text);

        assert!(
            compressed.contains("Vectorizer"),
            "Expected 'Vectorizer' in output: {}",
            compressed
        );
        assert!(
            compressed.contains("Synap"),
            "Expected 'Synap' in output: {}",
            compressed
        );

        // Gap filling should bridge the distance between the two domain terms.
        let words: Vec<&str> = compressed.split_whitespace().collect();
        assert!(
            words.len() >= 3,
            "Expected at least 3 words, got: {}",
            words.len()
        );
    }

    #[test]
    fn test_protection_masks_can_be_disabled() {
        let config = StatisticalFilterConfig {
            compression_ratio: 0.3,
            enable_protection_masks: false,
            ..Default::default()
        };
        let filter = StatisticalFilter::new(config);

        // With masks disabled the path gets no special treatment; the call
        // just must not panic.
        let text = "Check src/main.rs for details";
        let _compressed = filter.compress(text);
    }

    #[test]
    fn test_contextual_stopwords_can_be_disabled() {
        let config = StatisticalFilterConfig {
            compression_ratio: 0.5,
            enable_contextual_stopwords: false,
            ..Default::default()
        };
        let filter = StatisticalFilter::new(config);

        // With contextual stopwords disabled this must still compress
        // without panicking.
        let text = "how to reproduce the issue";
        let _compressed = filter.compress(text);
    }
}