1use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
23
24use crate::ir::{DocNode, FidelityLevel};
25
/// Inputs that drive how aggressively a document is compressed.
#[derive(Debug, Clone)]
pub struct CompressionConfig {
    /// Total token budget available for the output.
    pub budget: usize,
    /// Tokens already consumed against the budget.
    pub current_tokens: usize,
    /// Requested fidelity; `Lossless` disables compression entirely.
    pub fidelity: FidelityLevel,
}
40
41impl CompressionConfig {
42 pub fn usage_ratio(&self) -> f64 {
44 if self.budget == 0 {
45 return 1.0;
46 }
47 self.current_tokens as f64 / self.budget as f64
48 }
49
50 pub fn stage(&self) -> CompressionStage {
52 match self.usage_ratio() {
53 r if r < 0.60 => CompressionStage::StopwordOnly,
54 r if r < 0.80 => CompressionStage::PruneLowImportance,
55 r if r < 0.95 => CompressionStage::DeduplicateAndLinearize,
56 _ => CompressionStage::MaxCompression,
57 }
58 }
59
60 pub fn min_stage(&self) -> CompressionStage {
66 match self.fidelity {
67 FidelityLevel::Compressed => CompressionStage::PruneLowImportance,
68 _ => CompressionStage::StopwordOnly,
69 }
70 }
71}
72
/// Compression stages, ordered from least to most aggressive.
///
/// NOTE: the derived `Ord` relies on variant declaration order; the
/// `stage >= …` comparisons in `compress` depend on this ordering.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum CompressionStage {
    /// Only strip stopwords (budget usage below 60%).
    StopwordOnly,
    /// Also prune the lowest-importance paragraphs (usage below 80%).
    PruneLowImportance,
    /// Also remove duplicate paragraphs (usage below 95%).
    DeduplicateAndLinearize,
    /// Also truncate each paragraph to its first sentence (usage >= 95%).
    MaxCompression,
}
85
/// Stopword-stripping document compressor.
pub struct AdaptiveCompressor {
    /// Case-insensitive Aho-Corasick automaton over the ASCII stopwords;
    /// `None` when there are no ASCII stopwords (or the build failed).
    ascii_ac: Option<AhoCorasick>,
    /// Non-ASCII stopwords, matched later as whole whitespace-delimited tokens.
    nonascii_stopwords: Vec<String>,
}
99
impl Default for AdaptiveCompressor {
    /// Equivalent to [`AdaptiveCompressor::new`]: uses the built-in stopword list.
    fn default() -> Self {
        Self::new()
    }
}
105
impl AdaptiveCompressor {
    /// Build a compressor with the built-in English + Korean stopword list.
    pub fn new() -> Self {
        Self::with_stopwords(default_stopwords())
    }

    /// Build a compressor from an explicit stopword list.
    ///
    /// ASCII stopwords are lowercased and compiled into a case-insensitive
    /// Aho-Corasick automaton; non-ASCII stopwords are kept verbatim and
    /// matched later as whole whitespace-delimited tokens.
    pub fn with_stopwords(stopwords: Vec<String>) -> Self {
        let mut ascii_stopwords: Vec<String> = Vec::new();
        let mut nonascii_stopwords = Vec::new();

        for sw in &stopwords {
            if sw.is_ascii() {
                ascii_stopwords.push(sw.to_ascii_lowercase());
            } else {
                nonascii_stopwords.push(sw.clone());
            }
        }

        let ascii_ac = if ascii_stopwords.is_empty() {
            None
        } else {
            // `.ok()`: a failed automaton build silently degrades to
            // "no ASCII stopword removal" instead of propagating an error.
            AhoCorasickBuilder::new()
                .ascii_case_insensitive(true)
                .match_kind(MatchKind::LeftmostFirst)
                .build(&ascii_stopwords)
                .ok()
        };

        Self {
            ascii_ac,
            nonascii_stopwords,
        }
    }

    /// True when at least one stopword (ASCII or non-ASCII) is configured.
    pub fn has_stopwords(&self) -> bool {
        self.ascii_ac.is_some() || !self.nonascii_stopwords.is_empty()
    }

    /// Compress `nodes` according to the stage implied by `cfg`.
    ///
    /// `Lossless` fidelity returns the input untouched. Otherwise the
    /// effective stage is the stronger of the budget-derived stage and the
    /// fidelity-derived minimum; each stage includes all weaker ones.
    pub fn compress(&self, mut nodes: Vec<DocNode>, cfg: &CompressionConfig) -> Vec<DocNode> {
        if cfg.fidelity == FidelityLevel::Lossless {
            return nodes;
        }

        let stage = cfg.stage().max(cfg.min_stage());

        // Stopword removal applies at every stage.
        nodes = self.remove_stopwords(nodes);

        if stage >= CompressionStage::PruneLowImportance {
            // Drop roughly the bottom 20% of paragraphs by importance.
            nodes = prune_low_importance(nodes, 0.20);
        }

        if stage >= CompressionStage::DeduplicateAndLinearize {
            nodes = deduplicate_paras(nodes);
        }

        if stage >= CompressionStage::MaxCompression {
            nodes = truncate_to_first_sentence(nodes);
        }

        nodes
    }

    /// Strip stopwords from `Para` and `Header` text; other node kinds pass
    /// through unchanged.
    fn remove_stopwords(&self, nodes: Vec<DocNode>) -> Vec<DocNode> {
        if !self.has_stopwords() {
            return nodes;
        }
        nodes
            .into_iter()
            .map(|node| match node {
                DocNode::Para { text, importance } => DocNode::Para {
                    text: self.strip_stopwords(&text),
                    importance,
                },
                DocNode::Header { level, text } => DocNode::Header {
                    level,
                    text: self.strip_stopwords(&text),
                },
                other => other,
            })
            .collect()
    }

    /// Remove configured stopwords from `text` and normalize whitespace.
    ///
    /// Pass 1 (ASCII): scan with the Aho-Corasick automaton and splice out
    /// matches that sit on word boundaries, also swallowing the run of
    /// spaces/tabs after each removed word. Match offsets fall on ASCII
    /// bytes, so the `&text[..]` slices stay on char boundaries.
    /// Pass 2 (non-ASCII): re-tokenize on whitespace and drop exact token
    /// matches. This pass also collapses whitespace runs to single spaces
    /// (and trims the ends) even when there are no non-ASCII stopwords.
    fn strip_stopwords(&self, text: &str) -> String {
        let result: String = if let Some(ac) = &self.ascii_ac {
            let bytes = text.as_bytes();
            let mut out = String::with_capacity(text.len());
            // Start of the not-yet-copied tail of `text`.
            let mut last = 0usize;

            for mat in ac.find_iter(text) {
                let start = mat.start();
                let end = mat.end();

                // Only remove standalone words: the bytes adjacent to the
                // match must not be word bytes.
                let before_ok = start == 0 || !is_word_byte(bytes[start - 1]);
                let after_ok = end == bytes.len() || !is_word_byte(bytes[end]);

                if before_ok && after_ok {
                    out.push_str(&text[last..start]);
                    // Swallow spaces/tabs that followed the removed word so
                    // no double space is left behind.
                    let skip_end = skip_trailing_space(bytes, end);
                    last = skip_end;
                }
            }

            out.push_str(&text[last..]);
            out
        } else {
            text.to_string()
        };

        let mut out2 = String::with_capacity(result.len());
        if !self.nonascii_stopwords.is_empty() {
            for token in result.split_whitespace().filter(|token| {
                !self
                    .nonascii_stopwords
                    .iter()
                    .any(|sw| sw.as_str() == *token)
            }) {
                if !out2.is_empty() {
                    out2.push(' ');
                }
                out2.push_str(token);
            }
        } else {
            // No non-ASCII stopwords: still rejoin tokens to normalize
            // whitespace left over from pass 1.
            for token in result.split_whitespace() {
                if !out2.is_empty() {
                    out2.push(' ');
                }
                out2.push_str(token);
            }
        }

        out2
    }
}
282
/// True if `b` can be part of a word for stopword-boundary purposes.
///
/// Non-ASCII bytes (>= 0x80) are treated as word bytes: they belong to
/// multi-byte UTF-8 letters, so an ASCII stopword glued to one (e.g. the
/// "the" in "caféthe") is not a standalone word and must not be stripped.
/// The previous version only recognized ASCII word bytes, which allowed
/// stopword removal from inside mixed-script words.
#[inline]
fn is_word_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || b == b'_' || !b.is_ascii()
}
294
/// Advance `pos` past any run of ASCII spaces and tabs in `bytes`.
/// A `pos` at or past the end of the slice is returned unchanged.
#[inline]
fn skip_trailing_space(bytes: &[u8], pos: usize) -> usize {
    let skipped = bytes.get(pos..).map_or(0, |rest| {
        rest.iter()
            .take_while(|&&b| b == b' ' || b == b'\t')
            .count()
    });
    pos + skipped
}
308
309fn prune_low_importance(nodes: Vec<DocNode>, threshold: f32) -> Vec<DocNode> {
315 let para_importances: Vec<f32> = nodes
317 .iter()
318 .filter_map(|n| {
319 if let DocNode::Para { importance, .. } = n {
320 Some(*importance)
321 } else {
322 None
323 }
324 })
325 .collect();
326
327 if para_importances.len() <= 1 {
328 return nodes;
329 }
330
331 let mut sorted = para_importances.clone();
333 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
334 let cutoff_idx = ((sorted.len() as f32 * threshold) as usize).min(sorted.len() - 1);
335 let cutoff = sorted[cutoff_idx];
336
337 let filtered: Vec<DocNode> = nodes
338 .iter()
339 .filter(|n| {
340 if let DocNode::Para { importance, .. } = n {
341 *importance > cutoff
342 } else {
343 true }
345 })
346 .cloned()
347 .collect();
348
349 let filtered_has_para = filtered.iter().any(|n| matches!(n, DocNode::Para { .. }));
352 let input_had_para = nodes.iter().any(|n| matches!(n, DocNode::Para { .. }));
353
354 if input_had_para && !filtered_has_para {
355 nodes
356 } else {
357 filtered
358 }
359}
360
361fn deduplicate_paras(nodes: Vec<DocNode>) -> Vec<DocNode> {
363 use std::collections::HashSet;
364 let mut seen: HashSet<String> = HashSet::new();
365 nodes
366 .into_iter()
367 .filter(|n| {
368 if let DocNode::Para { text, .. } = n {
369 let mut normalized = String::with_capacity(text.len());
370 for token in text.split_whitespace() {
371 if !normalized.is_empty() { normalized.push(' '); }
372 normalized.push_str(token);
373 }
374 seen.insert(normalized)
375 } else {
376 true
377 }
378 })
379 .collect()
380}
381
382fn truncate_to_first_sentence(nodes: Vec<DocNode>) -> Vec<DocNode> {
384 nodes
385 .into_iter()
386 .map(|node| match node {
387 DocNode::Para { text, importance } => {
388 let first = first_sentence(&text);
389 DocNode::Para {
390 text: first,
391 importance,
392 }
393 }
394 other => other,
395 })
396 .collect()
397}
398
/// Return the prefix of `text` up to and including the first sentence
/// terminator (covering Latin, CJK, Devanagari, Arabic, Ethiopic and other
/// scripts), trimmed of surrounding whitespace. If no terminator is found
/// the whole trimmed text is returned.
fn first_sentence(text: &str) -> String {
    const TERMINATORS: &[char] = &[
        '.', '!', '?', '。', '!', '?', '।', '॥', '۔', '።', '᙮', '꓿', '︒', '﹒', '.',
    ];
    match text.find(TERMINATORS) {
        Some(idx) => {
            // `find` yields the byte offset of the terminator; include it.
            let term = text[idx..]
                .chars()
                .next()
                .expect("offset returned by find is on a char boundary");
            text[..idx + term.len_utf8()].trim().to_string()
        }
        None => text.trim().to_string(),
    }
}
420
/// Built-in stopword list: high-frequency English function words plus
/// common Korean connectives.
///
/// Fix vs. the previous version: "다만" was listed twice among the Korean
/// connectives; the duplicate is removed so the automaton is not built with
/// a redundant pattern and the list contains no duplicates.
fn default_stopwords() -> Vec<String> {
    let articles = ["a", "an", "the"];
    let conjunctions = ["and", "or", "but", "nor", "yet", "so", "for"];
    let prepositions = [
        "in", "on", "at", "to", "of", "by", "as", "up", "via", "into", "from", "with", "than",
        "about", "over", "after", "before", "between", "through", "during", "within", "without",
    ];
    let auxiliaries = [
        "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do", "does",
        "did", "will", "would", "shall", "should", "may", "might", "must", "can", "could",
    ];
    let pronouns = [
        "it", "its", "this", "that", "these", "those", "not", "no", "also", "too", "very", "just",
        "such",
    ];

    let korean_connectives = [
        "그리고",
        "하지만",
        "그러나",
        "따라서",
        "또한",
        "즉",
        "및",
        "또는",
        "그래서",
        "그런데",
        "게다가",
        "다만",
        "단지",
        "특히",
        "주로",
        "왜냐하면",
        "그러므로",
        "한편",
        "반면",
        "이처럼",
        "이렇게",
        "이에",
        "이후",
        "이전",
    ];

    articles
        .iter()
        .chain(conjunctions.iter())
        .chain(prepositions.iter())
        .chain(auxiliaries.iter())
        .chain(pronouns.iter())
        .map(|s| s.to_string())
        .chain(korean_connectives.iter().map(|s| s.to_string()))
        .collect()
}
500
#[cfg(test)]
mod tests {
    use super::*;

    /// Convenience constructor for a `Para` node.
    fn make_para(text: &str, importance: f32) -> DocNode {
        DocNode::Para {
            text: text.into(),
            importance,
        }
    }

    #[test]
    fn lossless_skips_all_compression() {
        let nodes = vec![make_para("the quick brown fox", 0.1)];
        let cfg = CompressionConfig {
            budget: 100,
            current_tokens: 99,
            fidelity: FidelityLevel::Lossless,
        };
        let compressor = AdaptiveCompressor::new();
        let result = compressor.compress(nodes.clone(), &cfg);
        // Lossless fidelity must leave the text byte-for-byte unchanged.
        if let (DocNode::Para { text: t1, .. }, DocNode::Para { text: t2, .. }) =
            (&nodes[0], &result[0])
        {
            assert_eq!(t1, t2);
        }
    }

    #[test]
    fn new_compressor_has_stopwords() {
        let compressor = AdaptiveCompressor::new();
        assert!(
            compressor.has_stopwords(),
            "default compressor must have a non-empty stopword list"
        );
    }

    #[test]
    fn empty_compressor_has_no_stopwords() {
        let compressor = AdaptiveCompressor::with_stopwords(vec![]);
        assert!(
            !compressor.has_stopwords(),
            "compressor built with empty list must report no stopwords"
        );
    }

    #[test]
    fn stopword_removal_ascii_works() {
        let compressor = AdaptiveCompressor::new();
        let nodes = vec![make_para("the quick brown fox", 1.0)];
        let cfg = CompressionConfig {
            budget: 1000,
            current_tokens: 100,
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        if let DocNode::Para { text, .. } = &result[0] {
            assert!(
                !text.to_lowercase().starts_with("the "),
                "stopword 'the' must be removed: got '{}'",
                text
            );
        }
    }

    #[test]
    fn with_stopwords_removes_specified_ascii_words() {
        let compressor = AdaptiveCompressor::with_stopwords(vec!["hello".into(), "world".into()]);
        let nodes = vec![make_para("hello world foo", 1.0)];
        let cfg = CompressionConfig {
            budget: 1000,
            current_tokens: 100,
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        if let DocNode::Para { text, .. } = &result[0] {
            assert!(
                !text.to_lowercase().contains("hello"),
                "'hello' must be removed: got '{}'",
                text
            );
            assert!(
                !text.to_lowercase().contains("world"),
                "'world' must be removed: got '{}'",
                text
            );
            assert!(text.contains("foo"), "'foo' must remain: got '{}'", text);
        }
    }

    #[test]
    fn nonascii_stopword_removal_works() {
        // Korean connectives go through the token-exact (non-ASCII) path.
        let compressor = AdaptiveCompressor::new();
        let nodes = vec![make_para("사과 그리고 바나나", 1.0)];
        let cfg = CompressionConfig {
            budget: 1000,
            current_tokens: 100,
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        if let DocNode::Para { text, .. } = &result[0] {
            assert!(
                !text.contains("그리고"),
                "Korean connective '그리고' must be removed: got '{}'",
                text
            );
            assert!(text.contains("사과"), "'사과' must remain: got '{}'", text);
            assert!(
                text.contains("바나나"),
                "'바나나' must remain: got '{}'",
                text
            );
        }
    }

    #[test]
    fn nonascii_stopword_partial_match_not_removed() {
        // Non-ASCII matching is exact-token, so a prefix inside a longer
        // word must survive.
        let compressor = AdaptiveCompressor::with_stopwords(vec!["그리고".into()]);
        let nodes = vec![make_para("그리고나서 확인", 1.0)];
        let cfg = CompressionConfig {
            budget: 1000,
            current_tokens: 100,
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        if let DocNode::Para { text, .. } = &result[0] {
            assert!(
                text.contains("그리고나서"),
                "'그리고나서' must NOT be removed (not an exact token): got '{}'",
                text
            );
        }
    }

    #[test]
    fn prune_low_importance_removes_bottom_20_pct() {
        let nodes = vec![
            make_para("중요 단락", 0.9),
            make_para("보통 단락", 0.5),
            make_para("낮은 단락", 0.1),
            make_para("낮은 단락2", 0.05),
            make_para("낮은 단락3", 0.02),
        ];
        let result = prune_low_importance(nodes, 0.20);
        assert!(result.len() < 5, "some nodes must be removed");
    }

    #[test]
    fn deduplicate_removes_duplicates() {
        let nodes = vec![
            make_para("동일한 내용입니다.", 1.0),
            make_para("다른 내용입니다.", 1.0),
            make_para("동일한 내용입니다.", 0.9),
        ];
        let result = deduplicate_paras(nodes);
        assert_eq!(result.len(), 2, "one duplicate paragraph must be removed");
    }

    #[test]
    fn first_sentence_extraction() {
        assert_eq!(first_sentence("안녕하세요. 반갑습니다."), "안녕하세요.");
        assert_eq!(
            first_sentence("문장 부호 없는 텍스트"),
            "문장 부호 없는 텍스트"
        );
        assert_eq!(first_sentence("Hello world! Bye."), "Hello world!");
    }

    #[test]
    fn first_sentence_multilingual() {
        // Devanagari danda, Arabic full stop, Ethiopic full stop, CJK period.
        assert_eq!(
            first_sentence("यह पहला वाक्य है। यह दूसरा है।"),
            "यह पहला वाक्य है।"
        );
        assert_eq!(
            first_sentence("هذه الجملة الأولى۔ هذه الثانية۔"),
            "هذه الجملة الأولى۔"
        );
        assert_eq!(
            first_sentence("ይህ የመጀመሪያ ዓረፍተ ነገር ነው። ሁለተኛ።"),
            "ይህ የመጀመሪያ ዓረፍተ ነገር ነው።"
        );
        assert_eq!(
            first_sentence("これが最初の文です.これが二番目です."),
            "これが最初の文です."
        );
    }

    #[test]
    fn prune_keeps_single_paragraph() {
        let compressor = AdaptiveCompressor::with_stopwords(vec![]);
        let nodes = vec![make_para("only paragraph", 0.1)];
        // 65/100 usage lands in the PruneLowImportance stage.
        let cfg = CompressionConfig {
            budget: 100,
            current_tokens: 65,
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        assert_eq!(
            result.len(),
            1,
            "the sole paragraph in a single-paragraph document must not be removed"
        );
    }

    #[test]
    fn prune_keeps_all_equal_importance_paragraphs() {
        let compressor = AdaptiveCompressor::with_stopwords(vec![]);
        let nodes = vec![
            make_para("first", 0.5),
            make_para("second", 0.5),
            make_para("third", 0.5),
        ];
        let cfg = CompressionConfig {
            budget: 100,
            current_tokens: 65,
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        assert_eq!(
            result.len(),
            3,
            "paragraphs with equal importance must not all be removed"
        );
    }

    #[test]
    fn ascii_stopword_respects_word_boundaries() {
        let compressor = AdaptiveCompressor::with_stopwords(vec!["the".into()]);
        let cfg = CompressionConfig {
            budget: 1000,
            current_tokens: 100,
            fidelity: FidelityLevel::Semantic,
        };

        // Standalone occurrence: removed.
        let nodes = vec![make_para("the cat sat", 1.0)];
        let result = compressor.compress(nodes, &cfg);
        if let DocNode::Para { text, .. } = &result[0] {
            assert!(
                !text.to_lowercase().starts_with("the "),
                "standalone 'the' at start must be removed: got '{}'",
                text
            );
            assert!(
                text.contains("cat") && text.contains("sat"),
                "non-stopword tokens must remain: got '{}'",
                text
            );
        }

        // Prefix of a longer word: kept.
        let nodes2 = vec![make_para("theory is important", 1.0)];
        let result2 = compressor.compress(nodes2, &cfg);
        if let DocNode::Para { text, .. } = &result2[0] {
            assert!(
                text.contains("theory"),
                "'theory' must not be modified by stopword 'the': got '{}'",
                text
            );
        }

        let nodes3 = vec![make_para("there are cats", 1.0)];
        let result3 = compressor.compress(nodes3, &cfg);
        if let DocNode::Para { text, .. } = &result3[0] {
            assert!(
                text.contains("there"),
                "'there' must not be modified by stopword 'the': got '{}'",
                text
            );
        }

        // Infix occurrence: kept.
        let nodes4 = vec![make_para("we gather here", 1.0)];
        let result4 = compressor.compress(nodes4, &cfg);
        if let DocNode::Para { text, .. } = &result4[0] {
            assert!(
                text.contains("gather"),
                "'gather' must not be modified by stopword 'the': got '{}'",
                text
            );
        }
    }

    #[test]
    fn stage_thresholds() {
        let base = CompressionConfig {
            budget: 100,
            current_tokens: 0,
            fidelity: FidelityLevel::Semantic,
        };
        let at = |tokens| CompressionConfig {
            current_tokens: tokens,
            ..base.clone()
        };

        assert_eq!(at(50).stage(), CompressionStage::StopwordOnly);
        assert_eq!(at(70).stage(), CompressionStage::PruneLowImportance);
        assert_eq!(at(85).stage(), CompressionStage::DeduplicateAndLinearize);
        assert_eq!(at(96).stage(), CompressionStage::MaxCompression);
    }
}