// llm_transpile/compressor.rs
//! compressor.rs — AdaptiveCompressor
//!
//! Automatically applies a four-stage compression strategy based on token budget usage.
//!
//! | Budget usage | Strategy applied                                          |
//! |-------------|-----------------------------------------------------------|
//! | 0–60%       | Stopword removal only                                     |
//! | 60–80%      | Stopwords + prune bottom-20% importance paragraphs        |
//! | 80–95%      | Above + deduplicate sentences + linearize numeric data    |
//! | 95%+        | Above + truncate all paragraphs to first sentence (Semantic+) |
//!
//! ## Stopword matching strategy
//!
//! - **ASCII stopwords**: indexed into a single [`AhoCorasick`] automaton (case-insensitive).
//!   Word-boundary semantics are enforced by checking the characters immediately before and
//!   after each match — the same contract as the previous `\b word \b` regex approach, but
//!   in a single O(N + M) pass instead of O(N × S) repeated regex sweeps.
//! - **Non-ASCII stopwords** (Korean, Japanese, CJK, Arabic, etc.): matched as exact
//!   whitespace-delimited tokens. This is necessary because `\b` does not recognise
//!   Unicode word boundaries for scripts without ASCII-style spacing.
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};

use crate::ir::{DocNode, FidelityLevel};
25
// ────────────────────────────────────────────────
// 1. Compression configuration
// ────────────────────────────────────────────────

/// Context provided when running the compressor.
///
/// Describes the token budget, how much of it is already consumed, and the
/// semantic-preservation level the caller requires. A zero `budget` is
/// treated by [`CompressionConfig::usage_ratio`] as fully used.
#[derive(Debug, Clone)]
pub struct CompressionConfig {
    /// Maximum allowed token count.
    pub budget: usize,
    /// Tokens consumed so far (approximate).
    pub current_tokens: usize,
    /// Semantic preservation level.
    pub fidelity: FidelityLevel,
}
40
41impl CompressionConfig {
42    /// Current budget usage ratio (0.0–1.0).
43    pub fn usage_ratio(&self) -> f64 {
44        if self.budget == 0 {
45            return 1.0;
46        }
47        self.current_tokens as f64 / self.budget as f64
48    }
49
50    /// Returns the compression stage for the current usage ratio.
51    pub fn stage(&self) -> CompressionStage {
52        match self.usage_ratio() {
53            r if r < 0.60 => CompressionStage::StopwordOnly,
54            r if r < 0.80 => CompressionStage::PruneLowImportance,
55            r if r < 0.95 => CompressionStage::DeduplicateAndLinearize,
56            _ => CompressionStage::MaxCompression,
57        }
58    }
59
60    /// Returns the minimum compression stage enforced by the fidelity level,
61    /// regardless of budget usage ratio.
62    ///
63    /// - `Compressed`: always applies at least `PruneLowImportance`
64    /// - Others: no minimum (budget ratio decides)
65    pub fn min_stage(&self) -> CompressionStage {
66        match self.fidelity {
67            FidelityLevel::Compressed => CompressionStage::PruneLowImportance,
68            _ => CompressionStage::StopwordOnly,
69        }
70    }
71}
72
/// Compression stage enumeration.
///
/// Variant order is significant: the derived `Ord`/`PartialOrd` is used by
/// `AdaptiveCompressor::compress` (`stage >= CompressionStage::…` checks and
/// `stage().max(min_stage())`), so later variants represent strictly more
/// aggressive compression. Do not reorder variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum CompressionStage {
    /// Stopword removal only.
    StopwordOnly,
    /// Stopwords + prune bottom-20% importance paragraphs.
    PruneLowImportance,
    /// Above + deduplicate sentences.
    DeduplicateAndLinearize,
    /// Above + truncate paragraphs to their first sentence.
    MaxCompression,
}
85
// ────────────────────────────────────────────────
// 2. AdaptiveCompressor
// ────────────────────────────────────────────────

/// Budget-based adaptive document compressor.
///
/// Stopwords are partitioned at construction time (see
/// [`AdaptiveCompressor::with_stopwords`]) so the per-document hot path does
/// no pattern compilation.
pub struct AdaptiveCompressor {
    /// Single Aho-Corasick automaton built from all ASCII stopwords (case-insensitive).
    /// Replaces the previous per-stopword regex list — one O(N+M) pass instead of O(N×S).
    /// `None` when the ASCII stopword list is empty (or the build failed).
    ascii_ac: Option<AhoCorasick>,
    /// Non-ASCII stopword list for exact whitespace-token matching.
    /// Applied with a whitespace-split-filter pass to handle CJK / Korean / Arabic etc.
    nonascii_stopwords: Vec<String>,
}
99
100impl Default for AdaptiveCompressor {
101    fn default() -> Self {
102        Self::new()
103    }
104}
105
106impl AdaptiveCompressor {
107    /// Creates a compressor with the default stopword list.
108    ///
109    /// The default list includes common English function words (ASCII) and
110    /// standalone Korean connective words (non-ASCII). For domain-specific
111    /// stopwords use [`Self::with_stopwords`].
112    pub fn new() -> Self {
113        Self::with_stopwords(default_stopwords())
114    }
115
116    /// Creates a compressor with a fully custom stopword list.
117    ///
118    /// Stopwords are partitioned at construction time:
119    /// - ASCII words → indexed into a single Aho-Corasick automaton (case-insensitive).
120    /// - Non-ASCII words → stored as plain strings for token-level matching.
121    pub fn with_stopwords(stopwords: Vec<String>) -> Self {
122        let mut ascii_stopwords: Vec<String> = Vec::new();
123        let mut nonascii_stopwords = Vec::new();
124
125        for sw in &stopwords {
126            if sw.is_ascii() {
127                ascii_stopwords.push(sw.to_ascii_lowercase());
128            } else {
129                // Non-ASCII (Korean, CJK, Arabic, Devanagari, …):
130                // stored as plain strings for whitespace-token matching.
131                nonascii_stopwords.push(sw.clone());
132            }
133        }
134
135        let ascii_ac = if ascii_stopwords.is_empty() {
136            None
137        } else {
138            AhoCorasickBuilder::new()
139                .ascii_case_insensitive(true)
140                .match_kind(MatchKind::LeftmostFirst)
141                .build(&ascii_stopwords)
142                .ok()
143        };
144
145        Self {
146            ascii_ac,
147            nonascii_stopwords,
148        }
149    }
150
151    /// Returns true when no stopwords are configured (both lists empty).
152    pub fn has_stopwords(&self) -> bool {
153        self.ascii_ac.is_some() || !self.nonascii_stopwords.is_empty()
154    }
155
156    /// Applies compression to the node list and returns the result.
157    ///
158    /// Stopword removal is also skipped at `FidelityLevel::Lossless`.
159    pub fn compress(&self, mut nodes: Vec<DocNode>, cfg: &CompressionConfig) -> Vec<DocNode> {
160        if cfg.fidelity == FidelityLevel::Lossless {
161            return nodes; // Lossless: compression entirely forbidden
162        }
163
164        let stage = cfg.stage().max(cfg.min_stage());
165
166        // ① Stopword removal (all stages)
167        nodes = self.remove_stopwords(nodes);
168
169        // ② Prune bottom-20% importance paragraphs
170        if stage >= CompressionStage::PruneLowImportance {
171            nodes = prune_low_importance(nodes, 0.20);
172        }
173
174        // ③ Deduplicate sentences
175        if stage >= CompressionStage::DeduplicateAndLinearize {
176            nodes = deduplicate_paras(nodes);
177        }
178
179        // ④ Truncate paragraphs to their first sentence
180        // Lossless early-returns at the top, so fidelity != Lossless is guaranteed here.
181        if stage >= CompressionStage::MaxCompression {
182            nodes = truncate_to_first_sentence(nodes);
183        }
184
185        nodes
186    }
187
188    // ── Internal helpers ─────────────────────────
189
190    fn remove_stopwords(&self, nodes: Vec<DocNode>) -> Vec<DocNode> {
191        if !self.has_stopwords() {
192            return nodes;
193        }
194        nodes
195            .into_iter()
196            .map(|node| match node {
197                DocNode::Para { text, importance } => DocNode::Para {
198                    text: self.strip_stopwords(&text),
199                    importance,
200                },
201                DocNode::Header { level, text } => DocNode::Header {
202                    level,
203                    text: self.strip_stopwords(&text),
204                },
205                other => other,
206            })
207            .collect()
208    }
209
210    /// Removes stopwords from a single text string.
211    ///
212    /// Two passes:
213    /// 1. ASCII Aho-Corasick pass — single O(N+M) scan with word-boundary validation.
214    ///    Each match is accepted only when the character immediately before the match
215    ///    start and the character immediately after the match end are both non-word
216    ///    characters (i.e. not `[A-Za-z0-9_]`). Trailing whitespace after an accepted
217    ///    match is also consumed to avoid double-spaces.
218    /// 2. Non-ASCII whitespace-token pass — splits on whitespace, filters exact matches,
219    ///    then rejoins. O(N) per token.
220    ///
221    /// A final `split_whitespace` + rejoin collapses any residual consecutive spaces.
222    fn strip_stopwords(&self, text: &str) -> String {
223        // ── Pass 1: ASCII Aho-Corasick with word-boundary check ──────────────
224        let result: String = if let Some(ac) = &self.ascii_ac {
225            let bytes = text.as_bytes();
226            let mut out = String::with_capacity(text.len());
227            let mut last = 0usize;
228
229            for mat in ac.find_iter(text) {
230                let start = mat.start();
231                let end = mat.end();
232
233                // Word-boundary check: char before must be a non-word char (or start of string).
234                let before_ok = start == 0 || !is_word_byte(bytes[start - 1]);
235                // Word-boundary check: char after must be a non-word char (or end of string).
236                let after_ok = end == bytes.len() || !is_word_byte(bytes[end]);
237
238                if before_ok && after_ok {
239                    // Emit the text before this match.
240                    out.push_str(&text[last..start]);
241                    // Consume any trailing whitespace that immediately follows the stopword.
242                    let skip_end = skip_trailing_space(bytes, end);
243                    last = skip_end;
244                }
245                // If boundary check fails, we do nothing — the match is skipped and
246                // `last` stays where it was so the text is emitted unchanged.
247            }
248
249            out.push_str(&text[last..]);
250            out
251        } else {
252            text.to_string()
253        };
254
255        // ── Pass 2: Non-ASCII token stopwords (whitespace-delimited exact match) ──
256        let mut out2 = String::with_capacity(result.len());
257        if !self.nonascii_stopwords.is_empty() {
258            for token in result.split_whitespace().filter(|token| {
259                !self
260                    .nonascii_stopwords
261                    .iter()
262                    .any(|sw| sw.as_str() == *token)
263            }) {
264                if !out2.is_empty() {
265                    out2.push(' ');
266                }
267                out2.push_str(token);
268            }
269        } else {
270            // Collapse consecutive whitespace even when no non-ASCII stopwords exist.
271            for token in result.split_whitespace() {
272                if !out2.is_empty() {
273                    out2.push(' ');
274                }
275                out2.push_str(token);
276            }
277        }
278
279        out2
280    }
281}
282
// ── Word-boundary helpers ────────────────────────────────────────────────────

/// Returns `true` when `b` is an ASCII word character (`[A-Za-z0-9_]`).
///
/// The AC automaton operates on the UTF-8 byte slice.  Because all stopwords
/// are ASCII, every match start/end lands on an ASCII byte boundary, so a
/// simple byte-level check is safe and avoids a `char`-decode round-trip.
#[inline]
fn is_word_byte(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
294
/// Returns the index just past any ASCII horizontal whitespace (` `, `\t`)
/// immediately following position `pos` in `bytes`.
///
/// Only a single run of whitespace tokens immediately after the stopword is
/// consumed; sentence-level whitespace collapse is handled by the
/// `split_whitespace` pass that follows. Out-of-range `pos` is returned
/// unchanged.
#[inline]
fn skip_trailing_space(bytes: &[u8], pos: usize) -> usize {
    match bytes.get(pos..) {
        Some(tail) => pos + tail.iter().take_while(|&&b| matches!(b, b' ' | b'\t')).count(),
        None => pos,
    }
}
308
// ────────────────────────────────────────────────
// 3. Internal compression functions
// ────────────────────────────────────────────────

313/// Removes `Para` nodes in the bottom `threshold` fraction by importance.
314fn prune_low_importance(nodes: Vec<DocNode>, threshold: f32) -> Vec<DocNode> {
315    // Only paragraphs are subject to filtering
316    let para_importances: Vec<f32> = nodes
317        .iter()
318        .filter_map(|n| {
319            if let DocNode::Para { importance, .. } = n {
320                Some(*importance)
321            } else {
322                None
323            }
324        })
325        .collect();
326
327    if para_importances.len() <= 1 {
328        return nodes;
329    }
330
331    // Calculate the cutoff value for the bottom threshold fraction
332    let mut sorted = para_importances.clone();
333    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
334    let cutoff_idx = ((sorted.len() as f32 * threshold) as usize).min(sorted.len() - 1);
335    let cutoff = sorted[cutoff_idx];
336
337    let filtered: Vec<DocNode> = nodes
338        .iter()
339        .filter(|n| {
340            if let DocNode::Para { importance, .. } = n {
341                *importance > cutoff
342            } else {
343                true // non-paragraph nodes are always preserved
344            }
345        })
346        .cloned()
347        .collect();
348
349    // Safety net: if the input had Para nodes but none remain after filtering, return the original.
350    // (When all paragraphs share the same importance, cutoff == all importances → prevents total elimination)
351    let filtered_has_para = filtered.iter().any(|n| matches!(n, DocNode::Para { .. }));
352    let input_had_para = nodes.iter().any(|n| matches!(n, DocNode::Para { .. }));
353
354    if input_had_para && !filtered_has_para {
355        nodes
356    } else {
357        filtered
358    }
359}
360
361/// Removes `Para` nodes with identical content, keeping only the first occurrence.
362fn deduplicate_paras(nodes: Vec<DocNode>) -> Vec<DocNode> {
363    use std::collections::HashSet;
364    let mut seen: HashSet<String> = HashSet::new();
365    nodes
366        .into_iter()
367        .filter(|n| {
368            if let DocNode::Para { text, .. } = n {
369                let mut normalized = String::with_capacity(text.len());
370                for token in text.split_whitespace() {
371                    if !normalized.is_empty() { normalized.push(' '); }
372                    normalized.push_str(token);
373                }
374                seen.insert(normalized)
375            } else {
376                true
377            }
378        })
379        .collect()
380}
381
382/// Truncates each `Para` to its first sentence.
383fn truncate_to_first_sentence(nodes: Vec<DocNode>) -> Vec<DocNode> {
384    nodes
385        .into_iter()
386        .map(|node| match node {
387            DocNode::Para { text, importance } => {
388                let first = first_sentence(&text);
389                DocNode::Para {
390                    text: first,
391                    importance,
392                }
393            }
394            other => other,
395        })
396        .collect()
397}
398
/// Extracts the first sentence from text (delimited by `.`, `!`, or `?`,
/// plus their common non-Latin counterparts).
///
/// The terminator character is included in the result; the result is
/// whitespace-trimmed. If no terminator is present the full trimmed text
/// is returned.
fn first_sentence(text: &str) -> String {
    // Sentence-terminating punctuation across scripts.
    const TERMINATORS: &[char] = &[
        '.', '!', '?', // ASCII
        '。', '!', '?', // CJK fullwidth (U+3002, U+FF01, U+FF1F)
        '।', '॥', // Devanagari Danda / Double Danda (U+0964, U+0965)
        '۔',  // Arabic Full Stop (U+06D4)
        '።',  // Ethiopic Full Stop (U+1362)
        '᙮',  // Canadian Syllabics Full Stop (U+166E)
        '꓿',  // Lisu Punctuation Full Stop (U+A4FF)
        '︒', // Presentation Form Vertical Ideographic Full Stop (U+FE12)
        '﹒', // Small Full Stop (U+FE52)
        '.', // Fullwidth Full Stop (U+FF0E)
    ];

    match text.char_indices().find(|(_, c)| TERMINATORS.contains(c)) {
        // Slice up to and including the terminator, then trim.
        Some((i, c)) => text[..i + c.len_utf8()].trim().to_string(),
        // No sentence terminator found — return the full text.
        None => text.trim().to_string(),
    }
}
420
// ────────────────────────────────────────────────
// 4. Default stopword list
// ────────────────────────────────────────────────

/// Default stopword list — English function words + Korean standalone connectives.
///
/// **English (ASCII)**: common articles, prepositions, auxiliaries, and pronouns
/// that carry little semantic weight in most technical / business documents.
///
/// **Korean (non-ASCII)**: standalone connective words that appear as discrete
/// whitespace-delimited tokens (그리고, 하지만, …). Grammatical particles
/// (은/는/이/가/을/를/…) are *not* included because they are fused to the preceding
/// noun in Korean text and cannot be stripped by whitespace-token matching without
/// morphological analysis.
///
/// The returned list contains no duplicate entries.
///
/// For domain-specific stopwords use [`AdaptiveCompressor::with_stopwords`].
fn default_stopwords() -> Vec<String> {
    // ── English function words ────────────────────────────────────────────
    // Articles
    let articles = ["a", "an", "the"];
    // Coordinating conjunctions
    let conjunctions = ["and", "or", "but", "nor", "yet", "so", "for"];
    // Common prepositions
    let prepositions = [
        "in", "on", "at", "to", "of", "by", "as", "up", "via", "into", "from", "with", "than",
        "about", "over", "after", "before", "between", "through", "during", "within", "without",
    ];
    // Auxiliary / modal verbs
    let auxiliaries = [
        "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do", "does",
        "did", "will", "would", "shall", "should", "may", "might", "must", "can", "could",
    ];
    // Common pronouns / determiners
    let pronouns = [
        "it", "its", "this", "that", "these", "those", "not", "no", "also", "too", "very", "just",
        "such",
    ];

    // ── Korean standalone connectives (non-ASCII) ─────────────────────────
    // These are whole whitespace-delimited words in Korean prose.
    // Particles (은/는/이/가/…) are excluded — they require morphological analysis.
    // BUGFIX: "다만" was listed twice; the duplicate bloated the list and
    // every whitespace-token scan for no effect.
    let korean_connectives = [
        "그리고",
        "하지만",
        "그러나",
        "따라서",
        "또한",
        "즉",
        "및",
        "또는",
        "그래서",
        "그런데",
        "게다가",
        "다만",
        "단지",
        "특히",
        "주로",
        "왜냐하면",
        "그러므로",
        "한편",
        "반면",
        "이처럼",
        "이렇게",
        "이에",
        "이후",
        "이전",
    ];

    articles
        .iter()
        .chain(conjunctions.iter())
        .chain(prepositions.iter())
        .chain(auxiliaries.iter())
        .chain(pronouns.iter())
        .map(|s| s.to_string())
        .chain(korean_connectives.iter().map(|s| s.to_string()))
        .collect()
}
500
// ────────────────────────────────────────────────
// 5. Unit tests
// ────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `Para` node with the given text and importance score.
    fn make_para(text: &str, importance: f32) -> DocNode {
        DocNode::Para {
            text: text.into(),
            importance,
        }
    }

    #[test]
    fn lossless_skips_all_compression() {
        let nodes = vec![make_para("the quick brown fox", 0.1)];
        let cfg = CompressionConfig {
            budget: 100,
            current_tokens: 99,
            fidelity: FidelityLevel::Lossless,
        };
        let compressor = AdaptiveCompressor::new();
        let result = compressor.compress(nodes.clone(), &cfg);
        // Lossless: original must be returned unchanged
        if let (DocNode::Para { text: t1, .. }, DocNode::Para { text: t2, .. }) =
            (&nodes[0], &result[0])
        {
            assert_eq!(t1, t2);
        }
    }

    #[test]
    fn new_compressor_has_stopwords() {
        let compressor = AdaptiveCompressor::new();
        // Default constructor must load the built-in stopword list.
        assert!(
            compressor.has_stopwords(),
            "default compressor must have a non-empty stopword list"
        );
    }

    #[test]
    fn empty_compressor_has_no_stopwords() {
        let compressor = AdaptiveCompressor::with_stopwords(vec![]);
        assert!(
            !compressor.has_stopwords(),
            "compressor built with empty list must report no stopwords"
        );
    }

    #[test]
    fn stopword_removal_ascii_works() {
        // "the" is in the default list → should be removed
        let compressor = AdaptiveCompressor::new();
        let nodes = vec![make_para("the quick brown fox", 1.0)];
        let cfg = CompressionConfig {
            budget: 1000,
            current_tokens: 100, // ~10% — StopwordOnly stage
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        if let DocNode::Para { text, .. } = &result[0] {
            assert!(
                !text.to_lowercase().starts_with("the "),
                "stopword 'the' must be removed: got '{}'",
                text
            );
        }
    }

    #[test]
    fn with_stopwords_removes_specified_ascii_words() {
        let compressor = AdaptiveCompressor::with_stopwords(vec!["hello".into(), "world".into()]);
        let nodes = vec![make_para("hello world foo", 1.0)];
        let cfg = CompressionConfig {
            budget: 1000,
            current_tokens: 100,
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        if let DocNode::Para { text, .. } = &result[0] {
            assert!(
                !text.to_lowercase().contains("hello"),
                "'hello' must be removed: got '{}'",
                text
            );
            assert!(
                !text.to_lowercase().contains("world"),
                "'world' must be removed: got '{}'",
                text
            );
            assert!(text.contains("foo"), "'foo' must remain: got '{}'", text);
        }
    }

    #[test]
    fn nonascii_stopword_removal_works() {
        // Korean connective "그리고" is in the default list and should be removed
        // when it appears as a standalone whitespace-delimited token.
        let compressor = AdaptiveCompressor::new();
        let nodes = vec![make_para("사과 그리고 바나나", 1.0)];
        let cfg = CompressionConfig {
            budget: 1000,
            current_tokens: 100,
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        if let DocNode::Para { text, .. } = &result[0] {
            assert!(
                !text.contains("그리고"),
                "Korean connective '그리고' must be removed: got '{}'",
                text
            );
            assert!(text.contains("사과"), "'사과' must remain: got '{}'", text);
            assert!(
                text.contains("바나나"),
                "'바나나' must remain: got '{}'",
                text
            );
        }
    }

    #[test]
    fn nonascii_stopword_partial_match_not_removed() {
        // "그리고" should NOT be removed when it is a substring of another word,
        // e.g. "그리고나서" is a different word and must be preserved.
        let compressor = AdaptiveCompressor::with_stopwords(vec!["그리고".into()]);
        let nodes = vec![make_para("그리고나서 확인", 1.0)];
        let cfg = CompressionConfig {
            budget: 1000,
            current_tokens: 100,
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        if let DocNode::Para { text, .. } = &result[0] {
            assert!(
                text.contains("그리고나서"),
                "'그리고나서' must NOT be removed (not an exact token): got '{}'",
                text
            );
        }
    }

    #[test]
    fn prune_low_importance_removes_bottom_20_pct() {
        let nodes = vec![
            make_para("중요 단락", 0.9),
            make_para("보통 단락", 0.5),
            make_para("낮은 단락", 0.1),
            make_para("낮은 단락2", 0.05),
            make_para("낮은 단락3", 0.02),
        ];
        let result = prune_low_importance(nodes, 0.20);
        // Bottom-importance paragraphs must be pruned; the assertion only
        // requires that at least one of the five was removed.
        assert!(result.len() < 5, "some nodes must be removed");
    }

    #[test]
    fn deduplicate_removes_duplicates() {
        let nodes = vec![
            make_para("동일한 내용입니다.", 1.0),
            make_para("다른 내용입니다.", 1.0),
            make_para("동일한 내용입니다.", 0.9),
        ];
        let result = deduplicate_paras(nodes);
        assert_eq!(result.len(), 2, "one duplicate paragraph must be removed");
    }

    #[test]
    fn first_sentence_extraction() {
        assert_eq!(first_sentence("안녕하세요. 반갑습니다."), "안녕하세요.");
        assert_eq!(
            first_sentence("문장 부호 없는 텍스트"),
            "문장 부호 없는 텍스트"
        );
        assert_eq!(first_sentence("Hello world! Bye."), "Hello world!");
    }

    #[test]
    fn first_sentence_multilingual() {
        // Hindi Devanagari Danda (U+0964)
        assert_eq!(
            first_sentence("यह पहला वाक्य है। यह दूसरा है।"),
            "यह पहला वाक्य है।"
        );
        // Arabic Full Stop (U+06D4)
        assert_eq!(
            first_sentence("هذه الجملة الأولى۔ هذه الثانية۔"),
            "هذه الجملة الأولى۔"
        );
        // Amharic Ethiopic Full Stop (U+1362)
        assert_eq!(
            first_sentence("ይህ የመጀመሪያ ዓረፍተ ነገር ነው። ሁለተኛ።"),
            "ይህ የመጀመሪያ ዓረፍተ ነገር ነው።"
        );
        // Fullwidth Small Full Stop (U+FE52)
        assert_eq!(
            first_sentence("これが最初の文です.これが二番目です."),
            "これが最初の文です."
        );
    }

    #[test]
    fn prune_keeps_single_paragraph() {
        let compressor = AdaptiveCompressor::with_stopwords(vec![]);
        let nodes = vec![make_para("only paragraph", 0.1)]; // low importance
        let cfg = CompressionConfig {
            budget: 100,
            current_tokens: 65, // 65% — PruneLowImportance stage
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        assert_eq!(
            result.len(),
            1,
            "the sole paragraph in a single-paragraph document must not be removed"
        );
    }

    #[test]
    fn prune_keeps_all_equal_importance_paragraphs() {
        let compressor = AdaptiveCompressor::with_stopwords(vec![]);
        // 3 paragraphs, all same importance — none should be removed
        let nodes = vec![
            make_para("first", 0.5),
            make_para("second", 0.5),
            make_para("third", 0.5),
        ];
        let cfg = CompressionConfig {
            budget: 100,
            current_tokens: 65,
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        assert_eq!(
            result.len(),
            3,
            "paragraphs with equal importance must not all be removed"
        );
    }

    /// Word-boundary regression: stopword "the" must be removed as a whole word but
    /// must NOT be stripped from inside "theory", "there", or "gather".
    #[test]
    fn ascii_stopword_respects_word_boundaries() {
        let compressor = AdaptiveCompressor::with_stopwords(vec!["the".into()]);
        let cfg = CompressionConfig {
            budget: 1000,
            current_tokens: 100,
            fidelity: FidelityLevel::Semantic,
        };

        // "the" at start-of-string followed by space → must be removed
        let nodes = vec![make_para("the cat sat", 1.0)];
        let result = compressor.compress(nodes, &cfg);
        if let DocNode::Para { text, .. } = &result[0] {
            assert!(
                !text.to_lowercase().starts_with("the "),
                "standalone 'the' at start must be removed: got '{}'",
                text
            );
            assert!(
                text.contains("cat") && text.contains("sat"),
                "non-stopword tokens must remain: got '{}'",
                text
            );
        }

        // "theory" contains "the" as a prefix → must NOT be altered
        let nodes2 = vec![make_para("theory is important", 1.0)];
        let result2 = compressor.compress(nodes2, &cfg);
        if let DocNode::Para { text, .. } = &result2[0] {
            assert!(
                text.contains("theory"),
                "'theory' must not be modified by stopword 'the': got '{}'",
                text
            );
        }

        // "there" starts with "the" → must NOT be altered
        let nodes3 = vec![make_para("there are cats", 1.0)];
        let result3 = compressor.compress(nodes3, &cfg);
        if let DocNode::Para { text, .. } = &result3[0] {
            assert!(
                text.contains("there"),
                "'there' must not be modified by stopword 'the': got '{}'",
                text
            );
        }

        // "gather" contains "the" inside → must NOT be altered
        let nodes4 = vec![make_para("we gather here", 1.0)];
        let result4 = compressor.compress(nodes4, &cfg);
        if let DocNode::Para { text, .. } = &result4[0] {
            assert!(
                text.contains("gather"),
                "'gather' must not be modified by stopword 'the': got '{}'",
                text
            );
        }
    }

    #[test]
    fn stage_thresholds() {
        let base = CompressionConfig {
            budget: 100,
            current_tokens: 0,
            fidelity: FidelityLevel::Semantic,
        };
        // Helper: a config at the given consumed-token count (budget 100).
        let at = |tokens| CompressionConfig {
            current_tokens: tokens,
            ..base.clone()
        };

        assert_eq!(at(50).stage(), CompressionStage::StopwordOnly);
        assert_eq!(at(70).stage(), CompressionStage::PruneLowImportance);
        assert_eq!(at(85).stage(), CompressionStage::DeduplicateAndLinearize);
        assert_eq!(at(96).stage(), CompressionStage::MaxCompression);
    }
}