llm-transpile 0.1.5

High-performance LLM context bridge — token-optimized document transpiler
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
//! compressor.rs — AdaptiveCompressor
//!
//! Automatically applies a four-stage compression strategy based on token budget usage.
//!
//! | Budget usage | Strategy applied                                          |
//! |-------------|-----------------------------------------------------------|
//! | 0–60%       | Stopword removal only                                     |
//! | 60–80%      | Stopwords + prune bottom-20% importance paragraphs        |
//! | 80–95%      | Above + deduplicate sentences + linearize numeric data    |
//! | 95%+        | Above + truncate all paragraphs to first sentence (Semantic+) |
//!
//! ## Stopword matching strategy
//!
//! - **ASCII stopwords**: indexed into a single [`AhoCorasick`] automaton (case-insensitive).
//!   Word-boundary semantics are enforced by checking the characters immediately before and
//!   after each match — the same contract as the previous `\b word \b` regex approach, but
//!   in a single O(N + M) pass instead of O(N × S) repeated regex sweeps.
//! - **Non-ASCII stopwords** (Korean, Japanese, CJK, Arabic, etc.): matched as exact
//!   whitespace-delimited tokens. This is necessary because `\b` does not recognise
//!   Unicode word boundaries for scripts without ASCII-style spacing.

use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};

use crate::ir::{DocNode, FidelityLevel};

// ────────────────────────────────────────────────
// 1. Compression configuration
// ────────────────────────────────────────────────

/// Context provided when running the compressor.
///
/// Carries the token budget, the approximate tokens already consumed, and the
/// requested fidelity level; together these select the [`CompressionStage`]
/// that [`AdaptiveCompressor::compress`] applies.
#[derive(Debug, Clone)]
pub struct CompressionConfig {
    /// Maximum allowed token count.
    pub budget: usize,
    /// Tokens consumed so far (approximate).
    pub current_tokens: usize,
    /// Semantic preservation level.
    pub fidelity: FidelityLevel,
}

impl CompressionConfig {
    /// Current budget usage ratio (0.0–1.0).
    ///
    /// A zero budget is treated as fully used (ratio 1.0) to avoid a
    /// division by zero.
    pub fn usage_ratio(&self) -> f64 {
        if self.budget == 0 {
            1.0
        } else {
            self.current_tokens as f64 / self.budget as f64
        }
    }

    /// Returns the compression stage for the current usage ratio.
    ///
    /// Thresholds: <60% → stopwords only, <80% → pruning, <95% →
    /// deduplication, otherwise maximum compression.
    pub fn stage(&self) -> CompressionStage {
        let ratio = self.usage_ratio();
        if ratio < 0.60 {
            CompressionStage::StopwordOnly
        } else if ratio < 0.80 {
            CompressionStage::PruneLowImportance
        } else if ratio < 0.95 {
            CompressionStage::DeduplicateAndLinearize
        } else {
            CompressionStage::MaxCompression
        }
    }

    /// Returns the minimum compression stage enforced by the fidelity level,
    /// regardless of budget usage ratio.
    ///
    /// - `Compressed`: always applies at least `PruneLowImportance`
    /// - Others: no minimum (budget ratio decides)
    pub fn min_stage(&self) -> CompressionStage {
        if matches!(self.fidelity, FidelityLevel::Compressed) {
            CompressionStage::PruneLowImportance
        } else {
            CompressionStage::StopwordOnly
        }
    }
}

/// Compression stage enumeration.
///
/// Variant declaration order is significant: `PartialOrd`/`Ord` are derived,
/// so a later variant compares greater than an earlier one.
/// [`AdaptiveCompressor::compress`] relies on this with
/// `stage >= CompressionStage::…` checks when escalating.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum CompressionStage {
    /// Stopword removal only.
    StopwordOnly,
    /// Stopwords + prune bottom-20% importance paragraphs.
    PruneLowImportance,
    /// Above + deduplicate sentences.
    DeduplicateAndLinearize,
    /// Above + truncate paragraphs to their first sentence.
    MaxCompression,
}

// ────────────────────────────────────────────────
// 2. AdaptiveCompressor
// ────────────────────────────────────────────────

/// Budget-based adaptive document compressor.
///
/// Stopwords are partitioned at construction time into the two fields below;
/// see [`AdaptiveCompressor::with_stopwords`] for the partitioning rules.
pub struct AdaptiveCompressor {
    /// Single Aho-Corasick automaton built from all ASCII stopwords (case-insensitive).
    /// Replaces the previous per-stopword regex list — one O(N+M) pass instead of O(N×S).
    /// `None` when no ASCII stopwords are configured (or the automaton build failed).
    ascii_ac: Option<AhoCorasick>,
    /// Non-ASCII stopword list for exact whitespace-token matching.
    /// Applied with a whitespace-split-filter pass to handle CJK / Korean / Arabic etc.
    nonascii_stopwords: Vec<String>,
}

impl Default for AdaptiveCompressor {
    fn default() -> Self {
        Self::new()
    }
}

impl AdaptiveCompressor {
    /// Creates a compressor with the default stopword list.
    ///
    /// The default list includes common English function words (ASCII) and
    /// standalone Korean connective words (non-ASCII). For domain-specific
    /// stopwords use [`Self::with_stopwords`].
    pub fn new() -> Self {
        Self::with_stopwords(default_stopwords())
    }

    /// Creates a compressor with a fully custom stopword list.
    ///
    /// Stopwords are partitioned at construction time:
    /// - ASCII words → indexed into a single Aho-Corasick automaton (case-insensitive).
    /// - Non-ASCII words → stored as plain strings for token-level matching.
    ///
    /// NOTE(review): a failed automaton build is discarded via `.ok()`, which
    /// silently disables ASCII stopword removal — confirm best-effort is intended.
    /// Also note that an empty string passes `is_ascii()` and would become an
    /// empty automaton pattern; callers should not include empty stopwords.
    pub fn with_stopwords(stopwords: Vec<String>) -> Self {
        let mut ascii_stopwords: Vec<String> = Vec::new();
        let mut nonascii_stopwords = Vec::new();

        for sw in &stopwords {
            if sw.is_ascii() {
                // Lowercased up front; the automaton is also built
                // case-insensitive, so matching covers any input casing.
                ascii_stopwords.push(sw.to_ascii_lowercase());
            } else {
                // Non-ASCII (Korean, CJK, Arabic, Devanagari, …):
                // stored as plain strings for whitespace-token matching.
                nonascii_stopwords.push(sw.clone());
            }
        }

        let ascii_ac = if ascii_stopwords.is_empty() {
            None
        } else {
            AhoCorasickBuilder::new()
                .ascii_case_insensitive(true)
                .match_kind(MatchKind::LeftmostFirst)
                .build(&ascii_stopwords)
                .ok()
        };

        Self {
            ascii_ac,
            nonascii_stopwords,
        }
    }

    /// Returns `true` when at least one stopword is configured
    /// (the ASCII automaton was built, or the non-ASCII list is non-empty).
    pub fn has_stopwords(&self) -> bool {
        self.ascii_ac.is_some() || !self.nonascii_stopwords.is_empty()
    }

    /// Applies compression to the node list and returns the result.
    ///
    /// The effective stage is the stronger of the budget-derived stage and the
    /// fidelity-enforced minimum. Stopword removal is also skipped at
    /// `FidelityLevel::Lossless`.
    pub fn compress(&self, mut nodes: Vec<DocNode>, cfg: &CompressionConfig) -> Vec<DocNode> {
        if cfg.fidelity == FidelityLevel::Lossless {
            return nodes; // Lossless: compression entirely forbidden
        }

        let stage = cfg.stage().max(cfg.min_stage());

        // ① Stopword removal (all stages)
        nodes = self.remove_stopwords(nodes);

        // ② Prune bottom-20% importance paragraphs
        if stage >= CompressionStage::PruneLowImportance {
            nodes = prune_low_importance(nodes, 0.20);
        }

        // ③ Deduplicate sentences
        if stage >= CompressionStage::DeduplicateAndLinearize {
            nodes = deduplicate_paras(nodes);
        }

        // ④ Truncate paragraphs to their first sentence
        // Lossless early-returns at the top, so fidelity != Lossless is guaranteed here.
        if stage >= CompressionStage::MaxCompression {
            nodes = truncate_to_first_sentence(nodes);
        }

        nodes
    }

    // ── Internal helpers ─────────────────────────

    /// Runs [`Self::strip_stopwords`] over `Para` and `Header` text;
    /// all other node kinds pass through untouched.
    fn remove_stopwords(&self, nodes: Vec<DocNode>) -> Vec<DocNode> {
        if !self.has_stopwords() {
            return nodes;
        }
        nodes
            .into_iter()
            .map(|node| match node {
                DocNode::Para { text, importance } => DocNode::Para {
                    text: self.strip_stopwords(&text),
                    importance,
                },
                DocNode::Header { level, text } => DocNode::Header {
                    level,
                    text: self.strip_stopwords(&text),
                },
                other => other,
            })
            .collect()
    }

    /// Removes stopwords from a single text string.
    ///
    /// Two passes:
    /// 1. ASCII Aho-Corasick pass — single O(N+M) scan with word-boundary validation.
    ///    Each match is accepted only when the character immediately before the match
    ///    start and the character immediately after the match end are both non-word
    ///    characters (i.e. not `[A-Za-z0-9_]`). Trailing whitespace after an accepted
    ///    match is also consumed to avoid double-spaces.
    /// 2. Non-ASCII whitespace-token pass — splits on whitespace, filters exact matches,
    ///    then rejoins. O(N) per token.
    ///
    /// A final `split_whitespace` + rejoin collapses any residual consecutive spaces.
    fn strip_stopwords(&self, text: &str) -> String {
        // ── Pass 1: ASCII Aho-Corasick with word-boundary check ──────────────
        let result: String = if let Some(ac) = &self.ascii_ac {
            let bytes = text.as_bytes();
            let mut out = String::with_capacity(text.len());
            let mut last = 0usize;

            // NOTE(review): the slice below assumes `mat.start() >= last` for every
            // accepted match — this holds for LeftmostFirst (non-overlapping)
            // patterns that contain no whitespace; verify if stopwords with
            // embedded spaces are ever allowed.
            for mat in ac.find_iter(text) {
                let start = mat.start();
                let end = mat.end();

                // Word-boundary check: char before must be a non-word char (or start of string).
                let before_ok = start == 0 || !is_word_byte(bytes[start - 1]);
                // Word-boundary check: char after must be a non-word char (or end of string).
                let after_ok = end == bytes.len() || !is_word_byte(bytes[end]);

                if before_ok && after_ok {
                    // Emit the text before this match.
                    out.push_str(&text[last..start]);
                    // Consume any trailing whitespace that immediately follows the stopword.
                    let skip_end = skip_trailing_space(bytes, end);
                    last = skip_end;
                }
                // If boundary check fails, we do nothing — the match is skipped and
                // `last` stays where it was so the text is emitted unchanged.
            }

            out.push_str(&text[last..]);
            out
        } else {
            text.to_string()
        };

        // ── Pass 2: Non-ASCII token stopwords (whitespace-delimited exact match) ──
        let mut out2 = String::with_capacity(result.len());
        if !self.nonascii_stopwords.is_empty() {
            for token in result.split_whitespace().filter(|token| {
                !self
                    .nonascii_stopwords
                    .iter()
                    .any(|sw| sw.as_str() == *token)
            }) {
                if !out2.is_empty() {
                    out2.push(' ');
                }
                out2.push_str(token);
            }
        } else {
            // Collapse consecutive whitespace even when no non-ASCII stopwords exist.
            for token in result.split_whitespace() {
                if !out2.is_empty() {
                    out2.push(' ');
                }
                out2.push_str(token);
            }
        }

        out2
    }
}

// ── Word-boundary helpers ────────────────────────────────────────────────────

/// Reports whether `b` is an ASCII word byte (`[A-Za-z0-9_]`).
///
/// The automaton scans the raw UTF-8 byte slice. All stopwords are ASCII, so
/// every match boundary lands on an ASCII byte, making this byte-level test
/// sufficient for word-boundary validation — no `char` decoding required.
/// (Bytes ≥ 0x80, i.e. parts of multi-byte characters, report `false`.)
#[inline]
fn is_word_byte(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}

/// Returns the index just past any ASCII horizontal whitespace (` `, `\t`)
/// immediately following position `pos` in `bytes`.
///
/// Only the single run of blanks directly after the stopword is consumed;
/// the later `split_whitespace` pass handles any remaining whitespace
/// collapse. Out-of-range `pos` is returned unchanged.
#[inline]
fn skip_trailing_space(bytes: &[u8], pos: usize) -> usize {
    let run = bytes
        .get(pos..)
        .map_or(0, |tail| {
            tail.iter()
                .take_while(|&&b| b == b' ' || b == b'\t')
                .count()
        });
    pos + run
}

// ────────────────────────────────────────────────
// 3. Internal compression functions
// ────────────────────────────────────────────────

/// Removes `Para` nodes in the bottom `threshold` fraction by importance.
///
/// With `n` paragraphs, the `floor(n * threshold)` lowest-importance ones are
/// dropped; paragraphs tied with the cutoff value are kept. Non-paragraph
/// nodes always survive, and inputs with at most one paragraph are returned
/// untouched.
fn prune_low_importance(nodes: Vec<DocNode>, threshold: f32) -> Vec<DocNode> {
    // Only paragraphs are subject to filtering.
    let mut importances: Vec<f32> = nodes
        .iter()
        .filter_map(|n| {
            if let DocNode::Para { importance, .. } = n {
                Some(*importance)
            } else {
                None
            }
        })
        .collect();

    if importances.len() <= 1 {
        return nodes;
    }

    // Total order so a NaN importance cannot panic (partial_cmp().unwrap() did).
    importances.sort_by(|a, b| a.total_cmp(b));

    // Cutoff is the importance at the threshold-fraction boundary. Keeping
    // `importance >= cutoff` removes exactly the floor(n * threshold) entries
    // below it (when importances are distinct) — the previous strict `>`
    // comparison also dropped the boundary paragraph, removing one too many.
    let cutoff_idx = ((importances.len() as f32 * threshold) as usize).min(importances.len() - 1);
    let cutoff = importances[cutoff_idx];

    let filtered: Vec<DocNode> = nodes
        .iter()
        .filter(|n| {
            if let DocNode::Para { importance, .. } = n {
                *importance >= cutoff
            } else {
                true // non-paragraph nodes are always preserved
            }
        })
        .cloned()
        .collect();

    // Safety net: if the input had Para nodes but none remain after filtering,
    // return the original rather than eliminating every paragraph.
    let filtered_has_para = filtered.iter().any(|n| matches!(n, DocNode::Para { .. }));
    let input_had_para = nodes.iter().any(|n| matches!(n, DocNode::Para { .. }));

    if input_had_para && !filtered_has_para {
        nodes
    } else {
        filtered
    }
}

/// Removes `Para` nodes with identical content, keeping only the first occurrence.
///
/// Texts are compared after collapsing every run of whitespace to a single
/// space, so paragraphs differing only in spacing count as duplicates.
/// Non-paragraph nodes are always retained.
fn deduplicate_paras(nodes: Vec<DocNode>) -> Vec<DocNode> {
    use std::collections::HashSet;
    let mut seen: HashSet<String> = HashSet::new();
    nodes
        .into_iter()
        .filter(|node| match node {
            DocNode::Para { text, .. } => {
                let key = text.split_whitespace().collect::<Vec<_>>().join(" ");
                // `insert` returns false for an already-seen key → node is dropped.
                seen.insert(key)
            }
            _ => true,
        })
        .collect()
}

/// Truncates each `Para` to its first sentence; other nodes pass through unchanged.
fn truncate_to_first_sentence(nodes: Vec<DocNode>) -> Vec<DocNode> {
    let mut out = Vec::with_capacity(nodes.len());
    for node in nodes {
        match node {
            DocNode::Para { text, importance } => out.push(DocNode::Para {
                text: first_sentence(&text),
                importance,
            }),
            other => out.push(other),
        }
    }
    out
}

/// Extracts the first sentence from `text`.
///
/// A sentence ends at the first terminator character — ASCII `.` `!` `?` or
/// one of the non-Latin full stops listed below. The terminator is included
/// and the result is whitespace-trimmed; when no terminator is present the
/// whole trimmed text is returned.
///
/// The non-ASCII terminators are written as `\u{…}` escapes: the previous
/// literal characters were lost to an encoding corruption, leaving empty
/// (non-compiling) char literals. Codepoints restored from the comments.
fn first_sentence(text: &str) -> String {
    for (i, c) in text.char_indices() {
        if matches!(
            c,
            '.' | '!' | '?'                        // ASCII
            | '\u{3002}' | '\u{FF01}' | '\u{FF1F}' // CJK Ideographic / Fullwidth ！？
            | '\u{0964}' | '\u{0965}'              // Devanagari Danda / Double Danda
            | '\u{06D4}'                           // Arabic Full Stop
            | '\u{1362}'                           // Ethiopic Full Stop
            | '\u{166E}'                           // Canadian Syllabics Full Stop
            | '\u{A4FF}'                           // Lisu Punctuation Full Stop
            | '\u{FE12}'                           // Presentation Form Vertical Ideographic Full Stop
            | '\u{FE52}'                           // Small Full Stop
            | '\u{FF0E}'                           // Fullwidth Full Stop
        ) {
            // Slice up to and including the terminator, then trim edges.
            return text[..i + c.len_utf8()].trim().to_string();
        }
    }
    text.trim().to_string() // No sentence terminator found — return the full text
}

// ────────────────────────────────────────────────
// 4. Default stopword list
// ────────────────────────────────────────────────

/// Default stopword list — English function words + Korean standalone connectives.
///
/// **English (ASCII)**: common articles, prepositions, auxiliaries, and pronouns
/// that carry little semantic weight in most technical / business documents.
///
/// **Korean (non-ASCII)**: standalone connective words that appear as discrete
/// whitespace-delimited tokens (그리고, 하지만, …). Grammatical particles
/// (은/는/이/가/을/를/…) are *not* included because they are fused to the preceding
/// noun in Korean text and cannot be stripped by whitespace-token matching without
/// morphological analysis.
///
/// Fixes over the previous list: two empty-string entries (mojibake-lost words)
/// are removed — `"".is_ascii()` is `true`, so they were classified as ASCII and
/// would have become empty Aho-Corasick patterns — and the duplicate "다만"
/// entry is dropped.
///
/// For domain-specific stopwords use [`AdaptiveCompressor::with_stopwords`].
fn default_stopwords() -> Vec<String> {
    // ── English function words ────────────────────────────────────────────
    // Articles
    let articles = ["a", "an", "the"];
    // Coordinating conjunctions
    let conjunctions = ["and", "or", "but", "nor", "yet", "so", "for"];
    // Common prepositions
    let prepositions = [
        "in", "on", "at", "to", "of", "by", "as", "up", "via", "into", "from", "with", "than",
        "about", "over", "after", "before", "between", "through", "during", "within", "without",
    ];
    // Auxiliary / modal verbs
    let auxiliaries = [
        "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do", "does",
        "did", "will", "would", "shall", "should", "may", "might", "must", "can", "could",
    ];
    // Common pronouns / determiners
    let pronouns = [
        "it", "its", "this", "that", "these", "those", "not", "no", "also", "too", "very", "just",
        "such",
    ];

    // ── Korean standalone connectives (non-ASCII) ─────────────────────────
    // These are whole whitespace-delimited words in Korean prose.
    // Particles (은/는/이/가/…) are excluded — they require morphological analysis.
    let korean_connectives = [
        "그리고",
        "하지만",
        "그러나",
        "따라서",
        "또한",
        "또는",
        "그래서",
        "그런데",
        "게다가",
        "다만",
        "단지",
        "특히",
        "주로",
        "왜냐하면",
        "그러므로",
        "한편",
        "반면",
        "이처럼",
        "이렇게",
        "이에",
        "이후",
        "이전",
    ];

    articles
        .iter()
        .chain(conjunctions.iter())
        .chain(prepositions.iter())
        .chain(auxiliaries.iter())
        .chain(pronouns.iter())
        .map(|s| s.to_string())
        .chain(korean_connectives.iter().map(|s| s.to_string()))
        .collect()
}

// ────────────────────────────────────────────────
// 5. Unit tests
// ────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    /// Convenience constructor for a `Para` node — the node kind most tests exercise.
    fn make_para(text: &str, importance: f32) -> DocNode {
        DocNode::Para {
            text: text.into(),
            importance,
        }
    }

    #[test]
    fn lossless_skips_all_compression() {
        // 99/100 tokens would normally force MaxCompression — Lossless must override.
        let nodes = vec![make_para("the quick brown fox", 0.1)];
        let cfg = CompressionConfig {
            budget: 100,
            current_tokens: 99,
            fidelity: FidelityLevel::Lossless,
        };
        let compressor = AdaptiveCompressor::new();
        let result = compressor.compress(nodes.clone(), &cfg);
        // Lossless: original must be returned unchanged
        if let (DocNode::Para { text: t1, .. }, DocNode::Para { text: t2, .. }) =
            (&nodes[0], &result[0])
        {
            assert_eq!(t1, t2);
        }
    }

    #[test]
    fn new_compressor_has_stopwords() {
        let compressor = AdaptiveCompressor::new();
        // Default constructor must load the built-in stopword list.
        assert!(
            compressor.has_stopwords(),
            "default compressor must have a non-empty stopword list"
        );
    }

    #[test]
    fn empty_compressor_has_no_stopwords() {
        let compressor = AdaptiveCompressor::with_stopwords(vec![]);
        assert!(
            !compressor.has_stopwords(),
            "compressor built with empty list must report no stopwords"
        );
    }

    #[test]
    fn stopword_removal_ascii_works() {
        // "the" is in the default list → should be removed
        let compressor = AdaptiveCompressor::new();
        let nodes = vec![make_para("the quick brown fox", 1.0)];
        let cfg = CompressionConfig {
            budget: 1000,
            current_tokens: 100, // ~10% — StopwordOnly stage
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        if let DocNode::Para { text, .. } = &result[0] {
            assert!(
                !text.to_lowercase().starts_with("the "),
                "stopword 'the' must be removed: got '{}'",
                text
            );
        }
    }

    #[test]
    fn with_stopwords_removes_specified_ascii_words() {
        // Custom ASCII stopwords replace the defaults entirely.
        let compressor = AdaptiveCompressor::with_stopwords(vec!["hello".into(), "world".into()]);
        let nodes = vec![make_para("hello world foo", 1.0)];
        let cfg = CompressionConfig {
            budget: 1000,
            current_tokens: 100,
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        if let DocNode::Para { text, .. } = &result[0] {
            assert!(
                !text.to_lowercase().contains("hello"),
                "'hello' must be removed: got '{}'",
                text
            );
            assert!(
                !text.to_lowercase().contains("world"),
                "'world' must be removed: got '{}'",
                text
            );
            assert!(text.contains("foo"), "'foo' must remain: got '{}'", text);
        }
    }

    #[test]
    fn nonascii_stopword_removal_works() {
        // Korean connective "그리고" is in the default list and should be removed
        // when it appears as a standalone whitespace-delimited token.
        let compressor = AdaptiveCompressor::new();
        let nodes = vec![make_para("사과 그리고 바나나", 1.0)];
        let cfg = CompressionConfig {
            budget: 1000,
            current_tokens: 100,
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        if let DocNode::Para { text, .. } = &result[0] {
            assert!(
                !text.contains("그리고"),
                "Korean connective '그리고' must be removed: got '{}'",
                text
            );
            assert!(text.contains("사과"), "'사과' must remain: got '{}'", text);
            assert!(
                text.contains("바나나"),
                "'바나나' must remain: got '{}'",
                text
            );
        }
    }

    #[test]
    fn nonascii_stopword_partial_match_not_removed() {
        // "그리고" should NOT be removed when it is a substring of another word,
        // e.g. "그리고나서" is a different word and must be preserved.
        let compressor = AdaptiveCompressor::with_stopwords(vec!["그리고".into()]);
        let nodes = vec![make_para("그리고나서 확인", 1.0)];
        let cfg = CompressionConfig {
            budget: 1000,
            current_tokens: 100,
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        if let DocNode::Para { text, .. } = &result[0] {
            assert!(
                text.contains("그리고나서"),
                "'그리고나서' must NOT be removed (not an exact token): got '{}'",
                text
            );
        }
    }

    #[test]
    fn prune_low_importance_removes_bottom_20_pct() {
        let nodes = vec![
            make_para("중요 단락", 0.9),
            make_para("보통 단락", 0.5),
            make_para("낮은 단락", 0.1),
            make_para("낮은 단락2", 0.05),
            make_para("낮은 단락3", 0.02),
        ];
        let result = prune_low_importance(nodes, 0.20);
        // Bottom 20% importance (1 out of 5, cutoff=0.02) should be removed
        assert!(result.len() < 5, "some nodes must be removed");
    }

    #[test]
    fn deduplicate_removes_duplicates() {
        // Same text with different importance still counts as a duplicate.
        let nodes = vec![
            make_para("동일한 내용입니다.", 1.0),
            make_para("다른 내용입니다.", 1.0),
            make_para("동일한 내용입니다.", 0.9),
        ];
        let result = deduplicate_paras(nodes);
        assert_eq!(result.len(), 2, "one duplicate paragraph must be removed");
    }

    #[test]
    fn first_sentence_extraction() {
        // Terminator present, text without any terminator, and ASCII '!' case.
        assert_eq!(first_sentence("안녕하세요. 반갑습니다."), "안녕하세요.");
        assert_eq!(
            first_sentence("문장 부호 없는 텍스트"),
            "문장 부호 없는 텍스트"
        );
        assert_eq!(first_sentence("Hello world! Bye."), "Hello world!");
    }

    #[test]
    fn first_sentence_multilingual() {
        // Hindi Devanagari Danda (U+0964)
        assert_eq!(
            first_sentence("यह पहला वाक्य है। यह दूसरा है।"),
            "यह पहला वाक्य है।"
        );
        // Arabic Full Stop (U+06D4)
        assert_eq!(
            first_sentence("هذه الجملة الأولى۔ هذه الثانية۔"),
            "هذه الجملة الأولى۔"
        );
        // Amharic Ethiopic Full Stop (U+1362)
        assert_eq!(
            first_sentence("ይህ የመጀመሪያ ዓረፍተ ነገር ነው። ሁለተኛ።"),
            "ይህ የመጀመሪያ ዓረፍተ ነገር ነው።"
        );
        // Fullwidth Small Full Stop (U+FE52)
        assert_eq!(
            first_sentence("これが最初の文です.これが二番目です."),
            "これが最初の文です."
        );
    }

    #[test]
    fn prune_keeps_single_paragraph() {
        // 65% usage → PruneLowImportance stage, but a single paragraph is exempt.
        let compressor = AdaptiveCompressor::with_stopwords(vec![]);
        let nodes = vec![make_para("only paragraph", 0.1)]; // low importance
        let cfg = CompressionConfig {
            budget: 100,
            current_tokens: 65,
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        assert_eq!(
            result.len(),
            1,
            "the sole paragraph in a single-paragraph document must not be removed"
        );
    }

    #[test]
    fn prune_keeps_all_equal_importance_paragraphs() {
        let compressor = AdaptiveCompressor::with_stopwords(vec![]);
        // 3 paragraphs, all same importance — none should be removed
        let nodes = vec![
            make_para("first", 0.5),
            make_para("second", 0.5),
            make_para("third", 0.5),
        ];
        let cfg = CompressionConfig {
            budget: 100,
            current_tokens: 65,
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        assert_eq!(
            result.len(),
            3,
            "paragraphs with equal importance must not all be removed"
        );
    }

    /// Word-boundary regression: stopword "the" must be removed as a whole word but
    /// must NOT be stripped from inside "theory", "there", or "gather".
    #[test]
    fn ascii_stopword_respects_word_boundaries() {
        let compressor = AdaptiveCompressor::with_stopwords(vec!["the".into()]);
        let cfg = CompressionConfig {
            budget: 1000,
            current_tokens: 100,
            fidelity: FidelityLevel::Semantic,
        };

        // "the" at start-of-string followed by space → must be removed
        let nodes = vec![make_para("the cat sat", 1.0)];
        let result = compressor.compress(nodes, &cfg);
        if let DocNode::Para { text, .. } = &result[0] {
            assert!(
                !text.to_lowercase().starts_with("the "),
                "standalone 'the' at start must be removed: got '{}'",
                text
            );
            assert!(
                text.contains("cat") && text.contains("sat"),
                "non-stopword tokens must remain: got '{}'",
                text
            );
        }

        // "theory" contains "the" as a prefix → must NOT be altered
        let nodes2 = vec![make_para("theory is important", 1.0)];
        let result2 = compressor.compress(nodes2, &cfg);
        if let DocNode::Para { text, .. } = &result2[0] {
            assert!(
                text.contains("theory"),
                "'theory' must not be modified by stopword 'the': got '{}'",
                text
            );
        }

        // "there" starts with "the" → must NOT be altered
        let nodes3 = vec![make_para("there are cats", 1.0)];
        let result3 = compressor.compress(nodes3, &cfg);
        if let DocNode::Para { text, .. } = &result3[0] {
            assert!(
                text.contains("there"),
                "'there' must not be modified by stopword 'the': got '{}'",
                text
            );
        }

        // "gather" contains "the" inside → must NOT be altered
        let nodes4 = vec![make_para("we gather here", 1.0)];
        let result4 = compressor.compress(nodes4, &cfg);
        if let DocNode::Para { text, .. } = &result4[0] {
            assert!(
                text.contains("gather"),
                "'gather' must not be modified by stopword 'the': got '{}'",
                text
            );
        }
    }

    #[test]
    fn stage_thresholds() {
        // Boundaries: <60 StopwordOnly, <80 Prune, <95 Dedup, else Max.
        let base = CompressionConfig {
            budget: 100,
            current_tokens: 0,
            fidelity: FidelityLevel::Semantic,
        };
        let at = |tokens| CompressionConfig {
            current_tokens: tokens,
            ..base.clone()
        };

        assert_eq!(at(50).stage(), CompressionStage::StopwordOnly);
        assert_eq!(at(70).stage(), CompressionStage::PruneLowImportance);
        assert_eq!(at(85).stage(), CompressionStage::DeduplicateAndLinearize);
        assert_eq!(at(96).stage(), CompressionStage::MaxCompression);
    }
}
}