use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
use crate::ir::{DocNode, FidelityLevel};
/// Inputs that drive compression-stage selection for [`AdaptiveCompressor::compress`].
#[derive(Debug, Clone)]
pub struct CompressionConfig {
    /// Total token budget available for the document.
    pub budget: usize,
    /// Tokens already consumed against `budget`.
    pub current_tokens: usize,
    /// Caller-requested fidelity; `Lossless` disables compression entirely.
    pub fidelity: FidelityLevel,
}
impl CompressionConfig {
    /// Fraction of the token budget already consumed.
    ///
    /// A zero budget is reported as fully used (1.0), so downstream staging
    /// falls through to maximum compression instead of dividing by zero.
    pub fn usage_ratio(&self) -> f64 {
        if self.budget == 0 {
            1.0
        } else {
            self.current_tokens as f64 / self.budget as f64
        }
    }

    /// Maps the current usage ratio onto a compression stage:
    /// below 0.60 stopword removal only, below 0.80 importance pruning,
    /// below 0.95 deduplication, otherwise maximum compression.
    pub fn stage(&self) -> CompressionStage {
        let ratio = self.usage_ratio();
        if ratio < 0.60 {
            CompressionStage::StopwordOnly
        } else if ratio < 0.80 {
            CompressionStage::PruneLowImportance
        } else if ratio < 0.95 {
            CompressionStage::DeduplicateAndLinearize
        } else {
            CompressionStage::MaxCompression
        }
    }

    /// Lowest stage the configured fidelity level insists on, regardless of
    /// budget pressure.
    pub fn min_stage(&self) -> CompressionStage {
        if self.fidelity == FidelityLevel::Compressed {
            CompressionStage::PruneLowImportance
        } else {
            CompressionStage::StopwordOnly
        }
    }
}
/// Compression stages in increasing order of aggressiveness.
///
/// The derived `Ord` is load-bearing: `compress` compares stages with `>=`
/// and `max`, so variants must stay declared from least to most aggressive.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum CompressionStage {
    /// Only stopword removal (applied at every non-lossless stage).
    StopwordOnly,
    /// Additionally drop the lowest-importance paragraphs.
    PruneLowImportance,
    /// Additionally remove duplicate paragraphs.
    DeduplicateAndLinearize,
    /// Additionally truncate paragraphs to their first sentence.
    MaxCompression,
}
/// Multi-stage document compressor with ASCII (Aho-Corasick) and non-ASCII
/// (exact whitespace-token) stopword removal.
pub struct AdaptiveCompressor {
    /// Case-insensitive automaton over the ASCII stopwords; `None` when the
    /// ASCII list is empty or automaton construction failed.
    ascii_ac: Option<AhoCorasick>,
    /// Non-ASCII stopwords, matched as whole whitespace-separated tokens.
    nonascii_stopwords: Vec<String>,
}
impl Default for AdaptiveCompressor {
    /// Equivalent to [`AdaptiveCompressor::new`]: uses the default stopword list.
    fn default() -> Self {
        Self::new()
    }
}
impl AdaptiveCompressor {
    /// Builds a compressor with the default multilingual stopword list.
    pub fn new() -> Self {
        Self::with_stopwords(default_stopwords())
    }

    /// Builds a compressor from an explicit stopword list.
    ///
    /// ASCII stopwords are matched case-insensitively via an Aho-Corasick
    /// automaton; non-ASCII stopwords are matched as exact whitespace-separated
    /// tokens. If automaton construction fails, ASCII matching is silently
    /// disabled (the `.ok()` below) and only non-ASCII tokens are filtered.
    pub fn with_stopwords(stopwords: Vec<String>) -> Self {
        let mut ascii_stopwords: Vec<String> = Vec::new();
        let mut nonascii_stopwords = Vec::new();
        for sw in &stopwords {
            if sw.is_ascii() {
                // Lowercased for a canonical pattern set; matching is
                // case-insensitive regardless.
                ascii_stopwords.push(sw.to_ascii_lowercase());
            } else {
                nonascii_stopwords.push(sw.clone());
            }
        }
        let ascii_ac = if ascii_stopwords.is_empty() {
            None
        } else {
            AhoCorasickBuilder::new()
                .ascii_case_insensitive(true)
                .match_kind(MatchKind::LeftmostFirst)
                .build(&ascii_stopwords)
                .ok()
        };
        Self {
            ascii_ac,
            nonascii_stopwords,
        }
    }

    /// True when at least one stopword (ASCII or non-ASCII) is active.
    pub fn has_stopwords(&self) -> bool {
        self.ascii_ac.is_some() || !self.nonascii_stopwords.is_empty()
    }

    /// Applies staged compression to `nodes`.
    ///
    /// `Lossless` fidelity returns the input untouched. Otherwise the stage is
    /// the stricter of the budget-derived stage ([`CompressionConfig::stage`])
    /// and the fidelity floor ([`CompressionConfig::min_stage`]); each stage
    /// includes every transformation of the stages below it.
    pub fn compress(&self, mut nodes: Vec<DocNode>, cfg: &CompressionConfig) -> Vec<DocNode> {
        if cfg.fidelity == FidelityLevel::Lossless {
            return nodes;
        }
        let stage = cfg.stage().max(cfg.min_stage());
        // Stopword removal runs at every non-lossless stage.
        nodes = self.remove_stopwords(nodes);
        if stage >= CompressionStage::PruneLowImportance {
            nodes = prune_low_importance(nodes, 0.20);
        }
        if stage >= CompressionStage::DeduplicateAndLinearize {
            nodes = deduplicate_paras(nodes);
        }
        if stage >= CompressionStage::MaxCompression {
            nodes = truncate_to_first_sentence(nodes);
        }
        nodes
    }

    /// Strips stopwords from `Para` and `Header` nodes; other node kinds pass
    /// through unchanged.
    fn remove_stopwords(&self, nodes: Vec<DocNode>) -> Vec<DocNode> {
        if !self.has_stopwords() {
            return nodes;
        }
        nodes
            .into_iter()
            .map(|node| match node {
                DocNode::Para { text, importance } => DocNode::Para {
                    text: self.strip_stopwords(&text),
                    importance,
                },
                DocNode::Header { level, text } => DocNode::Header {
                    level,
                    text: self.strip_stopwords(&text),
                },
                other => other,
            })
            .collect()
    }

    /// Removes stopwords from `text` and normalizes interior whitespace to
    /// single spaces.
    ///
    /// Pass 1 (ASCII): each automaton match is dropped only when it is a whole
    /// word (not flanked by word bytes), together with any spaces/tabs that
    /// immediately follow it. Pass 2 (tokens): the text is split on whitespace
    /// and tokens equal to a non-ASCII stopword are dropped; this pass also
    /// performs the whitespace normalization.
    fn strip_stopwords(&self, text: &str) -> String {
        let result: String = if let Some(ac) = &self.ascii_ac {
            let bytes = text.as_bytes();
            let mut out = String::with_capacity(text.len());
            let mut last = 0usize;
            for mat in ac.find_iter(text) {
                let start = mat.start();
                let end = mat.end();
                // Whole-word check: reject matches embedded in a larger word.
                let before_ok = start == 0 || !is_word_byte(bytes[start - 1]);
                let after_ok = end == bytes.len() || !is_word_byte(bytes[end]);
                if before_ok && after_ok {
                    out.push_str(&text[last..start]);
                    // Also swallow the spaces/tabs trailing the stopword so no
                    // double gap is left behind.
                    last = skip_trailing_space(bytes, end);
                }
            }
            out.push_str(&text[last..]);
            out
        } else {
            text.to_string()
        };
        // Token pass: filter non-ASCII stopwords and collapse whitespace.
        // With an empty stopword list the filter keeps every token, so one
        // loop covers both cases (the original duplicated this loop).
        let mut out2 = String::with_capacity(result.len());
        for token in result
            .split_whitespace()
            .filter(|token| !self.nonascii_stopwords.iter().any(|sw| sw.as_str() == *token))
        {
            if !out2.is_empty() {
                out2.push(' ');
            }
            out2.push_str(token);
        }
        out2
    }
}
/// True when `b` can be part of a word for stopword boundary checks.
///
/// ASCII alphanumerics and `_` are word bytes, and so is every non-ASCII byte
/// (any byte of a multibyte UTF-8 sequence). Treating non-ASCII bytes as
/// word-constituent prevents an ASCII stopword from being stripped out of the
/// middle of a mixed-script word: previously `"thé"` (t, h, e, 0xC3, 0xA9) had
/// `"the"` removed because the continuation byte 0xC3 was considered a
/// boundary, leaving `"é"`.
#[inline]
fn is_word_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || b == b'_' || !b.is_ascii()
}
/// Returns the first index at or after `pos` that is not a space or tab
/// (clamped to `bytes.len()`); `pos` itself when it is already out of range.
#[inline]
fn skip_trailing_space(bytes: &[u8], pos: usize) -> usize {
    let rest = bytes.get(pos..).unwrap_or(&[]);
    pos + rest
        .iter()
        .take_while(|&&b| matches!(b, b' ' | b'\t'))
        .count()
}
/// Drops the lowest-importance fraction (roughly `threshold`) of `Para`
/// nodes; non-`Para` nodes are always kept.
///
/// The cutoff is the importance at index `len * threshold` of the sorted
/// importances, and paragraphs must be strictly above it to survive, so ties
/// at the cutoff are removed together. Two safeguards keep the function from
/// emptying a document: inputs with at most one paragraph are returned
/// untouched, and if filtering would remove every paragraph (e.g. all
/// importances equal), the original input is returned instead.
fn prune_low_importance(nodes: Vec<DocNode>, threshold: f32) -> Vec<DocNode> {
    let mut sorted: Vec<f32> = nodes
        .iter()
        .filter_map(|n| match n {
            DocNode::Para { importance, .. } => Some(*importance),
            _ => None,
        })
        .collect();
    if sorted.len() <= 1 {
        return nodes;
    }
    // total_cmp imposes a total order on f32 (NaN included), so sorting can
    // never panic — unlike partial_cmp().unwrap() on NaN importances.
    sorted.sort_unstable_by(f32::total_cmp);
    let cutoff_idx = ((sorted.len() as f32 * threshold) as usize).min(sorted.len() - 1);
    let cutoff = sorted[cutoff_idx];
    let filtered: Vec<DocNode> = nodes
        .iter()
        .filter(|n| match n {
            DocNode::Para { importance, .. } => *importance > cutoff,
            _ => true,
        })
        .cloned()
        .collect();
    // Safety net: we only reach this point when the input had >= 2 paragraphs,
    // so a paragraph-free result can only come from over-filtering — return
    // the input unchanged in that case.
    let filtered_has_para = filtered.iter().any(|n| matches!(n, DocNode::Para { .. }));
    if filtered_has_para {
        filtered
    } else {
        nodes
    }
}
/// Removes `Para` nodes whose whitespace-normalized text has already been
/// seen; the first occurrence wins and non-`Para` nodes are always retained.
fn deduplicate_paras(nodes: Vec<DocNode>) -> Vec<DocNode> {
    use std::collections::HashSet;
    let mut seen: HashSet<String> = HashSet::new();
    nodes
        .into_iter()
        .filter(|node| match node {
            DocNode::Para { text, .. } => {
                // Key on single-space-joined tokens so paragraphs differing
                // only in spacing count as duplicates.
                let key = text.split_whitespace().collect::<Vec<_>>().join(" ");
                // insert() returns false for an already-seen key → filtered out.
                seen.insert(key)
            }
            _ => true,
        })
        .collect()
}
/// Cuts every `Para` node down to its first sentence in place; all other node
/// kinds are passed through unchanged.
fn truncate_to_first_sentence(mut nodes: Vec<DocNode>) -> Vec<DocNode> {
    for node in &mut nodes {
        if let DocNode::Para { text, .. } = node {
            *text = first_sentence(text);
        }
    }
    nodes
}
/// Returns the first sentence of `text` (terminator included), trimmed.
///
/// A sentence ends at the first occurrence of any terminator below — Latin,
/// fullwidth/halfwidth CJK, Devanagari, Arabic, Ethiopic, Canadian-syllabics,
/// and Lisu punctuation. Text without a terminator is returned whole, trimmed.
fn first_sentence(text: &str) -> String {
    const TERMINATORS: &[char] = &[
        '.', '!', '?', '。', '!', '?', '।', '॥', '۔', '።', '᙮', '꓿', '︒', '﹒', '.',
    ];
    match text.find(TERMINATORS) {
        Some(idx) => {
            // Include the terminator character itself in the slice.
            let term_len = text[idx..].chars().next().map_or(0, char::len_utf8);
            text[..idx + term_len].trim().to_string()
        }
        None => text.trim().to_string(),
    }
}
/// Builds the default stopword list: English function words plus Korean
/// connectives/adverbs.
///
/// ASCII entries feed the case-insensitive Aho-Corasick pass; the Korean
/// entries are matched as whole whitespace tokens. The list is duplicate-free
/// (the original contained "다만" twice).
fn default_stopwords() -> Vec<String> {
    let articles = ["a", "an", "the"];
    let conjunctions = ["and", "or", "but", "nor", "yet", "so", "for"];
    let prepositions = [
        "in", "on", "at", "to", "of", "by", "as", "up", "via", "into", "from", "with", "than",
        "about", "over", "after", "before", "between", "through", "during", "within", "without",
    ];
    let auxiliaries = [
        "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do", "does",
        "did", "will", "would", "shall", "should", "may", "might", "must", "can", "could",
    ];
    // Mixed bag of pronouns, determiners, negations, and adverbs.
    let misc_words = [
        "it", "its", "this", "that", "these", "those", "not", "no", "also", "too", "very", "just",
        "such",
    ];
    let korean_connectives = [
        "그리고",
        "하지만",
        "그러나",
        "따라서",
        "또한",
        "즉",
        "및",
        "또는",
        "그래서",
        "그런데",
        "게다가",
        "다만",
        "단지",
        "특히",
        "주로",
        "왜냐하면",
        "그러므로",
        "한편",
        "반면",
        "이처럼",
        "이렇게",
        "이에",
        "이후",
        "이전",
    ];
    articles
        .iter()
        .chain(conjunctions.iter())
        .chain(prepositions.iter())
        .chain(auxiliaries.iter())
        .chain(misc_words.iter())
        .map(|s| s.to_string())
        .chain(korean_connectives.iter().map(|s| s.to_string()))
        .collect()
}
#[cfg(test)]
mod tests {
    use super::*;

    // Convenience constructor for a Para node with the given text/importance.
    fn make_para(text: &str, importance: f32) -> DocNode {
        DocNode::Para {
            text: text.into(),
            importance,
        }
    }

    // Lossless fidelity must bypass every transformation, even under heavy
    // budget pressure (99/100 tokens used).
    #[test]
    fn lossless_skips_all_compression() {
        let nodes = vec![make_para("the quick brown fox", 0.1)];
        let cfg = CompressionConfig {
            budget: 100,
            current_tokens: 99,
            fidelity: FidelityLevel::Lossless,
        };
        let compressor = AdaptiveCompressor::new();
        let result = compressor.compress(nodes.clone(), &cfg);
        if let (DocNode::Para { text: t1, .. }, DocNode::Para { text: t2, .. }) =
            (&nodes[0], &result[0])
        {
            assert_eq!(t1, t2);
        }
    }

    #[test]
    fn new_compressor_has_stopwords() {
        let compressor = AdaptiveCompressor::new();
        assert!(
            compressor.has_stopwords(),
            "default compressor must have a non-empty stopword list"
        );
    }

    #[test]
    fn empty_compressor_has_no_stopwords() {
        let compressor = AdaptiveCompressor::with_stopwords(vec![]);
        assert!(
            !compressor.has_stopwords(),
            "compressor built with empty list must report no stopwords"
        );
    }

    // ASCII stopword removal via the Aho-Corasick pass.
    #[test]
    fn stopword_removal_ascii_works() {
        let compressor = AdaptiveCompressor::new();
        let nodes = vec![make_para("the quick brown fox", 1.0)];
        let cfg = CompressionConfig {
            budget: 1000,
            current_tokens: 100,
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        if let DocNode::Para { text, .. } = &result[0] {
            assert!(
                !text.to_lowercase().starts_with("the "),
                "stopword 'the' must be removed: got '{}'",
                text
            );
        }
    }

    #[test]
    fn with_stopwords_removes_specified_ascii_words() {
        let compressor = AdaptiveCompressor::with_stopwords(vec!["hello".into(), "world".into()]);
        let nodes = vec![make_para("hello world foo", 1.0)];
        let cfg = CompressionConfig {
            budget: 1000,
            current_tokens: 100,
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        if let DocNode::Para { text, .. } = &result[0] {
            assert!(
                !text.to_lowercase().contains("hello"),
                "'hello' must be removed: got '{}'",
                text
            );
            assert!(
                !text.to_lowercase().contains("world"),
                "'world' must be removed: got '{}'",
                text
            );
            assert!(text.contains("foo"), "'foo' must remain: got '{}'", text);
        }
    }

    // Non-ASCII stopwords go through the exact-token pass, not Aho-Corasick.
    #[test]
    fn nonascii_stopword_removal_works() {
        let compressor = AdaptiveCompressor::new();
        let nodes = vec![make_para("사과 그리고 바나나", 1.0)];
        let cfg = CompressionConfig {
            budget: 1000,
            current_tokens: 100,
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        if let DocNode::Para { text, .. } = &result[0] {
            assert!(
                !text.contains("그리고"),
                "Korean connective '그리고' must be removed: got '{}'",
                text
            );
            assert!(text.contains("사과"), "'사과' must remain: got '{}'", text);
            assert!(
                text.contains("바나나"),
                "'바나나' must remain: got '{}'",
                text
            );
        }
    }

    // Exact-token matching: a stopword embedded in a longer token must stay.
    #[test]
    fn nonascii_stopword_partial_match_not_removed() {
        let compressor = AdaptiveCompressor::with_stopwords(vec!["그리고".into()]);
        let nodes = vec![make_para("그리고나서 확인", 1.0)];
        let cfg = CompressionConfig {
            budget: 1000,
            current_tokens: 100,
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        if let DocNode::Para { text, .. } = &result[0] {
            assert!(
                text.contains("그리고나서"),
                "'그리고나서' must NOT be removed (not an exact token): got '{}'",
                text
            );
        }
    }

    #[test]
    fn prune_low_importance_removes_bottom_20_pct() {
        let nodes = vec![
            make_para("중요 단락", 0.9),
            make_para("보통 단락", 0.5),
            make_para("낮은 단락", 0.1),
            make_para("낮은 단락2", 0.05),
            make_para("낮은 단락3", 0.02),
        ];
        let result = prune_low_importance(nodes, 0.20);
        assert!(result.len() < 5, "some nodes must be removed");
    }

    // Dedup keys on whitespace-normalized text; first occurrence wins.
    #[test]
    fn deduplicate_removes_duplicates() {
        let nodes = vec![
            make_para("동일한 내용입니다.", 1.0),
            make_para("다른 내용입니다.", 1.0),
            make_para("동일한 내용입니다.", 0.9),
        ];
        let result = deduplicate_paras(nodes);
        assert_eq!(result.len(), 2, "one duplicate paragraph must be removed");
    }

    #[test]
    fn first_sentence_extraction() {
        assert_eq!(first_sentence("안녕하세요. 반갑습니다."), "안녕하세요.");
        assert_eq!(
            first_sentence("문장 부호 없는 텍스트"),
            "문장 부호 없는 텍스트"
        );
        assert_eq!(first_sentence("Hello world! Bye."), "Hello world!");
    }

    // Covers Devanagari danda, Arabic full stop, Ethiopic full stop, and the
    // halfwidth CJK period.
    #[test]
    fn first_sentence_multilingual() {
        assert_eq!(
            first_sentence("यह पहला वाक्य है। यह दूसरा है।"),
            "यह पहला वाक्य है।"
        );
        assert_eq!(
            first_sentence("هذه الجملة الأولى۔ هذه الثانية۔"),
            "هذه الجملة الأولى۔"
        );
        assert_eq!(
            first_sentence("ይህ የመጀመሪያ ዓረፍተ ነገር ነው። ሁለተኛ።"),
            "ይህ የመጀመሪያ ዓረፍተ ነገር ነው።"
        );
        assert_eq!(
            first_sentence("これが最初の文です.これが二番目です."),
            "これが最初の文です."
        );
    }

    // 65/100 tokens puts the stage at PruneLowImportance; the single-paragraph
    // early return must keep the sole paragraph.
    #[test]
    fn prune_keeps_single_paragraph() {
        let compressor = AdaptiveCompressor::with_stopwords(vec![]);
        let nodes = vec![make_para("only paragraph", 0.1)];
        let cfg = CompressionConfig {
            budget: 100,
            current_tokens: 65,
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        assert_eq!(
            result.len(),
            1,
            "the sole paragraph in a single-paragraph document must not be removed"
        );
    }

    // All-equal importances would filter everything; the fallback must return
    // the original nodes instead.
    #[test]
    fn prune_keeps_all_equal_importance_paragraphs() {
        let compressor = AdaptiveCompressor::with_stopwords(vec![]);
        let nodes = vec![
            make_para("first", 0.5),
            make_para("second", 0.5),
            make_para("third", 0.5),
        ];
        let cfg = CompressionConfig {
            budget: 100,
            current_tokens: 65,
            fidelity: FidelityLevel::Semantic,
        };
        let result = compressor.compress(nodes, &cfg);
        assert_eq!(
            result.len(),
            3,
            "paragraphs with equal importance must not all be removed"
        );
    }

    // Whole-word boundary checks: 'the' inside 'theory', 'there', 'gather'
    // must survive.
    #[test]
    fn ascii_stopword_respects_word_boundaries() {
        let compressor = AdaptiveCompressor::with_stopwords(vec!["the".into()]);
        let cfg = CompressionConfig {
            budget: 1000,
            current_tokens: 100,
            fidelity: FidelityLevel::Semantic,
        };
        let nodes = vec![make_para("the cat sat", 1.0)];
        let result = compressor.compress(nodes, &cfg);
        if let DocNode::Para { text, .. } = &result[0] {
            assert!(
                !text.to_lowercase().starts_with("the "),
                "standalone 'the' at start must be removed: got '{}'",
                text
            );
            assert!(
                text.contains("cat") && text.contains("sat"),
                "non-stopword tokens must remain: got '{}'",
                text
            );
        }
        let nodes2 = vec![make_para("theory is important", 1.0)];
        let result2 = compressor.compress(nodes2, &cfg);
        if let DocNode::Para { text, .. } = &result2[0] {
            assert!(
                text.contains("theory"),
                "'theory' must not be modified by stopword 'the': got '{}'",
                text
            );
        }
        let nodes3 = vec![make_para("there are cats", 1.0)];
        let result3 = compressor.compress(nodes3, &cfg);
        if let DocNode::Para { text, .. } = &result3[0] {
            assert!(
                text.contains("there"),
                "'there' must not be modified by stopword 'the': got '{}'",
                text
            );
        }
        let nodes4 = vec![make_para("we gather here", 1.0)];
        let result4 = compressor.compress(nodes4, &cfg);
        if let DocNode::Para { text, .. } = &result4[0] {
            assert!(
                text.contains("gather"),
                "'gather' must not be modified by stopword 'the': got '{}'",
                text
            );
        }
    }

    // Pin the stage boundaries at 0.60 / 0.80 / 0.95 of the budget.
    #[test]
    fn stage_thresholds() {
        let base = CompressionConfig {
            budget: 100,
            current_tokens: 0,
            fidelity: FidelityLevel::Semantic,
        };
        let at = |tokens| CompressionConfig {
            current_tokens: tokens,
            ..base.clone()
        };
        assert_eq!(at(50).stage(), CompressionStage::StopwordOnly);
        assert_eq!(at(70).stage(), CompressionStage::PruneLowImportance);
        assert_eq!(at(85).stage(), CompressionStage::DeduplicateAndLinearize);
        assert_eq!(at(96).stage(), CompressionStage::MaxCompression);
    }
}