use once_cell::sync::Lazy;
use regex::Regex;
/// Matches a run of one or more whitespace characters (`\s+`).
///
/// Built on first access via `Lazy`; the `unwrap` can only fail if the literal
/// pattern itself is invalid.
static WHITESPACE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\s+").unwrap());
/// Matches a single decorative glyph from a fixed character class (arrows,
/// triangles, bullets, diamonds, check marks and crosses).
///
/// Built on first access via `Lazy`; the `unwrap` can only fail if the literal
/// pattern itself is invalid.
static DECORATIVE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[▶→←▼▲•·◆◇◊✓✗✔✘‣⁃◦]").unwrap());
/// Normalizes a piece of text into a compact single-line form.
///
/// Every glyph matched by `DECORATIVE_RE` is removed, each run of whitespace
/// matched by `WHITESPACE_RE` is replaced by one space, and leading/trailing
/// whitespace is trimmed from the result.
pub fn compress_text(text: &str) -> String {
    // Glyphs are stripped first so the whitespace left around them is
    // collapsed by the second pass.
    let without_glyphs = DECORATIVE_RE.replace_all(text, "");
    WHITESPACE_RE
        .replace_all(&without_glyphs, " ")
        .trim()
        .to_owned()
}
pub fn compress_block(text: &str) -> String {
let mut lines: Vec<String> = Vec::new();
let mut prev_blank = false;
for raw in text.lines() {
let line = compress_text(raw);
let blank = line.is_empty();
if blank && prev_blank {
continue;
}
lines.push(line);
prev_blank = blank;
}
lines.join("\n").trim().to_string()
}
/// Returns a rough token estimate for `text`.
///
/// The estimate is one token per four bytes, plus one extra token for every
/// two bytes of URL-style punctuation (`/ : . ? # & = % ~`). Both divisions
/// are integer divisions and round down.
pub fn estimate_tokens(text: &str) -> usize {
    /// Bytes counted as URL-style punctuation.
    const URL_PUNCT: &[u8] = b"/:.?#&=%~";

    let punct_hits = text
        .bytes()
        .filter(|byte| URL_PUNCT.contains(byte))
        .count();
    text.len() / 4 + punct_hits / 2
}
/// Lower bound on the body budget used by `truncate_preserving_refs`: the body
/// is always granted at least this many tokens, even when the references block
/// alone is estimated to use up the caller's `max_tokens`.
const MIN_BODY_TOKENS: usize = 64;
/// Cuts `text` down to roughly `max_tokens` tokens, assuming four bytes per token.
///
/// Text that already fits within `max_tokens * 4` bytes is returned unchanged.
/// Otherwise the text is cut at the last UTF-8 character boundary at or below
/// that byte limit, and a `…[truncated]` marker is appended on a new line. The
/// marker is added on top of the byte limit.
pub fn truncate_to_tokens(text: &str, max_tokens: usize) -> String {
    let byte_budget = max_tokens.saturating_mul(4);
    if text.len() <= byte_budget {
        return text.to_owned();
    }
    // Scan downward from the budget for a valid cut point. Index 0 is always a
    // character boundary, so the search cannot come up empty.
    let cut = (0..=byte_budget)
        .rev()
        .find(|&idx| text.is_char_boundary(idx))
        .unwrap_or(0);
    format!("{}\n…[truncated]", &text[..cut])
}
/// Truncates `content` to a token budget while keeping `refs_block` intact at the end.
///
/// * With an empty `refs_block`, this is plain [`truncate_to_tokens`] on `content`.
/// * Otherwise the body is `content` with a trailing `refs_block` (and the newlines
///   before it) removed; if `content` does not end with `refs_block`, the whole
///   `content` is treated as the body.
/// * The body is truncated to `max_tokens` minus the estimated size of `refs_block`,
///   but never to fewer than `MIN_BODY_TOKENS` tokens, so the result can exceed
///   `max_tokens` when the references block is large.
/// * The result is the (possibly truncated) body, a blank line, then `refs_block`.
///   When there is no body at all, `refs_block` is returned on its own rather than
///   being prefixed with a stray blank-line separator.
pub fn truncate_preserving_refs(content: &str, refs_block: &str, max_tokens: usize) -> String {
    if refs_block.is_empty() {
        return truncate_to_tokens(content, max_tokens);
    }
    let body = content
        .strip_suffix(refs_block)
        .map_or(content, |stripped| stripped.trim_end_matches('\n'));
    if body.is_empty() {
        // Nothing precedes the references; emitting the "\n\n" separator would
        // only add leading blank lines to the output.
        return refs_block.to_string();
    }
    let refs_tokens = estimate_tokens(refs_block);
    // The references are never cut, so their cost comes out of the body's share,
    // with a floor so the body is not squeezed out entirely.
    let body_budget = max_tokens.saturating_sub(refs_tokens).max(MIN_BODY_TOKENS);
    let body = truncate_to_tokens(body, body_budget);
    format!("{body}\n\n{refs_block}")
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Text with no URL punctuation is estimated at one token per four bytes.
    #[test]
    fn estimate_unchanged_for_plain_prose() {
        let prose = "a".repeat(100);
        assert_eq!(estimate_tokens(&prose), 25);
    }

    /// URL punctuation must push the estimate above same-length plain text.
    #[test]
    fn url_heavy_text_estimates_higher_than_prose_of_same_length() {
        let urls = [
            "[1] https://example.com/a/b?c=d#e",
            "[2] https://example.org/x/y/z?q=1",
            "[3] https://sub.example.net/path/to/thing",
            "[4] https://example.io/foo/bar/baz?k=v",
        ]
        .join("\n");
        let prose = "x".repeat(urls.len());
        assert_eq!(urls.len(), prose.len());

        let url_estimate = estimate_tokens(&urls);
        let prose_estimate = estimate_tokens(&prose);
        assert!(
            url_estimate > prose_estimate,
            "urls={} prose={}",
            url_estimate,
            prose_estimate
        );
    }

    /// An over-budget body is cut and marked, while the references survive verbatim.
    #[test]
    fn preserving_refs_keeps_block_intact_when_body_truncated() {
        let refs_block = "References:\n[1] https://example.com/a\n[2] https://example.com/b";
        let long_body = "word ".repeat(500);
        let content = format!("{}\n\n{}", long_body.trim_end(), refs_block);

        let out = truncate_preserving_refs(&content, refs_block, 80);

        assert!(out.contains("…[truncated]"), "out: {out}");
        assert!(out.ends_with(refs_block), "out tail: {:?}", out);
    }

    /// Content that fits the budget comes back unchanged.
    #[test]
    fn preserving_refs_is_noop_when_within_budget() {
        let refs_block = "References:\n[1] https://example.com/a";
        let content = format!("short body\n\n{refs_block}");

        let out = truncate_preserving_refs(&content, refs_block, 10_000);

        assert_eq!(out, content);
    }

    /// With no references block, the result matches plain truncation.
    #[test]
    fn preserving_refs_without_block_falls_back_to_plain_truncate() {
        let content = "z".repeat(1000);
        let expected = truncate_to_tokens(&content, 10);

        let out = truncate_preserving_refs(&content, "", 10);

        assert_eq!(out, expected);
    }
}