Skip to main content

webfetch_core/
compress.rs

1use once_cell::sync::Lazy;
2use regex::Regex;
3
4static WHITESPACE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\s+").unwrap());
5static DECORATIVE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[▶→←▼▲•·◆◇◊✓✗✔✘‣⁃◦]").unwrap());
6
7/// Semantic text reduction: strip decorative glyphs, then collapse runs of
8/// whitespace, then trim.
9///
10/// Order matters — decorative characters are removed *before* collapsing
11/// whitespace so that a glyph surrounded by spaces (e.g. `"Click ▶ to play"`)
12/// does not leave a double space behind.
13pub fn compress_text(text: &str) -> String {
14    let clean = DECORATIVE_RE.replace_all(text, "");
15    let collapsed = WHITESPACE_RE.replace_all(&clean, " ");
16    collapsed.trim().to_string()
17}
18
19/// Collapse repeated blank lines while preserving paragraph breaks, and
20/// compress whitespace within each line.
21pub fn compress_block(text: &str) -> String {
22    let mut lines: Vec<String> = Vec::new();
23    let mut prev_blank = false;
24    for raw in text.lines() {
25        let line = compress_text(raw);
26        let blank = line.is_empty();
27        if blank && prev_blank {
28            continue;
29        }
30        lines.push(line);
31        prev_blank = blank;
32    }
33    lines.join("\n").trim().to_string()
34}
35
/// Cheap token-count estimate for budgeting.
///
/// Uses the rule of thumb of roughly four bytes of UTF-8 text per token:
/// the byte length of `text` is divided by four, rounding down. Text shorter
/// than four bytes therefore estimates to zero tokens.
pub fn estimate_tokens(text: &str) -> usize {
    const BYTES_PER_TOKEN: usize = 4;
    let byte_len = text.len();
    byte_len / BYTES_PER_TOKEN
}
41
/// Cut `text` down to approximately `max_tokens` tokens.
///
/// The budget is `max_tokens * 4` bytes (saturating on overflow). Text that
/// already fits is returned unchanged. Otherwise the text is cut at the last
/// UTF-8 character boundary at or below the budget, and the marker
/// `"\n…[truncated]"` is appended to signal that content was dropped. The
/// marker is added on top of the budget, so a truncated result can be
/// slightly longer than `max_tokens * 4` bytes.
pub fn truncate_to_tokens(text: &str, max_tokens: usize) -> String {
    let byte_budget = max_tokens.saturating_mul(4);
    if text.len() <= byte_budget {
        return text.to_string();
    }

    // Walk backwards from the budget to the nearest character boundary.
    // Index 0 is always a boundary, so the search cannot come up empty.
    let cut = (0..=byte_budget)
        .rev()
        .find(|&idx| text.is_char_boundary(idx))
        .unwrap_or(0);

    let marker = "\n…[truncated]";
    let mut out = String::with_capacity(cut + marker.len());
    out.push_str(&text[..cut]);
    out.push_str(marker);
    out
}