Skip to main content

webfetch_core/
compress.rs

1use once_cell::sync::Lazy;
2use regex::Regex;
3
4static WHITESPACE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\s+").unwrap());
5static DECORATIVE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[▶→←▼▲•·◆◇◊✓✗✔✘‣⁃◦]").unwrap());
6
7/// Semantic text reduction: strip decorative glyphs, then collapse runs of
8/// whitespace, then trim.
9///
10/// Order matters — decorative characters are removed *before* collapsing
11/// whitespace so that a glyph surrounded by spaces (e.g. `"Click ▶ to play"`)
12/// does not leave a double space behind.
13pub fn compress_text(text: &str) -> String {
14    let clean = DECORATIVE_RE.replace_all(text, "");
15    let collapsed = WHITESPACE_RE.replace_all(&clean, " ");
16    collapsed.trim().to_string()
17}
18
19/// Collapse repeated blank lines while preserving paragraph breaks, and
20/// compress whitespace within each line.
21pub fn compress_block(text: &str) -> String {
22    let mut lines: Vec<String> = Vec::new();
23    let mut prev_blank = false;
24    for raw in text.lines() {
25        let line = compress_text(raw);
26        let blank = line.is_empty();
27        if blank && prev_blank {
28            continue;
29        }
30        lines.push(line);
31        prev_blank = blank;
32    }
33    lines.join("\n").trim().to_string()
34}
35
/// Cheap, deterministic estimate of how many tokens `text` will occupy.
///
/// The baseline is one token per four bytes, which tracks common BPE
/// tokenizers well enough on prose for budgeting purposes. URLs and reference
/// lists are much denser in punctuation, and BPE tends to split on
/// `/ : . ? # & = % ~`, so the plain `len / 4` figure *under*-counts them.
/// To compensate, every such punctuation byte adds half a token on top of the
/// baseline. Prose contains few of these bytes, so its estimate barely moves.
///
/// Both divisions are integer divisions, and the whole computation is a
/// single pass over the bytes of `text`.
pub fn estimate_tokens(text: &str) -> usize {
    // Bytes that earn the half-token surcharge.
    const URL_PUNCT: &[u8] = b"/:.?#&=%~";

    let surcharge_bytes = text.bytes().filter(|b| URL_PUNCT.contains(b)).count();
    text.len() / 4 + surcharge_bytes / 2
}
59
/// Floor, in tokens, for the body budget that remains once space for a
/// reference block has been set aside. It guarantees that a page consisting
/// mostly of links still shows *some* body text.
const MIN_BODY_TOKENS: usize = 64;
63
/// Cut `text` down to approximately `max_tokens` tokens.
///
/// The budget is converted to bytes at four bytes per token. Text that
/// already fits is returned unchanged. Otherwise the text is cut at the last
/// UTF-8 character boundary at or below that byte limit, and an elision
/// marker (`…[truncated]` on its own line) is appended to signal the loss.
pub fn truncate_to_tokens(text: &str, max_tokens: usize) -> String {
    const MARKER: &str = "\n…[truncated]";

    let byte_limit = max_tokens.saturating_mul(4);
    if text.len() <= byte_limit {
        return text.to_owned();
    }

    // Walk back from the limit to the nearest character boundary; index 0 is
    // always a boundary, so the search cannot come up empty.
    let cut = (0..=byte_limit)
        .rev()
        .find(|&idx| text.is_char_boundary(idx))
        .unwrap_or(0);

    let mut out = String::with_capacity(cut + MARKER.len());
    out.push_str(&text[..cut]);
    out.push_str(MARKER);
    out
}
77
78/// Truncate `content` to `max_tokens` while keeping a trailing `refs_block`
79/// (a rendered `References:` list) intact.
80///
81/// Reference-style output appends the block to the end of the content; a plain
82/// `truncate_to_tokens` over the whole string would cut the references off,
83/// leaving inline `[N]` markers that resolve to nothing. Instead we strip the
84/// block, truncate only the body to the budget left after reserving room for
85/// the block (floored at [`MIN_BODY_TOKENS`] so a link-heavy page keeps some
86/// body), then re-append the block whole.
87pub fn truncate_preserving_refs(content: &str, refs_block: &str, max_tokens: usize) -> String {
88    if refs_block.is_empty() {
89        return truncate_to_tokens(content, max_tokens);
90    }
91    // The block sits at the very end, joined to the body by a blank line.
92    let body = content
93        .strip_suffix(refs_block)
94        .map(|b| b.trim_end_matches('\n'))
95        .unwrap_or(content);
96
97    let refs_tokens = estimate_tokens(refs_block);
98    let body_budget = max_tokens.saturating_sub(refs_tokens).max(MIN_BODY_TOKENS);
99    let body = truncate_to_tokens(body, body_budget);
100    format!("{body}\n\n{refs_block}")
101}
102
#[cfg(test)]
mod tests {
    use super::*;

    /// Text without URL punctuation is estimated at exactly `len / 4`.
    #[test]
    fn estimate_unchanged_for_plain_prose() {
        let prose = "a".repeat(100);
        assert_eq!(estimate_tokens(&prose), 25);
    }

    /// Punctuation-dense reference lines must score above filler text of the
    /// same byte length.
    #[test]
    fn url_heavy_text_estimates_higher_than_prose_of_same_length() {
        let urls = [
            "[1] https://example.com/a/b?c=d#e",
            "[2] https://example.org/x/y/z?q=1",
            "[3] https://sub.example.net/path/to/thing",
            "[4] https://example.io/foo/bar/baz?k=v",
        ]
        .join("\n");
        // Same byte length, but no URL punctuation at all.
        let prose = "x".repeat(urls.len());
        assert_eq!(urls.len(), prose.len());

        let url_estimate = estimate_tokens(&urls);
        let prose_estimate = estimate_tokens(&prose);
        assert!(
            url_estimate > prose_estimate,
            "urls={} prose={}",
            url_estimate,
            prose_estimate
        );
    }

    /// An oversized body is cut and marked, while the reference block is
    /// still present, complete, at the very end.
    #[test]
    fn preserving_refs_keeps_block_intact_when_body_truncated() {
        let refs_block = "References:\n[1] https://example.com/a\n[2] https://example.com/b";
        // 500 words: far over an 80-token budget.
        let body = vec!["word"; 500].join(" ");
        let content = format!("{body}\n\n{refs_block}");

        let out = truncate_preserving_refs(&content, refs_block, 80);

        assert!(out.contains("…[truncated]"), "out: {out}");
        assert!(out.ends_with(refs_block), "out tail: {:?}", out);
    }

    /// Content that already fits comes back byte-for-byte.
    #[test]
    fn preserving_refs_is_noop_when_within_budget() {
        let refs_block = "References:\n[1] https://example.com/a";
        let content = format!("short body\n\n{refs_block}");

        let out = truncate_preserving_refs(&content, refs_block, 10_000);

        assert_eq!(out, content);
    }

    /// With no reference block the function behaves like `truncate_to_tokens`.
    #[test]
    fn preserving_refs_without_block_falls_back_to_plain_truncate() {
        let content = "z".repeat(1000);
        let expected = truncate_to_tokens(&content, 10);

        let out = truncate_preserving_refs(&content, "", 10);

        assert_eq!(out, expected);
    }
}
155}