webfetch_core/
compress.rs1use once_cell::sync::Lazy;
2use regex::Regex;
3
4static WHITESPACE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\s+").unwrap());
5static DECORATIVE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[▶→←▼▲•·◆◇◊✓✗✔✘‣⁃◦]").unwrap());
6
7pub fn compress_text(text: &str) -> String {
14 let clean = DECORATIVE_RE.replace_all(text, "");
15 let collapsed = WHITESPACE_RE.replace_all(&clean, " ");
16 collapsed.trim().to_string()
17}
18
19pub fn compress_block(text: &str) -> String {
22 let mut lines: Vec<String> = Vec::new();
23 let mut prev_blank = false;
24 for raw in text.lines() {
25 let line = compress_text(raw);
26 let blank = line.is_empty();
27 if blank && prev_blank {
28 continue;
29 }
30 lines.push(line);
31 prev_blank = blank;
32 }
33 lines.join("\n").trim().to_string()
34}
35
/// Returns a cheap heuristic estimate of how many tokens `text` occupies.
///
/// The estimate is one token per four bytes of input, plus one extra token
/// for every two bytes that are URL-style punctuation (`/ : . ? # & = % ~`).
/// Both terms use integer division, so remainders are discarded separately.
pub fn estimate_tokens(text: &str) -> usize {
    /// Bytes that add to the estimate on top of the length-based baseline.
    const URL_PUNCT: &[u8] = b"/:.?#&=%~";

    let punct_hits = text
        .bytes()
        .filter(|byte| URL_PUNCT.contains(byte))
        .count();
    text.len() / 4 + punct_hits / 2
}
59
/// Lower bound, in estimated tokens, on the budget that
/// `truncate_preserving_refs` grants to the body text, applied even when the
/// reference block alone would use up the caller's whole budget.
const MIN_BODY_TOKENS: usize = 64;
63
/// Cuts `text` down to roughly `max_tokens` tokens.
///
/// The budget is converted to bytes at four bytes per token (saturating on
/// overflow). Text that already fits is returned unchanged. Otherwise the
/// text is cut at the last UTF-8 character boundary at or below the byte
/// budget and a `\n…[truncated]` marker is appended; the marker itself is
/// not counted against the budget.
pub fn truncate_to_tokens(text: &str, max_tokens: usize) -> String {
    const MARKER: &str = "\n…[truncated]";

    let byte_budget = max_tokens.saturating_mul(4);
    if byte_budget >= text.len() {
        return text.to_owned();
    }

    // Index 0 is always a character boundary, so the search cannot come up
    // empty; `unwrap_or(0)` only avoids a panic path.
    let cut = (0..=byte_budget)
        .rev()
        .find(|&idx| text.is_char_boundary(idx))
        .unwrap_or(0);

    let mut clipped = String::with_capacity(cut + MARKER.len());
    clipped.push_str(&text[..cut]);
    clipped.push_str(MARKER);
    clipped
}
77
78pub fn truncate_preserving_refs(content: &str, refs_block: &str, max_tokens: usize) -> String {
88 if refs_block.is_empty() {
89 return truncate_to_tokens(content, max_tokens);
90 }
91 let body = content
93 .strip_suffix(refs_block)
94 .map(|b| b.trim_end_matches('\n'))
95 .unwrap_or(content);
96
97 let refs_tokens = estimate_tokens(refs_block);
98 let body_budget = max_tokens.saturating_sub(refs_tokens).max(MIN_BODY_TOKENS);
99 let body = truncate_to_tokens(body, body_budget);
100 format!("{body}\n\n{refs_block}")
101}
102
#[cfg(test)]
mod tests {
    use super::*;

    /// Text without URL punctuation costs exactly one token per four bytes.
    #[test]
    fn estimate_unchanged_for_plain_prose() {
        let prose = "a".repeat(100);
        assert_eq!(estimate_tokens(&prose), 25);
    }

    /// URL punctuation must push the estimate above same-length plain text.
    #[test]
    fn url_heavy_text_estimates_higher_than_prose_of_same_length() {
        let urls = [
            "[1] https://example.com/a/b?c=d#e",
            "[2] https://example.org/x/y/z?q=1",
            "[3] https://sub.example.net/path/to/thing",
            "[4] https://example.io/foo/bar/baz?k=v",
        ]
        .join("\n");
        let prose = "x".repeat(urls.len());
        assert_eq!(urls.len(), prose.len());

        let url_estimate = estimate_tokens(&urls);
        let prose_estimate = estimate_tokens(&prose);
        assert!(
            url_estimate > prose_estimate,
            "urls={} prose={}",
            url_estimate,
            prose_estimate
        );
    }

    /// An over-budget body is truncated while the reference block survives verbatim.
    #[test]
    fn preserving_refs_keeps_block_intact_when_body_truncated() {
        let refs_block = "References:\n[1] https://example.com/a\n[2] https://example.com/b";
        let body = "word ".repeat(500);
        let content = [body.trim_end(), refs_block].join("\n\n");

        let out = truncate_preserving_refs(&content, refs_block, 80);

        assert!(out.contains("…[truncated]"), "out: {out}");
        assert!(out.ends_with(refs_block), "out tail: {:?}", out);
    }

    /// Content that fits the budget comes back unchanged.
    #[test]
    fn preserving_refs_is_noop_when_within_budget() {
        let refs_block = "References:\n[1] https://example.com/a";
        let content = ["short body", refs_block].join("\n\n");

        let out = truncate_preserving_refs(&content, refs_block, 10_000);

        assert_eq!(out, content);
    }

    /// An empty reference block makes the function behave like `truncate_to_tokens`.
    #[test]
    fn preserving_refs_without_block_falls_back_to_plain_truncate() {
        let content = "z".repeat(1000);
        let expected = truncate_to_tokens(&content, 10);

        let out = truncate_preserving_refs(&content, "", 10);

        assert_eq!(out, expected);
    }
}