Skip to main content

zerobox_utils_string/
truncate.rs

1//! Utilities for truncating large chunks of output while preserving a prefix
2//! and suffix on UTF-8 boundaries.
3
/// Heuristic number of UTF-8 bytes treated as one token by the `approx_*`
/// helpers in this file when converting between byte lengths and token counts.
const APPROX_BYTES_PER_TOKEN: usize = 4;
5
6/// Truncate a string to `max_bytes` using a character-count marker.
7pub fn truncate_middle_chars(s: &str, max_bytes: usize) -> String {
8    truncate_with_byte_estimate(s, max_bytes, /*use_tokens*/ false)
9}
10
11/// Truncate the middle of a UTF-8 string to at most `max_tokens` approximate
12/// tokens, preserving the beginning and the end. Returns the possibly
13/// truncated string and `Some(original_token_count)` if truncation occurred;
14/// otherwise returns the original string and `None`.
15pub fn truncate_middle_with_token_budget(s: &str, max_tokens: usize) -> (String, Option<u64>) {
16    if s.is_empty() {
17        return (String::new(), None);
18    }
19
20    if max_tokens > 0 && s.len() <= approx_bytes_for_tokens(max_tokens) {
21        return (s.to_string(), None);
22    }
23
24    let truncated = truncate_with_byte_estimate(
25        s,
26        approx_bytes_for_tokens(max_tokens),
27        /*use_tokens*/ true,
28    );
29    let total_tokens = u64::try_from(approx_token_count(s)).unwrap_or(u64::MAX);
30
31    if truncated == s {
32        (truncated, None)
33    } else {
34        (truncated, Some(total_tokens))
35    }
36}
37
38fn truncate_with_byte_estimate(s: &str, max_bytes: usize, use_tokens: bool) -> String {
39    if s.is_empty() {
40        return String::new();
41    }
42
43    let total_chars = s.chars().count();
44
45    if max_bytes == 0 {
46        return format_truncation_marker(
47            use_tokens,
48            removed_units(use_tokens, s.len(), total_chars),
49        );
50    }
51
52    if s.len() <= max_bytes {
53        return s.to_string();
54    }
55
56    let total_bytes = s.len();
57    let (left_budget, right_budget) = split_budget(max_bytes);
58    let (removed_chars, left, right) = split_string(s, left_budget, right_budget);
59    let marker = format_truncation_marker(
60        use_tokens,
61        removed_units(
62            use_tokens,
63            total_bytes.saturating_sub(max_bytes),
64            removed_chars,
65        ),
66    );
67
68    assemble_truncated_output(left, right, &marker)
69}
70
71pub fn approx_token_count(text: &str) -> usize {
72    let len = text.len();
73    len.saturating_add(APPROX_BYTES_PER_TOKEN.saturating_sub(1)) / APPROX_BYTES_PER_TOKEN
74}
75
76pub fn approx_bytes_for_tokens(tokens: usize) -> usize {
77    tokens.saturating_mul(APPROX_BYTES_PER_TOKEN)
78}
79
80pub fn approx_tokens_from_byte_count(bytes: usize) -> u64 {
81    let bytes_u64 = bytes as u64;
82    bytes_u64.saturating_add((APPROX_BYTES_PER_TOKEN as u64).saturating_sub(1))
83        / (APPROX_BYTES_PER_TOKEN as u64)
84}
85
86fn split_string(s: &str, beginning_bytes: usize, end_bytes: usize) -> (usize, &str, &str) {
87    if s.is_empty() {
88        return (0, "", "");
89    }
90
91    let len = s.len();
92    let tail_start_target = len.saturating_sub(end_bytes);
93    let mut prefix_end = 0usize;
94    let mut suffix_start = len;
95    let mut removed_chars = 0usize;
96    let mut suffix_started = false;
97
98    for (idx, ch) in s.char_indices() {
99        let char_end = idx + ch.len_utf8();
100        if char_end <= beginning_bytes {
101            prefix_end = char_end;
102            continue;
103        }
104
105        if idx >= tail_start_target {
106            if !suffix_started {
107                suffix_start = idx;
108                suffix_started = true;
109            }
110            continue;
111        }
112
113        removed_chars = removed_chars.saturating_add(1);
114    }
115
116    if suffix_start < prefix_end {
117        suffix_start = prefix_end;
118    }
119
120    let before = &s[..prefix_end];
121    let after = &s[suffix_start..];
122
123    (removed_chars, before, after)
124}
125
/// Divide a byte budget into `(left, right)` halves that sum to `budget`.
/// When `budget` is odd, the extra byte goes to the right half.
fn split_budget(budget: usize) -> (usize, usize) {
    let half = budget / 2;
    let leftover = budget % 2;
    (half, half + leftover)
}
130
/// Build the marker inserted in place of removed text. `removed_count` is
/// reported as tokens when `use_tokens` is `true`, and as chars otherwise.
fn format_truncation_marker(use_tokens: bool, removed_count: u64) -> String {
    if !use_tokens {
        return format!("…{removed_count} chars truncated…");
    }
    format!("…{removed_count} tokens truncated…")
}
138
139fn removed_units(use_tokens: bool, removed_bytes: usize, removed_chars: usize) -> u64 {
140    if use_tokens {
141        approx_tokens_from_byte_count(removed_bytes)
142    } else {
143        u64::try_from(removed_chars).unwrap_or(u64::MAX)
144    }
145}
146
/// Concatenate the pieces of a truncated string as `prefix + marker + suffix`.
///
/// Note the parameter order: `suffix` comes before `marker` in the signature
/// even though the marker sits between the two kept slices in the output.
fn assemble_truncated_output(prefix: &str, suffix: &str, marker: &str) -> String {
    let pieces = [prefix, marker, suffix];
    // Reserve the exact combined length plus one spare byte up front.
    let capacity = prefix.len() + marker.len() + suffix.len() + 1;
    let mut out = String::with_capacity(capacity);
    for piece in pieces.iter() {
        out.push_str(piece);
    }
    out
}
154
155#[cfg(test)]
156mod tests;