Skip to main content

zeph_common/
text.rs

// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
// SPDX-License-Identifier: MIT OR Apache-2.0

//! String utility functions for Unicode-safe text manipulation.
5
/// Render a token count as a compact, human-readable string.
///
/// The magnitude of `n` selects the unit:
///
/// - below `1_000`            → plain decimal digits
/// - `1_000 ..= 999_999`      → thousands with one decimal, `"{:.1}k"`
/// - `1_000_000` and above    → millions with one decimal, `"{:.1}M"`
///
/// NOTE(review): the value is rounded to one decimal *after* the unit is
/// chosen, so counts just below a unit boundary may render as `1000.0k`
/// rather than `1.0M` — confirm whether callers care.
///
/// # Examples
///
/// ```
/// use zeph_common::text::format_tokens;
///
/// assert_eq!(format_tokens(0), "0");
/// assert_eq!(format_tokens(999), "999");
/// assert_eq!(format_tokens(1_500), "1.5k");
/// assert_eq!(format_tokens(2_000_000), "2.0M");
/// ```
#[must_use]
// The `u64 -> f64` casts can lose low-order bits for very large counts; that
// is harmless when only one decimal place is displayed.
#[allow(clippy::cast_precision_loss)]
pub fn format_tokens(n: u64) -> String {
    match n {
        0..=999 => n.to_string(),
        1_000..=999_999 => format!("{:.1}k", n as f64 / 1_000.0),
        _ => format!("{:.1}M", n as f64 / 1_000_000.0),
    }
}
33
/// Truncate `s` to at most `max_bytes` bytes without splitting a UTF-8 scalar.
///
/// Returns an owned `String`. If `s` already fits within `max_bytes`, the
/// result is an unchanged copy. Otherwise the cut point is the largest char
/// boundary that is `<= max_bytes`, found by stepping backwards from
/// `max_bytes`. Because a UTF-8 scalar occupies at most four bytes, this takes
/// at most three steps regardless of the length of `s`.
///
/// A `max_bytes` of zero, or one smaller than the first scalar of `s`, yields
/// an empty string.
#[must_use]
pub fn truncate_to_bytes(s: &str, max_bytes: usize) -> String {
    if s.len() <= max_bytes {
        return s.to_owned();
    }
    // From here on `max_bytes < s.len()`, so every probed index is in range.
    let mut end = max_bytes;
    // Index 0 is always a char boundary, so this loop cannot underflow.
    while !s.is_char_boundary(end) {
        end -= 1;
    }
    s[..end].to_owned()
}
56
/// Borrow the longest prefix of `s` that occupies at most `max_bytes` bytes.
///
/// The returned slice always ends on a UTF-8 char boundary. When `s` already
/// fits it is returned whole; otherwise candidate end positions are probed
/// from `max_bytes` downwards and the first boundary found is used. No
/// allocation takes place.
#[must_use]
pub fn truncate_to_bytes_ref(s: &str, max_bytes: usize) -> &str {
    if max_bytes >= s.len() {
        return s;
    }
    // `max_bytes < s.len()` here, so every probed index is valid. Index 0 is
    // always a boundary, hence the fallback is never actually taken.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);
    &s[..cut]
}
72
/// Cheap token-count heuristic: roughly one token per four Unicode scalar values.
///
/// Counting scalars (`chars().count()`) instead of bytes keeps the estimate
/// from inflating on non-ASCII text. This is the canonical fallback used when
/// a BPE tokenizer is unavailable or the input exceeds the tokenizer's size
/// limit.
///
/// The division rounds down, so inputs shorter than four scalars estimate to
/// zero tokens.
#[must_use]
pub fn estimate_tokens(text: &str) -> usize {
    /// Assumed average number of Unicode scalar values per token.
    const CHARS_PER_TOKEN: usize = 4;

    let scalar_count = text.chars().count();
    scalar_count / CHARS_PER_TOKEN
}
82
/// Borrow the first `max_chars` Unicode scalar values of `s`.
///
/// Returns a subslice of `s`; if `s` holds `max_chars` scalars or fewer it is
/// returned whole. Nothing is allocated and no ellipsis is appended.
#[must_use]
pub fn truncate_chars(s: &str, max_chars: usize) -> &str {
    // The byte offset of the scalar at position `max_chars` is exactly where
    // the prefix must end. If there is no such scalar, `s` is short enough.
    // For `max_chars == 0` that offset is 0, which yields the empty prefix.
    s.char_indices()
        .nth(max_chars)
        .map_or(s, |(cut, _)| &s[..cut])
}
96
/// Truncate a string to its first `max_chars` Unicode scalar values, marking
/// the cut with an ellipsis.
///
/// - If `max_chars` is zero, returns an empty string (no ellipsis).
/// - If `s` holds `max_chars` scalars or fewer, returns an unchanged copy.
/// - Otherwise keeps the first `max_chars` scalars and appends the Unicode
///   ellipsis character `…` (U+2026), so the result is `max_chars + 1`
///   scalars long.
///
/// The cut point is located with a single scan that stops at `max_chars`
/// scalars, so the cost does not depend on how long `s` is beyond that point.
#[must_use]
pub fn truncate_to_chars(s: &str, max_chars: usize) -> String {
    /// Marker appended when content was dropped.
    const ELLIPSIS: char = '\u{2026}';

    if max_chars == 0 {
        return String::new();
    }
    // A scalar at position `max_chars` exists only when `s` is too long; its
    // byte offset is where the kept prefix ends.
    match s.char_indices().nth(max_chars) {
        None => s.to_owned(),
        Some((cut, _)) => {
            let mut out = String::with_capacity(cut + ELLIPSIS.len_utf8());
            out.push_str(&s[..cut]);
            out.push(ELLIPSIS);
            out
        }
    }
}
115
/// Replace XML-significant characters with their entity equivalents.
///
/// The five characters `&`, `<`, `>`, `"`, and `'` become `&amp;`, `&lt;`,
/// `&gt;`, `&quot;`, and `&#39;` respectively; every other character is copied
/// through untouched. Use this when embedding arbitrary text into XML
/// attributes or text nodes to prevent tag injection.
///
/// # Examples
///
/// ```
/// use zeph_common::text::xml_escape;
///
/// assert_eq!(xml_escape("a < b && b > c"), "a &lt; b &amp;&amp; b &gt; c");
/// assert_eq!(xml_escape(r#"say "hi""#), "say &quot;hi&quot;");
/// assert_eq!(xml_escape("it's"), "it&#39;s");
/// ```
#[must_use]
pub fn xml_escape(s: &str) -> String {
    // The input length is a lower bound on the output length; the buffer
    // grows only if something actually needs escaping.
    let mut escaped = String::with_capacity(s.len());
    for c in s.chars() {
        let entity = match c {
            '&' => "&amp;",
            '<' => "&lt;",
            '>' => "&gt;",
            '"' => "&quot;",
            '\'' => "&#39;",
            _ => {
                escaped.push(c);
                continue;
            }
        };
        escaped.push_str(entity);
    }
    escaped
}
146
#[cfg(test)]
mod tests {
    //! Unit tests for the truncation helpers.
    //!
    //! Each helper is checked for the pass-through case (input already fits),
    //! the exact-fit case, a plain ASCII cut, a zero budget, and a cut that
    //! would otherwise land inside a multi-byte scalar.

    use super::*;

    // truncate_to_bytes tests
    #[test]
    fn bytes_short_unchanged() {
        assert_eq!(truncate_to_bytes("hello", 10), "hello");
    }

    #[test]
    fn bytes_exact_unchanged() {
        assert_eq!(truncate_to_bytes("hello", 5), "hello");
    }

    #[test]
    fn bytes_truncates_at_boundary() {
        let s = "hello world";
        assert_eq!(truncate_to_bytes(s, 5), "hello");
    }

    #[test]
    fn bytes_unicode_boundary() {
        // "é" is 2 bytes in UTF-8
        let s = "héllo";
        assert_eq!(truncate_to_bytes(s, 3), "hé");
    }

    #[test]
    fn bytes_zero_returns_empty() {
        assert_eq!(truncate_to_bytes("hello", 0), "");
    }

    #[test]
    fn bytes_cut_inside_four_byte_scalar() {
        // Each emoji is 4 bytes; a 7-byte budget only has room for the first.
        assert_eq!(truncate_to_bytes("😀😁", 7), "😀");
        // A budget smaller than the first scalar leaves nothing.
        assert_eq!(truncate_to_bytes("😀😁", 3), "");
    }

    #[test]
    fn bytes_empty_input() {
        assert_eq!(truncate_to_bytes("", 0), "");
        assert_eq!(truncate_to_bytes("", 8), "");
    }

    // truncate_to_bytes_ref tests
    #[test]
    fn bytes_ref_short_unchanged() {
        assert_eq!(truncate_to_bytes_ref("hello", 10), "hello");
    }

    #[test]
    fn bytes_ref_exact_unchanged() {
        assert_eq!(truncate_to_bytes_ref("hello", 5), "hello");
    }

    #[test]
    fn bytes_ref_truncates_at_boundary() {
        assert_eq!(truncate_to_bytes_ref("hello world", 5), "hello");
    }

    #[test]
    fn bytes_ref_unicode_boundary() {
        let s = "héllo";
        assert_eq!(truncate_to_bytes_ref(s, 2), "h");
    }

    #[test]
    fn bytes_ref_zero_returns_empty() {
        assert_eq!(truncate_to_bytes_ref("hello", 0), "");
    }

    #[test]
    fn bytes_ref_cut_inside_four_byte_scalar() {
        assert_eq!(truncate_to_bytes_ref("😀😁", 7), "😀");
        assert_eq!(truncate_to_bytes_ref("😀😁", 3), "");
    }

    #[test]
    fn bytes_ref_agrees_with_owned() {
        // The borrowed and owned variants must pick the same cut point for
        // every possible budget, including ones past the end of the input.
        let s = "aé😀b";
        for max_bytes in 0..=s.len() + 1 {
            assert_eq!(
                truncate_to_bytes_ref(s, max_bytes),
                truncate_to_bytes(s, max_bytes)
            );
        }
    }

    // truncate_chars tests
    #[test]
    fn chars_short_unchanged() {
        assert_eq!(truncate_chars("hello", 10), "hello");
    }

    #[test]
    fn chars_exact_unchanged() {
        assert_eq!(truncate_chars("hello", 5), "hello");
    }

    #[test]
    fn chars_truncates_by_char() {
        assert_eq!(truncate_chars("hello world", 5), "hello");
    }

    #[test]
    fn chars_zero_returns_empty() {
        assert_eq!(truncate_chars("hello", 0), "");
    }

    #[test]
    fn chars_unicode_by_char() {
        let s = "😀😁😂😃😄extra";
        assert_eq!(truncate_chars(s, 5), "😀😁😂😃😄");
    }

    #[test]
    fn chars_empty_input() {
        assert_eq!(truncate_chars("", 0), "");
        assert_eq!(truncate_chars("", 3), "");
    }

    // truncate_to_chars tests
    #[test]
    fn to_chars_short_unchanged() {
        assert_eq!(truncate_to_chars("hello", 10), "hello");
    }

    #[test]
    fn to_chars_exact_unchanged() {
        assert_eq!(truncate_to_chars("hello", 5), "hello");
    }

    #[test]
    fn to_chars_appends_ellipsis() {
        assert_eq!(truncate_to_chars("hello world", 5), "hello\u{2026}");
    }

    #[test]
    fn to_chars_zero_returns_empty() {
        assert_eq!(truncate_to_chars("hello", 0), "");
    }

    #[test]
    fn to_chars_unicode() {
        let s = "😀😁😂😃😄extra";
        assert_eq!(truncate_to_chars(s, 5), "😀😁😂😃😄\u{2026}");
    }

    #[test]
    fn to_chars_empty_input() {
        // Nothing was dropped, so no ellipsis is added.
        assert_eq!(truncate_to_chars("", 3), "");
    }

    #[test]
    fn to_chars_two_byte_scalar() {
        assert_eq!(truncate_to_chars("héllo", 2), "hé\u{2026}");
    }
}