zeph_common/
text.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! String utility functions for Unicode-safe text manipulation.
5
/// Truncate `s` to at most `max_bytes` bytes, preserving UTF-8 char boundaries.
///
/// Returns an owned `String`. If `s` already fits within `max_bytes`, the result
/// is an unchanged copy. Otherwise the cut point is the largest char boundary
/// that is `<= max_bytes`, so a multi-byte character is never split and the
/// result may be up to three bytes shorter than `max_bytes`.
#[must_use]
pub fn truncate_to_bytes(s: &str, max_bytes: usize) -> String {
    if s.len() <= max_bytes {
        return s.to_owned();
    }
    // Here `max_bytes < s.len()`, so it is a valid index to probe. Back up from
    // it to the nearest char boundary: a UTF-8 scalar value is at most four
    // bytes long, so this takes at most three steps regardless of input size.
    // Index 0 is always a boundary; the `end > 0` guard just makes that explicit.
    let mut end = max_bytes;
    while end > 0 && !s.is_char_boundary(end) {
        end -= 1;
    }
    s[..end].to_owned()
}
28
/// Return the longest prefix of `s` that occupies no more than `max_bytes` bytes.
///
/// The result borrows from `s`; nothing is allocated. When `s` is longer than
/// `max_bytes`, the cut point is moved back from `max_bytes` until it lands on
/// a UTF-8 char boundary, so a multi-byte character is never split.
#[must_use]
pub fn truncate_to_bytes_ref(s: &str, max_bytes: usize) -> &str {
    if max_bytes >= s.len() {
        return s;
    }
    // Search downwards from `max_bytes` for the first valid boundary. Index 0
    // is always a boundary, so the fallback only mirrors a "stop at zero" guard.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);
    &s[..cut]
}
44
/// Rough token count estimate: 1 token ≈ 4 Unicode scalar values.
///
/// Counts scalar values via `chars().count()` instead of using the byte length,
/// so non-ASCII content is not overestimated. This is the canonical fallback
/// used when a BPE tokenizer is unavailable or the input exceeds the
/// tokenizer's size limit.
///
/// The division rounds down, so inputs shorter than four scalar values
/// (including the empty string) estimate to `0`.
#[must_use]
pub fn estimate_tokens(text: &str) -> usize {
    const SCALARS_PER_TOKEN: usize = 4;
    let scalar_count = text.chars().count();
    scalar_count / SCALARS_PER_TOKEN
}
54
/// Return the prefix of `s` containing at most `max_chars` Unicode scalar values.
///
/// The result is a subslice borrowed from `s`; no ellipsis or other marker is
/// appended. If `s` has `max_chars` scalar values or fewer, it is returned
/// whole. A `max_chars` of zero yields the empty string.
#[must_use]
pub fn truncate_chars(s: &str, max_chars: usize) -> &str {
    if max_chars == 0 {
        return "";
    }
    // The byte offset of the scalar at position `max_chars` (if there is one) is
    // exactly where the prefix of `max_chars` scalars ends.
    s.char_indices()
        .nth(max_chars)
        .map_or(s, |(cut, _)| &s[..cut])
}
68
/// Truncate a string to its first `max_chars` Unicode scalar values, marking
/// the cut with an ellipsis.
///
/// - If `s` has `max_chars` scalar values or fewer, an unchanged copy is returned.
/// - Otherwise the first `max_chars` scalar values are kept and the Unicode
///   ellipsis character `…` (U+2026) is appended. The ellipsis is *in addition*
///   to the kept characters, so the result holds `max_chars + 1` scalar values.
/// - If `max_chars` is zero, an empty string is returned (no ellipsis).
#[must_use]
pub fn truncate_to_chars(s: &str, max_chars: usize) -> String {
    const ELLIPSIS: char = '\u{2026}';

    if max_chars == 0 {
        return String::new();
    }
    // A single bounded lookup: the scalar at position `max_chars` exists only
    // when `s` is too long, and its byte offset is where the kept prefix ends.
    // This avoids counting every char of a long input just to decide whether
    // truncation is needed.
    match s.char_indices().nth(max_chars) {
        None => s.to_owned(),
        Some((cut, _)) => {
            let mut out = String::with_capacity(cut + ELLIPSIS.len_utf8());
            out.push_str(&s[..cut]);
            out.push(ELLIPSIS);
            out
        }
    }
}
87
/// Unit tests for the truncation helpers and the token estimator.
///
/// Each group covers the "fits unchanged", "cut on a boundary", multi-byte and
/// zero/empty cases for one function.
#[cfg(test)]
mod tests {
    use super::*;

    // truncate_to_bytes tests
    #[test]
    fn bytes_short_unchanged() {
        assert_eq!(truncate_to_bytes("hello", 10), "hello");
    }

    #[test]
    fn bytes_exact_unchanged() {
        assert_eq!(truncate_to_bytes("hello", 5), "hello");
    }

    #[test]
    fn bytes_truncates_at_boundary() {
        let s = "hello world";
        assert_eq!(truncate_to_bytes(s, 5), "hello");
    }

    #[test]
    fn bytes_unicode_boundary() {
        // "é" is 2 bytes in UTF-8
        let s = "héllo";
        assert_eq!(truncate_to_bytes(s, 3), "hé");
    }

    #[test]
    fn bytes_unicode_mid_char_backs_off() {
        // A limit of 2 falls inside "é" (bytes 1..3), so only "h" fits.
        assert_eq!(truncate_to_bytes("héllo", 2), "h");
    }

    #[test]
    fn bytes_zero_returns_empty() {
        assert_eq!(truncate_to_bytes("hello", 0), "");
    }

    #[test]
    fn bytes_empty_input() {
        assert_eq!(truncate_to_bytes("", 0), "");
        assert_eq!(truncate_to_bytes("", 4), "");
    }

    // truncate_to_bytes_ref tests
    #[test]
    fn bytes_ref_short_unchanged() {
        assert_eq!(truncate_to_bytes_ref("hello", 10), "hello");
    }

    #[test]
    fn bytes_ref_exact_unchanged() {
        assert_eq!(truncate_to_bytes_ref("hello", 5), "hello");
    }

    #[test]
    fn bytes_ref_truncates_at_boundary() {
        assert_eq!(truncate_to_bytes_ref("hello world", 5), "hello");
    }

    #[test]
    fn bytes_ref_unicode_boundary() {
        let s = "héllo";
        assert_eq!(truncate_to_bytes_ref(s, 2), "h");
    }

    #[test]
    fn bytes_ref_zero_returns_empty() {
        assert_eq!(truncate_to_bytes_ref("hello", 0), "");
    }

    #[test]
    fn bytes_ref_matches_owned_variant() {
        // The borrowed and owned variants must agree for every limit.
        let s = "h😀é!";
        for limit in 0..=s.len() + 1 {
            assert_eq!(truncate_to_bytes_ref(s, limit), truncate_to_bytes(s, limit));
        }
    }

    // estimate_tokens tests
    #[test]
    fn tokens_empty_is_zero() {
        assert_eq!(estimate_tokens(""), 0);
    }

    #[test]
    fn tokens_rounds_down() {
        assert_eq!(estimate_tokens("abc"), 0);
        assert_eq!(estimate_tokens("abcd"), 1);
        assert_eq!(estimate_tokens("abcdefg"), 1);
        assert_eq!(estimate_tokens("abcdefgh"), 2);
    }

    #[test]
    fn tokens_counts_scalars_not_bytes() {
        // Four 2-byte chars are 8 bytes but only 4 scalar values.
        assert_eq!(estimate_tokens("éééé"), 1);
    }

    // truncate_chars tests
    #[test]
    fn chars_short_unchanged() {
        assert_eq!(truncate_chars("hello", 10), "hello");
    }

    #[test]
    fn chars_exact_unchanged() {
        assert_eq!(truncate_chars("hello", 5), "hello");
    }

    #[test]
    fn chars_truncates_by_char() {
        assert_eq!(truncate_chars("hello world", 5), "hello");
    }

    #[test]
    fn chars_zero_returns_empty() {
        assert_eq!(truncate_chars("hello", 0), "");
    }

    #[test]
    fn chars_empty_input() {
        assert_eq!(truncate_chars("", 3), "");
    }

    #[test]
    fn chars_unicode_by_char() {
        let s = "😀😁😂😃😄extra";
        assert_eq!(truncate_chars(s, 5), "😀😁😂😃😄");
    }

    // truncate_to_chars tests
    #[test]
    fn to_chars_short_unchanged() {
        assert_eq!(truncate_to_chars("hello", 10), "hello");
    }

    #[test]
    fn to_chars_exact_unchanged() {
        assert_eq!(truncate_to_chars("hello", 5), "hello");
    }

    #[test]
    fn to_chars_appends_ellipsis() {
        assert_eq!(truncate_to_chars("hello world", 5), "hello\u{2026}");
    }

    #[test]
    fn to_chars_zero_returns_empty() {
        assert_eq!(truncate_to_chars("hello", 0), "");
    }

    #[test]
    fn to_chars_empty_input() {
        assert_eq!(truncate_to_chars("", 3), "");
    }

    #[test]
    fn to_chars_unicode() {
        let s = "😀😁😂😃😄extra";
        assert_eq!(truncate_to_chars(s, 5), "😀😁😂😃😄\u{2026}");
    }
}
zeph_common/text.rs

zeph_common/
text.rs