lexe_std/
string.rs

//! String utilities: UTF-8-safe, in-place truncation of [`String`]s.
2
3/// Truncates a [`String`] to at most `max_bytes` bytes, ensuring the result
4/// is valid UTF-8 by finding the nearest character boundary.
5///
6/// If `s.len() <= max_bytes`, this is a no-op.
7///
8/// Assumption: data in input distribution is almost always shorter than
9/// `max_bytes`.
10#[inline(always)]
11pub fn truncate_bytes(s: &mut String, max_bytes: usize) {
12    if s.len() <= max_bytes {
13        return;
14    }
15    truncate_bytes_cold(s, max_bytes)
16}
17
18/// Truncates a [`String`] to at most `max_chars` characters.
19///
20/// If the string has fewer than `max_chars` characters, this is a no-op.
21///
22/// Assumption: data in input distribution is almost always shorter than
23/// `max_chars`.
24#[inline(always)]
25pub fn truncate_chars(s: &mut String, max_chars: usize) {
26    if s.len() <= max_chars {
27        return;
28    }
29    truncate_chars_cold(s, max_chars)
30}
31
/// Slow path of [`truncate_bytes`]: shortens `s` to the highest character
/// boundary that is `<= max_bytes`.
///
/// Intended to be called only when `s.len() > max_bytes`. If no index in the
/// searched window is a character boundary (only possible when `max_bytes`
/// lies well past the end of the string), `s` is left unchanged.
#[inline(never)]
#[cold]
fn truncate_bytes_cold(s: &mut String, max_bytes: usize) {
    // A UTF-8 code point is 1-4 bytes long, so any four consecutive byte
    // offsets inside the string contain at least one character boundary.
    // Scanning [max_bytes - 3, max_bytes] from the top therefore finds the
    // closest boundary at or below `max_bytes`. `saturating_sub` clamps the
    // window at offset 0, which is always a boundary.
    let window_start = max_bytes.saturating_sub(3);
    let boundary = (window_start..=max_bytes)
        .rev()
        .find(|&offset| s.is_char_boundary(offset));

    if let Some(cut) = boundary {
        s.truncate(cut);
    }
}
44
45#[inline(never)]
46#[cold]
47fn truncate_chars_cold(s: &mut String, max_chars: usize) {
48    const HIGH_BITS: u64 = 0x8080_8080_8080_8080;
49
50    let bytes = s.as_bytes();
51    let len = bytes.len();
52    let (chunks, _) = bytes.as_chunks::<8>();
53
54    let mut idx = 0usize;
55    let mut chars_seen = 0usize;
56
57    for chunk in chunks {
58        let word = u64::from_ne_bytes(*chunk);
59
60        // Continuation bytes are `10xxxxxx`: bit7=1 and bit6=0.
61        let continuation_mask = (word & HIGH_BITS) & !((word << 1) & HIGH_BITS);
62        let continuation_count = continuation_mask.count_ones() as usize;
63        let chunk_chars = 8usize - continuation_count;
64
65        // Accept the whole 8-byte chunk only if it keeps us within MAX_CHARS;
66        // otherwise fall back to byte-wise refinement from the current `idx`.
67        chars_seen += chunk_chars;
68        if chars_seen > max_chars {
69            chars_seen -= chunk_chars;
70            break;
71        }
72
73        idx += 8;
74    }
75
76    while idx < len {
77        if (bytes[idx] & 0b1100_0000) != 0b1000_0000 {
78            chars_seen += 1;
79            if chars_seen > max_chars {
80                // `idx` is a non-continuation byte, so it is a UTF-8 scalar
81                // boundary and therefore a valid truncate index.
82                s.truncate(idx);
83                return;
84            }
85        }
86        idx += 1;
87    }
88}
89
/// Unit and property tests for [`truncate_bytes`] and [`truncate_chars`].
#[cfg(test)]
mod tests {
    use proptest::{prop_assert, prop_assert_eq, proptest};

    use super::*;

    // Helper: truncate a clone by bytes and return the result.
    fn tb(s: &str, max_bytes: usize) -> String {
        let mut s = s.to_owned();
        truncate_bytes(&mut s, max_bytes);
        s
    }

    // Helper: truncate a clone by chars and return the result.
    fn tc(s: &str, max_chars: usize) -> String {
        let mut s = s.to_owned();
        truncate_chars(&mut s, max_chars);
        s
    }

    #[test]
    fn test_truncate_bytes() {
        // No-ops: empty, under limit, at limit
        assert_eq!(tb("", 10), "");
        assert_eq!(tb("hello", 10), "hello");
        assert_eq!(tb("hello", 5), "hello");

        // ASCII truncation
        assert_eq!(tb("hello world", 5), "hello");

        // Multibyte: "a😀b" = 1 + 4 + 1 = 6 bytes
        // Cutting at 3 lands inside the emoji; backs up to byte 1.
        assert_eq!(tb("a\u{1F600}b", 3), "a");

        // CJK: 3 bytes each. "日本語" = 9 bytes.
        assert_eq!(tb("日本語", 7), "日本"); // mid-char backs up
        assert_eq!(tb("日本語", 6), "日本"); // exact boundary
    }

    #[test]
    fn test_truncate_chars() {
        // No-ops: empty, under limit, at limit
        assert_eq!(tc("", 10), "");
        assert_eq!(tc("hello", 10), "hello");
        assert_eq!(tc("hello", 5), "hello");

        // ASCII truncation
        assert_eq!(tc("hello world", 5), "hello");

        // Multibyte: "a😀b😀c" = 5 chars
        assert_eq!(tc("a\u{1F600}b\u{1F600}c", 3), "a\u{1F600}b");

        // CJK: "日本語テスト" = 6 chars
        assert_eq!(tc("日本語テスト", 3), "日本語");

        // Zero chars
        assert_eq!(tc("hello", 0), "");
    }

    // Both functions agree with a straightforward reference implementation.
    // ∀ s, n.  tc(s, n) = first n chars of s
    // ∀ s, n.  tb(s, n) = longest whole-char prefix of s with len <= n
    #[test]
    fn test_truncate_matches_reference() {
        proptest!(|(s: String, n in 0usize..=512)| {
            let expected_chars: String = s.chars().take(n).collect();
            prop_assert_eq!(tc(&s, n), expected_chars);

            // End offset of the last char that still fits within `n` bytes.
            let end = s
                .char_indices()
                .map(|(start, ch)| start + ch.len_utf8())
                .take_while(|&end| end <= n)
                .last()
                .unwrap_or(0);
            prop_assert_eq!(tb(&s, n), s[..end].to_owned());
        });
    }

    // Both truncate_bytes and truncate_chars are idempotent.
    // ∀ f ∈ {tb, tc}, s, n.  f(s, n) = f(f(s, n), n)
    #[test]
    fn test_truncate_idempotent() {
        proptest!(|(s: String, n in 0usize..=512)| {
            let bytes_once = tb(&s, n);
            let bytes_twice = tb(&bytes_once, n);
            prop_assert_eq!(bytes_once, bytes_twice);

            let chars_once = tc(&s, n);
            let chars_twice = tc(&chars_once, n);
            prop_assert_eq!(chars_once, chars_twice);
        });
    }

    // For the same limit, truncating by chars keeps at least as many bytes as
    // truncating by bytes, and neither ever grows the string.
    // ∀ s, n.  s.len() >= tc(s, n).len() >= tb(s, n).len()
    #[test]
    fn test_truncate_length_ordering() {
        proptest!(|(s: String, n in 0usize..=512)| {
            let original_len = s.len();
            let chars_len = tc(&s, n).len();
            let bytes_len = tb(&s, n).len();

            prop_assert!(original_len >= chars_len);
            prop_assert!(chars_len >= bytes_len);
        });
    }

    // Truncating to the original length / num chars after appending any ASCII
    // byte / char gives the original string.
    // ∀ s, b.  s == tb(s || b, s.len())
    // ∀ s, c.  s == tc(s || c, s.chars().count())
    #[test]
    fn test_truncate_prefix_recovery() {
        proptest!(|(s: String, ascii in 0u8..=0x7f, c: char)| {
            let mut with_ascii = s.clone();
            with_ascii.push(char::from(ascii));
            let bytes_recovered = tb(&with_ascii, s.len());
            prop_assert_eq!(bytes_recovered, s.clone());

            let mut with_char = s.clone();
            with_char.push(c);
            let chars_recovered = tc(&with_char, s.chars().count());
            prop_assert_eq!(chars_recovered, s);
        });
    }
}
lexe_std/string.rs

lexe_std/
string.rs