Skip to main content

rlm_rs/io/
unicode.rs

1//! Unicode utilities for text processing.
2//!
3//! Provides helpers for proper Unicode handling including
4//! character boundary detection and validation.
5
6use unicode_segmentation::UnicodeSegmentation;
7
/// Snaps a byte position backwards onto a UTF-8 character boundary.
///
/// If `pos` already sits on a boundary it is returned unchanged; if it falls
/// inside a multi-byte character, the start of that character is returned.
///
/// # Arguments
///
/// * `s` - The string whose boundaries are inspected.
/// * `pos` - Byte offset to snap.
///
/// # Returns
///
/// The nearest boundary at or before `pos`, or `s.len()` when `pos` is at or
/// past the end of the string.
///
/// # Examples
///
/// ```
/// use rlm_rs::io::find_char_boundary;
///
/// let s = "Hello 世界";
/// assert_eq!(find_char_boundary(s, 6), 6); // Start of '世'
/// assert_eq!(find_char_boundary(s, 7), 6); // Inside '世', backs up
/// ```
#[must_use]
pub const fn find_char_boundary(s: &str, pos: usize) -> usize {
    let raw = s.as_bytes();
    if pos >= raw.len() {
        return raw.len();
    }

    let mut idx = pos;
    loop {
        // Continuation bytes look like 10xxxxxx; anything else starts a
        // character. Index 0 is always a boundary, so stop there regardless.
        if idx == 0 || (raw[idx] & 0b1100_0000) != 0b1000_0000 {
            return idx;
        }
        idx -= 1;
    }
}
41
/// Snaps a byte position forwards onto a UTF-8 character boundary.
///
/// If `pos` already sits on a boundary it is returned unchanged; if it falls
/// inside a multi-byte character, the offset just past that character is
/// returned.
///
/// # Arguments
///
/// * `s` - The string whose boundaries are inspected.
/// * `pos` - Byte offset to snap.
///
/// # Returns
///
/// The nearest boundary at or after `pos`, or `s.len()` when `pos` is at or
/// past the end of the string.
#[must_use]
pub const fn find_char_boundary_forward(s: &str, pos: usize) -> usize {
    let raw = s.as_bytes();
    let total = raw.len();

    // Positions past the end collapse to the end, which is always a boundary.
    let mut idx = if pos < total { pos } else { total };
    while idx < total {
        // Stop at the first byte that is not a 10xxxxxx continuation byte.
        if (raw[idx] & 0b1100_0000) != 0b1000_0000 {
            break;
        }
        idx += 1;
    }
    idx
}
65
/// Checks a byte slice for UTF-8 validity and borrows it as a `&str`.
///
/// # Arguments
///
/// * `bytes` - The bytes to check.
///
/// # Returns
///
/// `Ok` with the input viewed as a string slice when every byte is part of
/// valid UTF-8.
///
/// # Errors
///
/// Returns the number of leading bytes that are valid UTF-8, i.e. the byte
/// offset at which the first invalid or incomplete sequence begins.
pub fn validate_utf8(bytes: &[u8]) -> std::result::Result<&str, usize> {
    match std::str::from_utf8(bytes) {
        Ok(text) => Ok(text),
        // Reduce the std error to just the offset of the offending sequence.
        Err(utf8_err) => Err(utf8_err.valid_up_to()),
    }
}
82
83/// Counts the number of grapheme clusters in a string.
84///
85/// Grapheme clusters are user-perceived characters, which may consist
86/// of multiple Unicode code points (e.g., emoji with skin tone modifiers).
87///
88/// # Arguments
89///
90/// * `s` - The string to count.
91///
92/// # Examples
93///
94/// ```
95/// use rlm_rs::io::unicode::grapheme_count;
96///
97/// assert_eq!(grapheme_count("Hello"), 5);
98/// assert_eq!(grapheme_count("世界"), 2);
99/// ```
100#[must_use]
101pub fn grapheme_count(s: &str) -> usize {
102    s.graphemes(true).count()
103}
104
105/// Truncates a string at a grapheme cluster boundary.
106///
107/// # Arguments
108///
109/// * `s` - The string to truncate.
110/// * `max_graphemes` - Maximum number of grapheme clusters.
111///
112/// # Returns
113///
114/// A string slice containing at most `max_graphemes` grapheme clusters.
115#[must_use]
116pub fn truncate_graphemes(s: &str, max_graphemes: usize) -> &str {
117    let mut end_byte = 0;
118
119    for (count, grapheme) in s.graphemes(true).enumerate() {
120        if count >= max_graphemes {
121            break;
122        }
123        end_byte += grapheme.len();
124    }
125
126    &s[..end_byte]
127}
128
129/// Finds the byte position of the nth grapheme cluster.
130///
131/// # Arguments
132///
133/// * `s` - The string to search.
134/// * `n` - The grapheme index (0-based).
135///
136/// # Returns
137///
138/// The byte position of the start of the nth grapheme, or `s.len()` if out of bounds.
139#[must_use]
140pub fn grapheme_byte_position(s: &str, n: usize) -> usize {
141    let mut pos = 0;
142    for (i, grapheme) in s.graphemes(true).enumerate() {
143        if i == n {
144            return pos;
145        }
146        pos += grapheme.len();
147    }
148    s.len()
149}
150
/// Iterates over lines together with the byte offset at which each starts.
///
/// Lines are terminated by `\n` or `\r\n`; the terminator is not included in
/// the yielded line, but it is counted when computing the offset of the
/// following line. A final line without a terminator is yielded as-is, and a
/// trailing terminator does not produce an extra empty line.
///
/// For every yielded `(offset, line)` pair,
/// `&s[offset..offset + line.len()] == line` holds.
///
/// # Arguments
///
/// * `s` - The string to iterate.
///
/// # Returns
///
/// Iterator of (`byte_offset`, `line_content`) tuples.
pub fn lines_with_offsets(s: &str) -> impl Iterator<Item = (usize, &str)> {
    let mut next_start = 0;
    // `split_inclusive` keeps the terminator attached to each segment, so the
    // running offset advances by exactly the bytes consumed, whether the line
    // ended in `\n`, `\r\n`, or nothing at all.
    s.split_inclusive('\n').map(move |segment| {
        let start = next_start;
        next_start += segment.len();

        // Strip `\n`, then an immediately preceding `\r`. A `\r` that is not
        // followed by `\n` is ordinary line content and is left in place.
        let line = segment
            .strip_suffix('\n')
            .map_or(segment, |body| body.strip_suffix('\r').unwrap_or(body));

        (start, line)
    })
}
175
/// Breaks text into sentences using a simple heuristic.
///
/// A sentence ends at `.`, `!` or `?` when that character is the last byte of
/// the input or is immediately followed by ASCII whitespace. The whitespace
/// between sentences is dropped; any text left after the final terminator is
/// returned as a last, unterminated sentence.
///
/// # Arguments
///
/// * `s` - The text to split.
///
/// # Returns
///
/// Vector of sentence slices borrowed from `s`, in order of appearance.
#[must_use]
pub fn split_sentences(s: &str) -> Vec<&str> {
    let raw = s.as_bytes();
    let total = raw.len();

    let mut found = Vec::new();
    let mut sentence_start = 0;
    let mut cursor = 0;

    // Scanning bytes is safe here: only ASCII bytes are ever matched, and
    // those never occur inside a multi-byte UTF-8 sequence, so every slice
    // index below lands on a character boundary.
    while cursor < total {
        let is_terminator = matches!(raw[cursor], b'.' | b'!' | b'?');
        let at_break = raw
            .get(cursor + 1)
            .map_or(true, u8::is_ascii_whitespace);
        cursor += 1;

        if !(is_terminator && at_break) {
            continue;
        }

        // `cursor` now points just past the terminator.
        found.push(&s[sentence_start..cursor]);

        // Drop the run of whitespace separating this sentence from the next.
        cursor += raw[cursor..]
            .iter()
            .take_while(|byte| byte.is_ascii_whitespace())
            .count();
        sentence_start = cursor;
    }

    // Whatever follows the last terminator is an unterminated sentence.
    if sentence_start < total {
        found.push(&s[sentence_start..]);
    }

    found
}
223
/// Reads the system clock as whole seconds since the Unix epoch.
///
/// Used for timestamping buffers, chunks, and other entities. If the system
/// clock reports a time before the epoch, `0` is returned instead of an
/// error.
///
/// # Examples
///
/// ```
/// use rlm_rs::io::current_timestamp;
///
/// let ts = current_timestamp();
/// assert!(ts > 0);
/// ```
// The seconds count is a `u64`; the cast to `i64` is kept as a plain `as`
// conversion, hence the lint allowance.
#[allow(clippy::cast_possible_wrap)]
#[must_use]
pub fn current_timestamp() -> i64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs() as i64,
        // Clock is set before 1970-01-01: fall back to zero.
        Err(_) => 0,
    }
}
244
#[cfg(test)]
mod tests {
    use super::*;

    // Byte layout shared by several tests below for "Hello 世界!":
    // "Hello " = 0..6, '世' = 6..9, '界' = 9..12, '!' = 12..13.

    #[test]
    fn test_find_char_boundary() {
        let s = "Hello 世界!";
        assert_eq!(find_char_boundary(s, 0), 0);
        assert_eq!(find_char_boundary(s, 5), 5); // The space
        assert_eq!(find_char_boundary(s, 6), 6); // First byte of '世'
        assert_eq!(find_char_boundary(s, 7), 6); // Middle of '世'
        assert_eq!(find_char_boundary(s, 8), 6); // Still in '世'
        assert_eq!(find_char_boundary(s, 9), 9); // First byte of '界'
        assert_eq!(find_char_boundary(s, 100), s.len());
    }

    #[test]
    fn test_find_char_boundary_forward() {
        let s = "Hello 世界!";
        assert_eq!(find_char_boundary_forward(s, 7), 9); // Middle of '世', moves forward
    }

    #[test]
    fn test_validate_utf8() {
        assert!(validate_utf8(b"Hello").is_ok());
        assert!(validate_utf8("世界".as_bytes()).is_ok());

        // Invalid UTF-8
        let invalid = [0xFF, 0xFE];
        assert!(validate_utf8(&invalid).is_err());
    }

    #[test]
    fn test_grapheme_count() {
        assert_eq!(grapheme_count("Hello"), 5);
        assert_eq!(grapheme_count("世界"), 2);
        assert_eq!(grapheme_count(""), 0);
    }

    #[test]
    fn test_truncate_graphemes() {
        assert_eq!(truncate_graphemes("Hello", 3), "Hel");
        assert_eq!(truncate_graphemes("世界!", 2), "世界");
        assert_eq!(truncate_graphemes("Hello", 10), "Hello");
    }

    #[test]
    fn test_grapheme_byte_position() {
        let s = "Hello 世界";
        assert_eq!(grapheme_byte_position(s, 0), 0);
        assert_eq!(grapheme_byte_position(s, 6), 6); // Start of '世'
        assert_eq!(grapheme_byte_position(s, 7), 9); // Start of '界'
    }

    #[test]
    fn test_split_sentences() {
        let text = "Hello world. How are you? I am fine!";
        let sentences = split_sentences(text);
        assert_eq!(sentences.len(), 3);
        assert_eq!(sentences[0], "Hello world.");
        assert_eq!(sentences[1], "How are you?");
        assert_eq!(sentences[2], "I am fine!");
    }

    #[test]
    fn test_split_sentences_no_final_punct() {
        let text = "First sentence. Second part";
        let sentences = split_sentences(text);
        assert_eq!(sentences.len(), 2);
        assert_eq!(sentences[1], "Second part");
    }

    #[test]
    fn test_lines_with_offsets() {
        let text = "Line 1\nLine 2\nLine 3";
        let lines: Vec<_> = lines_with_offsets(text).collect();
        assert_eq!(lines.len(), 3);
        // Each "Line N" is 6 bytes plus a 1-byte `\n` terminator.
        assert_eq!(lines[0], (0, "Line 1"));
        assert_eq!(lines[1], (7, "Line 2"));
        assert_eq!(lines[2], (14, "Line 3"));
    }

    #[test]
    fn test_find_char_boundary_forward_at_end() {
        // Covers the early return taken when `pos >= s.len()`.
        let s = "hello";
        assert_eq!(find_char_boundary_forward(s, 10), 5);
        assert_eq!(find_char_boundary_forward(s, 5), 5);
    }

    #[test]
    fn test_grapheme_byte_position_out_of_range() {
        // Covers the fallback taken when `n` exceeds the grapheme count.
        let s = "abc";
        assert_eq!(grapheme_byte_position(s, 10), 3); // Returns s.len()
    }

    #[test]
    fn test_grapheme_byte_position_edge_cases() {
        // Test with unicode to ensure correct byte offset calculation
        let s = "Hello 世界"; // "Hello " is 6 bytes, "世" is 3 bytes, "界" is 3 bytes
        assert_eq!(grapheme_byte_position(s, 0), 0);
        assert_eq!(grapheme_byte_position(s, 6), 6); // Start of '世'
        assert_eq!(grapheme_byte_position(s, 7), 9); // Start of '界'
        assert_eq!(grapheme_byte_position(s, 8), 12); // One past the last grapheme
        assert_eq!(grapheme_byte_position(s, 100), 12); // Out of range
    }
}