Skip to main content

rlm_rs/io/
unicode.rs

1//! Unicode utilities for text processing.
2//!
3//! Provides helpers for proper Unicode handling including
4//! character boundary detection and validation.
5
6use unicode_segmentation::UnicodeSegmentation;
7
/// Snaps a byte offset backward onto the nearest UTF-8 character boundary.
///
/// An offset that already lies on a boundary is returned unchanged. Offsets
/// at or past the end of the string are clamped to `s.len()`.
///
/// # Arguments
///
/// * `s` - The string whose character boundaries are consulted.
/// * `pos` - The byte offset to adjust.
///
/// # Returns
///
/// The largest character-boundary offset that is `<= pos`, or `s.len()` when
/// `pos` is out of range.
///
/// # Examples
///
/// ```
/// use rlm_rs::io::find_char_boundary;
///
/// let s = "Hello 世界";
/// assert_eq!(find_char_boundary(s, 6), 6); // Start of '世'
/// assert_eq!(find_char_boundary(s, 7), 6); // Inside '世', backs up
/// ```
#[must_use]
pub const fn find_char_boundary(s: &str, pos: usize) -> usize {
    let raw = s.as_bytes();
    let len = raw.len();
    if pos >= len {
        return len;
    }

    let mut idx = pos;
    loop {
        // A continuation byte has the bit pattern 10xx_xxxx; any other byte
        // starts a character. Offset 0 is always a boundary.
        let is_continuation = raw[idx] & 0b1100_0000 == 0b1000_0000;
        if idx == 0 || !is_continuation {
            return idx;
        }
        idx -= 1;
    }
}
41
/// Snaps a byte offset forward onto the nearest UTF-8 character boundary.
///
/// An offset that already lies on a boundary is returned unchanged. Offsets
/// at or past the end of the string are clamped to `s.len()`.
///
/// # Arguments
///
/// * `s` - The string whose character boundaries are consulted.
/// * `pos` - The byte offset to adjust.
///
/// # Returns
///
/// The smallest character-boundary offset that is `>= pos`, capped at
/// `s.len()`.
#[must_use]
pub const fn find_char_boundary_forward(s: &str, pos: usize) -> usize {
    let raw = s.as_bytes();
    let len = raw.len();

    // Clamp first so the scan below never starts out of range.
    let mut idx = if pos < len { pos } else { len };
    while idx < len {
        // Stop on the first byte that is not a 10xx_xxxx continuation byte.
        if raw[idx] & 0b1100_0000 != 0b1000_0000 {
            break;
        }
        idx += 1;
    }
    idx
}
65
/// Checks whether a byte slice is well-formed UTF-8.
///
/// # Arguments
///
/// * `bytes` - The raw bytes to inspect.
///
/// # Returns
///
/// `Ok` with the same bytes viewed as a `&str` when the whole slice is valid.
///
/// # Errors
///
/// Returns the number of leading bytes that are valid UTF-8, which is the
/// byte offset at which the first invalid or incomplete sequence begins.
pub fn validate_utf8(bytes: &[u8]) -> std::result::Result<&str, usize> {
    match std::str::from_utf8(bytes) {
        Ok(text) => Ok(text),
        Err(err) => Err(err.valid_up_to()),
    }
}
82
83/// Counts the number of grapheme clusters in a string.
84///
85/// Grapheme clusters are user-perceived characters, which may consist
86/// of multiple Unicode code points (e.g., emoji with skin tone modifiers).
87///
88/// # Arguments
89///
90/// * `s` - The string to count.
91///
92/// # Examples
93///
94/// ```
95/// use rlm_rs::io::unicode::grapheme_count;
96///
97/// assert_eq!(grapheme_count("Hello"), 5);
98/// assert_eq!(grapheme_count("世界"), 2);
99/// ```
100#[must_use]
101pub fn grapheme_count(s: &str) -> usize {
102    s.graphemes(true).count()
103}
104
105/// Truncates a string at a grapheme cluster boundary.
106///
107/// # Arguments
108///
109/// * `s` - The string to truncate.
110/// * `max_graphemes` - Maximum number of grapheme clusters.
111///
112/// # Returns
113///
114/// A string slice containing at most `max_graphemes` grapheme clusters.
115#[must_use]
116pub fn truncate_graphemes(s: &str, max_graphemes: usize) -> &str {
117    let mut end_byte = 0;
118
119    for (count, grapheme) in s.graphemes(true).enumerate() {
120        if count >= max_graphemes {
121            break;
122        }
123        end_byte += grapheme.len();
124    }
125
126    &s[..end_byte]
127}
128
129/// Finds the byte position of the nth grapheme cluster.
130///
131/// # Arguments
132///
133/// * `s` - The string to search.
134/// * `n` - The grapheme index (0-based).
135///
136/// # Returns
137///
138/// The byte position of the start of the nth grapheme, or `s.len()` if out of bounds.
139#[must_use]
140pub fn grapheme_byte_position(s: &str, n: usize) -> usize {
141    let mut pos = 0;
142    for (i, grapheme) in s.graphemes(true).enumerate() {
143        if i == n {
144            return pos;
145        }
146        pos += grapheme.len();
147    }
148    s.len()
149}
150
/// Iterates over the lines of a string together with their byte offsets.
///
/// Lines are split the same way as [`str::lines`]: each line ends at `\n` or
/// `\r\n`, the terminator is not part of the yielded line, and a trailing
/// terminator does not produce an extra empty line. A `\r` that is not
/// followed by `\n` is kept as line content.
///
/// # Arguments
///
/// * `s` - The string to iterate.
///
/// # Returns
///
/// Iterator of (`byte_offset`, `line_content`) tuples, where `byte_offset`
/// is the exact position in `s` of the first byte of `line_content`.
pub fn lines_with_offsets(s: &str) -> impl Iterator<Item = (usize, &str)> {
    let mut offset = 0;
    // Walk the segments with their terminators attached so the running offset
    // advances by the real terminator width (1 for `\n`, 2 for `\r\n`).
    s.split_inclusive('\n').map(move |raw| {
        let start = offset;
        offset += raw.len();

        // Strip the terminator; `\r` is only part of it when `\n` follows.
        let line = match raw.strip_suffix('\n') {
            Some(without_lf) => without_lf.strip_suffix('\r').unwrap_or(without_lf),
            None => raw,
        };
        (start, line)
    })
}
175
/// Breaks text into sentences using a simple punctuation heuristic.
///
/// A sentence ends at `.`, `!` or `?` when that character is followed by
/// ASCII whitespace or is the last byte of the input. The whitespace after a
/// sentence is dropped; whitespace at the very start of the input is kept.
/// Any text after the last terminator is returned as a final sentence.
///
/// # Arguments
///
/// * `s` - The text to split.
///
/// # Returns
///
/// Vector of sentence slices borrowed from `s`, in order of appearance.
#[must_use]
pub fn split_sentences(s: &str) -> Vec<&str> {
    // Scanning bytes is safe here: the terminators and the whitespace being
    // skipped are all ASCII, so every cut lands on a character boundary.
    let raw = s.as_bytes();
    let len = raw.len();

    let mut out = Vec::new();
    let mut sentence_start = 0;
    let mut cursor = 0;

    while cursor < len {
        let is_terminator = matches!(raw[cursor], b'.' | b'!' | b'?');
        let at_break = raw
            .get(cursor + 1)
            .map_or(true, |next| next.is_ascii_whitespace());

        // Step past the current byte; for a terminator this is the sentence end.
        cursor += 1;
        if !(is_terminator && at_break) {
            continue;
        }

        out.push(&s[sentence_start..cursor]);

        // Drop the whitespace run separating this sentence from the next.
        cursor += raw[cursor..]
            .iter()
            .take_while(|byte| byte.is_ascii_whitespace())
            .count();
        sentence_start = cursor;
    }

    // Trailing text without a terminator still counts as a sentence.
    if sentence_start < len {
        out.push(&s[sentence_start..]);
    }

    out
}
223
/// Returns the current Unix timestamp in whole seconds.
///
/// Used for timestamping buffers, chunks, and other entities. If the system
/// clock reports a time before the Unix epoch, `0` is returned.
///
/// # Examples
///
/// ```
/// use rlm_rs::io::current_timestamp;
///
/// let ts = current_timestamp();
/// assert!(ts > 0);
/// ```
// The u64 -> i64 cast only wraps for second counts above i64::MAX.
#[allow(clippy::cast_possible_wrap)]
#[must_use]
pub fn current_timestamp() -> i64 {
    let since_epoch = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH);
    match since_epoch {
        Ok(elapsed) => elapsed.as_secs() as i64,
        // Clock is set before the epoch: fall back to zero.
        Err(_) => 0,
    }
}
243
#[cfg(test)]
mod tests {
    use super::*;

    // Byte layout used by several tests below:
    // "Hello 世界!" -> "Hello " = 0..6, '世' = 6..9, '界' = 9..12, '!' = 12.

    #[test]
    fn test_find_char_boundary() {
        let s = "Hello 世界!";
        assert_eq!(find_char_boundary(s, 0), 0);
        assert_eq!(find_char_boundary(s, 5), 5); // The space
        assert_eq!(find_char_boundary(s, 6), 6); // Start of '世'
        assert_eq!(find_char_boundary(s, 7), 6); // Middle of '世'
        assert_eq!(find_char_boundary(s, 8), 6); // Still in '世'
        assert_eq!(find_char_boundary(s, 9), 9); // Start of '界'
        assert_eq!(find_char_boundary(s, 100), s.len());
    }

    #[test]
    fn test_find_char_boundary_forward() {
        let s = "Hello 世界!";
        assert_eq!(find_char_boundary_forward(s, 6), 6); // Already on a boundary
        assert_eq!(find_char_boundary_forward(s, 7), 9); // Middle of '世', moves forward
        assert_eq!(find_char_boundary_forward(s, 8), 9); // Last byte of '世'
    }

    #[test]
    fn test_validate_utf8() {
        assert_eq!(validate_utf8(b"Hello"), Ok("Hello"));
        assert_eq!(validate_utf8("世界".as_bytes()), Ok("世界"));

        // Invalid from the very first byte
        let invalid = [0xFF, 0xFE];
        assert_eq!(validate_utf8(&invalid), Err(0));

        // Valid prefix followed by an invalid byte: the error is the prefix length
        assert_eq!(validate_utf8(b"ab\xFF"), Err(2));
    }

    #[test]
    fn test_grapheme_count() {
        assert_eq!(grapheme_count("Hello"), 5);
        assert_eq!(grapheme_count("世界"), 2);
        assert_eq!(grapheme_count(""), 0);
        // 'e' + combining acute accent is two code points but one grapheme
        assert_eq!(grapheme_count("e\u{301}"), 1);
    }

    #[test]
    fn test_truncate_graphemes() {
        assert_eq!(truncate_graphemes("Hello", 3), "Hel");
        assert_eq!(truncate_graphemes("世界!", 2), "世界");
        assert_eq!(truncate_graphemes("Hello", 10), "Hello");
        assert_eq!(truncate_graphemes("Hello", 0), "");
        // The combining mark stays attached to its base character
        assert_eq!(truncate_graphemes("e\u{301}x", 1), "e\u{301}");
    }

    #[test]
    fn test_grapheme_byte_position() {
        let s = "Hello 世界";
        assert_eq!(grapheme_byte_position(s, 0), 0);
        assert_eq!(grapheme_byte_position(s, 5), 5); // The space
        assert_eq!(grapheme_byte_position(s, 6), 6); // Start of '世'
        assert_eq!(grapheme_byte_position(s, 7), 9); // Start of '界'
    }

    #[test]
    fn test_split_sentences() {
        let text = "Hello world. How are you? I am fine!";
        let sentences = split_sentences(text);
        assert_eq!(sentences.len(), 3);
        assert_eq!(sentences[0], "Hello world.");
        assert_eq!(sentences[1], "How are you?");
        assert_eq!(sentences[2], "I am fine!");
    }

    #[test]
    fn test_split_sentences_no_final_punct() {
        let text = "First sentence. Second part";
        let sentences = split_sentences(text);
        assert_eq!(sentences.len(), 2);
        assert_eq!(sentences[0], "First sentence.");
        assert_eq!(sentences[1], "Second part");
    }

    #[test]
    fn test_lines_with_offsets() {
        // Each line is 6 bytes plus a 1-byte '\n' terminator.
        let text = "Line 1\nLine 2\nLine 3";
        let lines: Vec<_> = lines_with_offsets(text).collect();
        assert_eq!(lines.len(), 3);
        assert_eq!(lines[0], (0, "Line 1"));
        assert_eq!(lines[1], (7, "Line 2"));
        assert_eq!(lines[2], (14, "Line 3"));
    }

    #[test]
    fn test_find_char_boundary_forward_at_end() {
        // Positions at or past the end clamp to s.len()
        let s = "hello";
        assert_eq!(find_char_boundary_forward(s, 10), 5);
        assert_eq!(find_char_boundary_forward(s, 5), 5);
    }

    #[test]
    fn test_grapheme_byte_position_out_of_range() {
        // An index past the last grapheme falls back to s.len()
        let s = "abc";
        assert_eq!(grapheme_byte_position(s, 10), 3);
    }

    #[test]
    fn test_grapheme_byte_position_edge_cases() {
        // "Hello " is 6 bytes, '世' is 3 bytes, '界' is 3 bytes
        let s = "Hello 世界";
        assert_eq!(grapheme_byte_position(s, 0), 0);
        assert_eq!(grapheme_byte_position(s, 6), 6); // Start of '世'
        assert_eq!(grapheme_byte_position(s, 7), 9); // Start of '界'
        assert_eq!(grapheme_byte_position(s, 8), 12); // One past the last grapheme
        assert_eq!(grapheme_byte_position(s, 100), 12); // Out of range
    }
}