1use unicode_segmentation::UnicodeSegmentation;
7
/// Returns the closest UTF-8 character boundary in `s` at or before `pos`.
///
/// When `pos` is at or past the end of `s`, `s.len()` is returned. Otherwise
/// the index is walked backwards over UTF-8 continuation bytes (those of the
/// form `0b10xx_xxxx`) until it reaches a byte that starts a character, or
/// index `0`.
#[must_use]
pub const fn find_char_boundary(s: &str, pos: usize) -> usize {
    let raw = s.as_bytes();
    let len = raw.len();
    if pos >= len {
        return len;
    }

    let mut idx = pos;
    loop {
        // Index 0 is always a boundary; anywhere else, stop at the first byte
        // that is not a continuation byte.
        if idx == 0 || (raw[idx] & 0xC0) != 0x80 {
            return idx;
        }
        idx -= 1;
    }
}
41
/// Returns the closest UTF-8 character boundary in `s` at or after `pos`.
///
/// When `pos` is at or past the end of `s`, `s.len()` is returned. Otherwise
/// the index is walked forwards over UTF-8 continuation bytes (those of the
/// form `0b10xx_xxxx`) until it reaches a byte that starts a character, or
/// the end of the string.
#[must_use]
pub const fn find_char_boundary_forward(s: &str, pos: usize) -> usize {
    let raw = s.as_bytes();
    let len = raw.len();
    if pos >= len {
        return len;
    }

    let mut idx = pos;
    loop {
        // The end of the string is always a boundary; before that, stop at
        // the first byte that is not a continuation byte.
        if idx >= len || (raw[idx] & 0xC0) != 0x80 {
            return idx;
        }
        idx += 1;
    }
}
65
/// Interprets `bytes` as UTF-8 and borrows them as a `&str` without copying.
///
/// # Errors
///
/// If `bytes` is not valid UTF-8, returns the index up to which the input was
/// valid, as reported by [`std::str::Utf8Error::valid_up_to`].
pub fn validate_utf8(bytes: &[u8]) -> std::result::Result<&str, usize> {
    match std::str::from_utf8(bytes) {
        Ok(text) => Ok(text),
        Err(err) => Err(err.valid_up_to()),
    }
}
82
83#[must_use]
101pub fn grapheme_count(s: &str) -> usize {
102 s.graphemes(true).count()
103}
104
105#[must_use]
116pub fn truncate_graphemes(s: &str, max_graphemes: usize) -> &str {
117 let mut end_byte = 0;
118
119 for (count, grapheme) in s.graphemes(true).enumerate() {
120 if count >= max_graphemes {
121 break;
122 }
123 end_byte += grapheme.len();
124 }
125
126 &s[..end_byte]
127}
128
129#[must_use]
140pub fn grapheme_byte_position(s: &str, n: usize) -> usize {
141 let mut pos = 0;
142 for (i, grapheme) in s.graphemes(true).enumerate() {
143 if i == n {
144 return pos;
145 }
146 pos += grapheme.len();
147 }
148 s.len()
149}
150
151pub fn lines_with_offsets(s: &str) -> impl Iterator<Item = (usize, &str)> {
161 let mut offset = 0;
162 s.lines().map(move |line| {
163 let current_offset = offset;
164 offset += line.len();
165 if offset < s.len() {
167 offset += 1; if offset < s.len() && s.as_bytes().get(offset - 1) == Some(&b'\r') {
169 }
171 }
172 (current_offset, line)
173 })
174}
175
/// Splits `s` into sentences using a simple ASCII punctuation heuristic.
///
/// A sentence ends at a `.`, `!` or `?` that is either the last byte of the
/// input or is immediately followed by ASCII whitespace. The terminating
/// punctuation stays with its sentence, the run of ASCII whitespace after it
/// is dropped, and any text left after the last terminator is returned as a
/// final element. Whitespace at the very start of the input and after an
/// unterminated final sentence is kept as-is.
///
/// Scanning is done on bytes; every cut happens directly after an ASCII
/// byte, so the returned slices always fall on character boundaries.
#[must_use]
pub fn split_sentences(s: &str) -> Vec<&str> {
    let raw = s.as_bytes();
    let total = raw.len();

    let mut out = Vec::new();
    let mut sentence_start = 0;
    let mut cursor = 0;

    while cursor < total {
        let next = cursor + 1;
        let is_terminator = matches!(raw[cursor], b'.' | b'!' | b'?');
        // A terminator only ends the sentence at end of input or when
        // followed by whitespace (so "3.14" or "?!" are not split).
        let ends_sentence =
            is_terminator && raw.get(next).map_or(true, u8::is_ascii_whitespace);

        if !ends_sentence {
            cursor = next;
            continue;
        }

        out.push(&s[sentence_start..next]);

        // Skip the whitespace separating this sentence from the next one.
        let gap = raw[next..]
            .iter()
            .take_while(|byte| byte.is_ascii_whitespace())
            .count();
        cursor = next + gap;
        sentence_start = cursor;
    }

    // Trailing text without closing punctuation forms the last sentence.
    if sentence_start < total {
        out.push(&s[sentence_start..]);
    }

    out
}
223
/// Returns the current system time as whole seconds since the Unix epoch.
///
/// If the system clock reports a time before the epoch, `0` is returned
/// rather than an error.
// The `u64` second count is cast to `i64`; it could only wrap for values
// above `i64::MAX`, so the Clippy lint is silenced here.
#[allow(clippy::cast_possible_wrap)]
#[must_use]
pub fn current_timestamp() -> i64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs() as i64,
        Err(_) => 0,
    }
}
244
#[cfg(test)]
mod tests {
    use super::*;

    // In "Hello 世界!" the two CJK characters take three bytes each:
    // `世` is bytes 6..9, `界` is bytes 9..12 and `!` is byte 12.

    #[test]
    fn test_find_char_boundary() {
        let s = "Hello 世界!";
        let cases = [
            (0, 0),
            (5, 5),
            (6, 6),
            // Positions inside `世` snap back to its first byte.
            (7, 6),
            (8, 6),
            (9, 9),
            // Out-of-range positions clamp to the string length.
            (100, s.len()),
        ];
        for &(pos, expected) in &cases {
            assert_eq!(find_char_boundary(s, pos), expected, "pos = {}", pos);
        }
    }

    #[test]
    fn test_find_char_boundary_forward() {
        let s = "Hello 世界!";
        // A position inside `世` moves forward to the start of `界`.
        assert_eq!(find_char_boundary_forward(s, 7), 9);
    }

    #[test]
    fn test_validate_utf8() {
        assert!(validate_utf8(b"Hello").is_ok());
        assert!(validate_utf8("世界".as_bytes()).is_ok());

        // 0xFF and 0xFE never occur in well-formed UTF-8.
        let invalid = [0xFF, 0xFE];
        assert!(validate_utf8(&invalid).is_err());
    }

    #[test]
    fn test_grapheme_count() {
        let cases = [("Hello", 5), ("世界", 2), ("", 0)];
        for &(input, expected) in &cases {
            assert_eq!(grapheme_count(input), expected, "input = {:?}", input);
        }
    }

    #[test]
    fn test_truncate_graphemes() {
        assert_eq!(truncate_graphemes("Hello", 3), "Hel");
        assert_eq!(truncate_graphemes("世界!", 2), "世界");
        // Asking for more graphemes than exist returns the whole string.
        assert_eq!(truncate_graphemes("Hello", 10), "Hello");
    }

    #[test]
    fn test_grapheme_byte_position() {
        let s = "Hello 世界";
        assert_eq!(grapheme_byte_position(s, 0), 0);
        assert_eq!(grapheme_byte_position(s, 6), 6);
        assert_eq!(grapheme_byte_position(s, 7), 9);
    }

    #[test]
    fn test_split_sentences() {
        let text = "Hello world. How are you? I am fine!";
        let sentences = split_sentences(text);
        assert_eq!(sentences.len(), 3);
        assert_eq!(sentences[0], "Hello world.");
        assert_eq!(sentences[1], "How are you?");
        assert_eq!(sentences[2], "I am fine!");
    }

    #[test]
    fn test_split_sentences_no_final_punct() {
        let text = "First sentence. Second part";
        let sentences = split_sentences(text);
        assert_eq!(sentences.len(), 2);
        assert_eq!(sentences[1], "Second part");
    }

    #[test]
    fn test_lines_with_offsets() {
        let text = "Line 1\nLine 2\nLine 3";
        let lines: Vec<_> = lines_with_offsets(text).collect();
        assert_eq!(lines.len(), 3);
        assert_eq!(lines[0], (0, "Line 1"));
    }

    #[test]
    fn test_find_char_boundary_forward_at_end() {
        let s = "hello";
        // Positions at or past the end clamp to the string length.
        assert_eq!(find_char_boundary_forward(s, 10), 5);
        assert_eq!(find_char_boundary_forward(s, 5), 5);
    }

    #[test]
    fn test_grapheme_byte_position_out_of_range() {
        let s = "abc";
        // An index past the last grapheme yields the string length.
        assert_eq!(grapheme_byte_position(s, 10), 3);
    }

    #[test]
    fn test_grapheme_byte_position_edge_cases() {
        let s = "Hello 世界";
        let cases = [
            (0, 0),
            (6, 6),
            (7, 9),
            // One past the last grapheme, and far past it, both give `s.len()`.
            (8, 12),
            (100, 12),
        ];
        for &(index, expected) in &cases {
            assert_eq!(grapheme_byte_position(s, index), expected, "index = {}", index);
        }
    }
}