use unicode_segmentation::UnicodeSegmentation;
/// Returns the nearest UTF-8 character boundary at or before `pos`.
///
/// Positions at or past the end of `s` are clamped to `s.len()`. Otherwise the
/// scan walks backwards over continuation bytes until it reaches the first
/// byte of a character (or the start of the string).
#[must_use]
pub const fn find_char_boundary(s: &str, pos: usize) -> usize {
    // UTF-8 continuation bytes have the bit pattern `10xx_xxxx`.
    const TOP_TWO_BITS: u8 = 0b1100_0000;
    const CONTINUATION: u8 = 0b1000_0000;

    let raw = s.as_bytes();
    if pos >= raw.len() {
        return raw.len();
    }

    let mut idx = pos;
    loop {
        // Index 0 is always a boundary; any non-continuation byte starts a char.
        if idx == 0 || (raw[idx] & TOP_TWO_BITS) != CONTINUATION {
            return idx;
        }
        idx -= 1;
    }
}
/// Returns the nearest UTF-8 character boundary at or after `pos`.
///
/// Positions at or past the end of `s` are clamped to `s.len()`. Otherwise the
/// scan walks forwards over continuation bytes until it reaches the first
/// byte of the next character (or the end of the string).
#[must_use]
pub const fn find_char_boundary_forward(s: &str, pos: usize) -> usize {
    // UTF-8 continuation bytes have the bit pattern `10xx_xxxx`.
    const TOP_TWO_BITS: u8 = 0b1100_0000;
    const CONTINUATION: u8 = 0b1000_0000;

    let raw = s.as_bytes();
    let len = raw.len();
    if pos >= len {
        return len;
    }

    let mut idx = pos;
    loop {
        // Stop at the end of the buffer or at the first byte that starts a char.
        if idx >= len || (raw[idx] & TOP_TWO_BITS) != CONTINUATION {
            return idx;
        }
        idx += 1;
    }
}
/// Interprets `bytes` as UTF-8 text.
///
/// # Errors
///
/// If `bytes` is not valid UTF-8, returns the length of the longest valid
/// prefix, i.e. the index of the first offending byte.
pub fn validate_utf8(bytes: &[u8]) -> std::result::Result<&str, usize> {
    match std::str::from_utf8(bytes) {
        Ok(text) => Ok(text),
        Err(utf8_error) => Err(utf8_error.valid_up_to()),
    }
}
#[must_use]
pub fn grapheme_count(s: &str) -> usize {
s.graphemes(true).count()
}
/// Returns the longest prefix of `s` that contains at most `max_graphemes`
/// grapheme clusters.
///
/// The cut always falls between clusters, so a cluster is never split. If `s`
/// has `max_graphemes` clusters or fewer, the whole string is returned.
#[must_use]
pub fn truncate_graphemes(s: &str, max_graphemes: usize) -> &str {
    // The byte length of the kept prefix is the sum of the kept clusters' lengths.
    let prefix_len: usize = s
        .graphemes(true)
        .take(max_graphemes)
        .map(str::len)
        .sum();
    &s[..prefix_len]
}
#[must_use]
pub fn grapheme_byte_position(s: &str, n: usize) -> usize {
let mut pos = 0;
for (i, grapheme) in s.graphemes(true).enumerate() {
if i == n {
return pos;
}
pos += grapheme.len();
}
s.len()
}
/// Iterates over the lines of `s`, pairing each line with the byte offset at
/// which it starts within `s`.
///
/// Lines are produced by [`str::lines`], so the yielded slices do not include
/// their terminator. Both `\n` and `\r\n` terminators are accounted for when
/// advancing the offset, so for every yielded `(offset, line)` pair
/// `&s[offset..offset + line.len()] == line` holds.
pub fn lines_with_offsets(s: &str) -> impl Iterator<Item = (usize, &str)> {
    let mut offset = 0;
    s.lines().map(move |line| {
        let start = offset;
        let after_line = start + line.len();
        // `str::lines` strips the terminator, so look at the bytes that follow
        // the line to learn how many were consumed: two for `\r\n`, one for a
        // bare `\n`, none when the input ends without a terminator.
        let terminator_len = match s.as_bytes().get(after_line..) {
            Some([b'\r', b'\n', ..]) => 2,
            Some([b'\n', ..]) => 1,
            _ => 0,
        };
        offset = after_line + terminator_len;
        (start, line)
    })
}
/// Splits `s` into sentences.
///
/// A sentence ends at a `.`, `!` or `?` that is either the last byte of the
/// input or is immediately followed by ASCII whitespace. The terminating
/// punctuation stays with its sentence; the ASCII whitespace separating two
/// sentences is dropped. Any trailing text without final punctuation is
/// returned as the last element.
#[must_use]
pub fn split_sentences(s: &str) -> Vec<&str> {
    let raw = s.as_bytes();
    let mut pieces = Vec::new();
    let mut sentence_start = 0;
    let mut cursor = 0;

    while let Some(&byte) = raw.get(cursor) {
        // From here on `cursor` is the index just past `byte`.
        cursor += 1;

        let ends_sentence = matches!(byte, b'.' | b'!' | b'?')
            && raw.get(cursor).map_or(true, u8::is_ascii_whitespace);
        if !ends_sentence {
            continue;
        }

        // `cursor` follows an ASCII byte, so it is a valid char boundary.
        pieces.push(&s[sentence_start..cursor]);

        // Skip the whitespace run separating this sentence from the next.
        cursor += raw[cursor..]
            .iter()
            .take_while(|b| b.is_ascii_whitespace())
            .count();
        sentence_start = cursor;
    }

    if sentence_start < s.len() {
        pieces.push(&s[sentence_start..]);
    }
    pieces
}
/// Returns the current wall-clock time as whole seconds since the Unix epoch.
///
/// If the system clock reports a time before the epoch, `0` is returned.
// The `u64 -> i64` cast only wraps for second counts above `i64::MAX`.
#[allow(clippy::cast_possible_wrap)]
#[must_use]
pub fn current_timestamp() -> i64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs() as i64,
        Err(_) => 0,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Backward search: "世" occupies bytes 6..9 and "界" bytes 9..12.
    #[test]
    fn test_find_char_boundary() {
        let s = "Hello 世界!";
        // (requested position, expected boundary)
        let cases = [
            (0, 0),
            (5, 5),
            (6, 6),
            (7, 6),
            (8, 6),
            (9, 9),
            (100, s.len()),
        ];
        for (pos, expected) in cases {
            assert_eq!(find_char_boundary(s, pos), expected, "pos = {}", pos);
        }
    }

    /// Forward search from the middle of "世" lands on the start of "界".
    #[test]
    fn test_find_char_boundary_forward() {
        let s = "Hello 世界!";
        assert_eq!(find_char_boundary_forward(s, 7), 9);
    }

    #[test]
    fn test_validate_utf8() {
        assert!(validate_utf8(b"Hello").is_ok());
        assert!(validate_utf8("世界".as_bytes()).is_ok());

        let invalid = [0xFF, 0xFE];
        assert!(validate_utf8(&invalid).is_err());
    }

    #[test]
    fn test_grapheme_count() {
        let cases = [("Hello", 5), ("世界", 2), ("", 0)];
        for (input, expected) in cases {
            assert_eq!(grapheme_count(input), expected, "input = {:?}", input);
        }
    }

    #[test]
    fn test_truncate_graphemes() {
        assert_eq!(truncate_graphemes("Hello", 3), "Hel");
        assert_eq!(truncate_graphemes("世界!", 2), "世界");
        // A limit larger than the string keeps everything.
        assert_eq!(truncate_graphemes("Hello", 10), "Hello");
    }

    #[test]
    fn test_grapheme_byte_position() {
        let s = "Hello 世界";
        // (grapheme index, expected byte offset)
        let cases = [(0, 0), (6, 6), (7, 9)];
        for (index, expected) in cases {
            assert_eq!(grapheme_byte_position(s, index), expected, "index = {}", index);
        }
    }

    #[test]
    fn test_split_sentences() {
        let text = "Hello world. How are you? I am fine!";
        let sentences = split_sentences(text);
        assert_eq!(sentences.len(), 3);
        assert_eq!(sentences[0], "Hello world.");
        assert_eq!(sentences[1], "How are you?");
        assert_eq!(sentences[2], "I am fine!");
    }

    /// Trailing text without terminal punctuation is still returned.
    #[test]
    fn test_split_sentences_no_final_punct() {
        let text = "First sentence. Second part";
        let sentences = split_sentences(text);
        assert_eq!(sentences.len(), 2);
        assert_eq!(sentences[1], "Second part");
    }

    #[test]
    fn test_lines_with_offsets() {
        let text = "Line 1\nLine 2\nLine 3";
        let lines: Vec<_> = lines_with_offsets(text).collect();
        assert_eq!(lines.len(), 3);
        assert_eq!(lines[0], (0, "Line 1"));
    }

    /// Positions at or past the end clamp to the string length.
    #[test]
    fn test_find_char_boundary_forward_at_end() {
        let s = "hello";
        assert_eq!(find_char_boundary_forward(s, 10), 5);
        assert_eq!(find_char_boundary_forward(s, 5), 5);
    }

    /// An index past the last grapheme yields the string length.
    #[test]
    fn test_grapheme_byte_position_out_of_range() {
        let s = "abc";
        assert_eq!(grapheme_byte_position(s, 10), 3);
    }

    #[test]
    fn test_grapheme_byte_position_edge_cases() {
        let s = "Hello 世界";
        // (grapheme index, expected byte offset)
        let cases = [(0, 0), (6, 6), (7, 9), (8, 12), (100, 12)];
        for (index, expected) in cases {
            assert_eq!(grapheme_byte_position(s, index), expected, "index = {}", index);
        }
    }
}