use unicode_segmentation::UnicodeSegmentation;
pub(crate) fn clusters(text: &str) -> impl Iterator<Item = &str> {
text.graphemes(true)
}
/// Counts the grapheme clusters in `text`.
///
/// This is the number of items produced by [`clusters`], which can differ
/// from both the byte length and the `char` count of `text`.
pub(crate) fn grapheme_len(text: &str) -> usize {
    let segments = clusters(text);
    segments.count()
}
/// Splits `text` into its grapheme clusters, returning each one as an owned
/// `String`.
///
/// The pieces appear in their original order, so concatenating them yields
/// `text` again.
pub(crate) fn grapheme_split(text: &str) -> Vec<String> {
    clusters(text).map(String::from).collect::<Vec<String>>()
}
pub(crate) fn truncate_to_graphemes(text: &str, max_graphemes: usize) -> String {
let mut result = String::with_capacity(text.len());
for (count, g) in clusters(text).enumerate() {
if count >= max_graphemes {
break;
}
result.push_str(g);
}
result
}
// Unit and property tests for the grapheme helpers.
//
// All non-ASCII fixtures are spelled with `\u{...}` escapes so the expected
// cluster counts do not depend on how this file is encoded or re-encoded.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_grapheme_len_ascii() {
        assert_eq!(grapheme_len("hello"), 5);
    }

    #[test]
    fn test_grapheme_len_cafe_nfc() {
        // Precomposed U+00E9 (e with acute).
        assert_eq!(grapheme_len("caf\u{00E9}"), 4);
    }

    #[test]
    fn test_grapheme_len_cafe_nfd() {
        // 'e' followed by U+0301 (combining acute) is still one cluster.
        assert_eq!(grapheme_len("cafe\u{0301}"), 4);
    }

    #[test]
    fn test_grapheme_len_family_emoji() {
        // WOMAN, WOMAN, GIRL, BOY joined by ZWJ (U+200D): one cluster.
        assert_eq!(
            grapheme_len("\u{1F469}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}"),
            1
        );
    }

    #[test]
    fn test_grapheme_len_flag() {
        // Regional indicators G + B form a single flag cluster.
        assert_eq!(grapheme_len("\u{1F1EC}\u{1F1E7}"), 1);
    }

    #[test]
    fn test_grapheme_len_hangul() {
        // A precomposed Hangul syllable (U+D55C).
        // NOTE(review): the original literal was not recoverable; confirm the
        // intended syllable if it matters.
        assert_eq!(grapheme_len("\u{D55C}"), 1);
    }

    #[test]
    fn test_grapheme_split_basic() {
        let parts = grapheme_split("caf\u{00E9}");
        assert_eq!(parts, vec!["c", "a", "f", "\u{00E9}"]);
    }

    #[test]
    fn test_grapheme_split_nfd() {
        let parts = grapheme_split("cafe\u{0301}");
        assert_eq!(parts.len(), 4);
        assert_eq!(parts[3], "e\u{0301}");
    }

    #[test]
    fn test_grapheme_truncate_basic() {
        assert_eq!(truncate_to_graphemes("hello world", 5), "hello");
    }

    #[test]
    fn test_grapheme_truncate_emoji() {
        // The ZWJ family sequence must be kept whole, not cut mid-cluster.
        let family = "\u{1F469}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466} family";
        let truncated = truncate_to_graphemes(family, 1);
        assert_eq!(
            truncated,
            "\u{1F469}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}"
        );
    }

    #[test]
    fn test_grapheme_truncate_nfd() {
        let text = "cafe\u{0301}s";
        let truncated = truncate_to_graphemes(text, 4);
        // The combining mark stays attached to its base letter.
        assert_eq!(truncated, "cafe\u{0301}");
    }

    #[test]
    fn test_grapheme_truncate_within_limit() {
        assert_eq!(truncate_to_graphemes("hi", 10), "hi");
    }

    #[test]
    fn test_grapheme_truncate_rejects_negative() {
        let err = crate::error::checked_max_graphemes(-1).unwrap_err();
        assert!(err
            .to_string()
            .contains("max_graphemes must be non-negative, got -1"));
        assert_eq!(crate::error::checked_max_graphemes(0).unwrap(), 0);
    }

    mod proptest_properties {
        use super::*;
        use proptest::prelude::*;

        proptest! {
            #![proptest_config(ProptestConfig::with_cases(1000))]

            // Splitting and counting must agree on the number of clusters.
            #[test]
            fn split_len_consistent(s in "\\PC*") {
                let parts = grapheme_split(&s);
                let len = grapheme_len(&s);
                prop_assert_eq!(parts.len(), len);
            }

            // Concatenating the split pieces reproduces the input exactly.
            #[test]
            fn split_roundtrip(s in "\\PC*") {
                let parts = grapheme_split(&s);
                let joined: String = parts.concat();
                prop_assert_eq!(&joined, &s);
            }

            // Truncation never yields more clusters than requested.
            #[test]
            fn truncate_respects_limit(s in "\\PC*", n in 0..200usize) {
                let result = truncate_to_graphemes(&s, n);
                prop_assert!(grapheme_len(&result) <= n);
            }

            // Truncation only ever removes text from the end.
            #[test]
            fn truncate_is_prefix(s in "\\PC*", n in 0..200usize) {
                let result = truncate_to_graphemes(&s, n);
                prop_assert!(s.starts_with(&result));
            }

            // A limit equal to the cluster count leaves the text untouched.
            #[test]
            fn truncate_at_full_length_is_identity(s in "\\PC*") {
                let len = grapheme_len(&s);
                let result = truncate_to_graphemes(&s, len);
                prop_assert_eq!(&result, &s);
            }
        }
    }
}