use crate::unicode::width::WidthMethod;
use crate::unicode::width::display_width_with_method;
use unicode_segmentation::UnicodeSegmentation;
/// Layout record for a single grapheme cluster, as produced by [`grapheme_info`].
///
/// The `u8` fields cannot represent values above 255; [`grapheme_info`] clamps
/// anything larger to `u8::MAX`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct GraphemeInfo {
    /// Byte offset of the grapheme's first byte within the source string.
    pub byte_offset: u32,
    /// Length of the grapheme in bytes, clamped to `u8::MAX`.
    pub byte_len: u8,
    /// Display column at which the grapheme starts.
    pub col_offset: u32,
    /// Display width of the grapheme in columns, clamped to `u8::MAX`.
    pub width: u8,
}
/// Iterator over the grapheme clusters of a string slice.
///
/// Thin wrapper around [`unicode_segmentation::Graphemes`]; obtain one via
/// [`graphemes`].
pub struct GraphemeIterator<'a> {
    /// Underlying segmenter that performs the boundary detection.
    inner: unicode_segmentation::Graphemes<'a>,
}
impl<'a> Iterator for GraphemeIterator<'a> {
    type Item = &'a str;

    /// Yields the next grapheme cluster, or `None` once the input is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        self.inner.next()
    }

    /// Forwards the wrapped iterator's bounds.
    ///
    /// Without this override the default `(0, None)` hint would be reported,
    /// hiding the bounds the inner iterator knows about from consumers such as
    /// `collect`.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.inner.size_hint()
    }
}
#[must_use]
pub fn graphemes(s: &str) -> GraphemeIterator<'_> {
GraphemeIterator {
inner: s.graphemes(true),
}
}
pub fn grapheme_indices(s: &str) -> impl Iterator<Item = (usize, &str)> {
s.grapheme_indices(true)
}
/// Returns `true` when every byte of `s` is ASCII (the empty string included).
#[must_use]
pub fn is_ascii_only(s: &str) -> bool {
    let bytes = s.as_bytes();
    bytes.is_ascii()
}
/// Computes a [`GraphemeInfo`] record for every grapheme cluster in `s`.
///
/// * `tab_width` - distance between tab stops in columns; `0` is treated as `1`.
/// * `method` - width calculation method passed to `display_width_with_method`.
///
/// A tab (`"\t"`) is as wide as the distance from the current column to the next
/// tab stop; every other grapheme uses its display width. All narrow fields
/// saturate instead of wrapping: `width` and `byte_len` at `u8::MAX`,
/// `byte_offset` and the running column at `u32::MAX`.
#[must_use]
pub fn grapheme_info(s: &str, tab_width: u32, method: WidthMethod) -> Vec<GraphemeInfo> {
    // Guard against a zero tab width, which would make the `%` below panic.
    let tab_width = tab_width.max(1);
    let mut infos = Vec::new();
    let mut col = 0u32;

    for (byte_offset, grapheme) in s.grapheme_indices(true) {
        let width = if grapheme == "\t" {
            // Columns remaining until the next tab stop (always >= 1).
            let spaces = tab_width - (col % tab_width);
            spaces.min(u32::from(u8::MAX)) as u8
        } else {
            let w = display_width_with_method(grapheme, method);
            w.min(usize::from(u8::MAX)) as u8
        };

        infos.push(GraphemeInfo {
            // Clamp before narrowing so offsets beyond `u32::MAX` saturate
            // rather than silently wrapping around.
            byte_offset: byte_offset.min(u32::MAX as usize) as u32,
            byte_len: grapheme.len().min(usize::from(u8::MAX)) as u8,
            col_offset: col,
            width,
        });

        // Saturate: a plain `+=` would panic in debug builds and wrap in release.
        col = col.saturating_add(u32::from(width));
    }

    infos
}
/// Splits `text` into extended grapheme clusters, pairing each one with the
/// value `display_width` reports for it.
///
/// The returned slices borrow from `text`; an empty input yields an empty vector.
#[must_use]
pub fn split_graphemes_with_widths(text: &str) -> Vec<(&str, usize)> {
    use crate::unicode::display_width;

    let mut pairs = Vec::new();
    for cluster in text.graphemes(true) {
        let columns = display_width(cluster);
        pairs.push((cluster, columns));
    }
    pairs
}
/// Snaps the byte position `pos` back to the start of the grapheme cluster that
/// contains it.
///
/// Returns `pos` unchanged when it already sits on a grapheme boundary, and
/// `text.len()` when `pos` is at or past the end of `text` (so `0` for an empty
/// string). `pos` does not need to be a `char` boundary.
#[must_use]
pub fn find_grapheme_boundary(text: &str, pos: usize) -> usize {
    let len = text.len();
    // Covers the empty string as well: `len` is 0 there, so any `pos` qualifies.
    if pos >= len {
        return len;
    }

    // Grapheme start offsets are strictly increasing, so the last one that does
    // not exceed `pos` is the start of the cluster containing `pos`.
    text.grapheme_indices(true)
        .map(|(start, _)| start)
        .take_while(|&start| start <= pos)
        .last()
        .unwrap_or(0)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Family emoji: man + ZWJ + woman + ZWJ + girl, which forms a single
    /// grapheme cluster. Spelled with escapes so the invisible U+200D joiners
    /// are explicit in the source and cannot be lost by an editor or a paste.
    const ZWJ_FAMILY: &str = "\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}";

    #[test]
    fn test_graphemes_ascii() {
        let g: Vec<_> = graphemes("hello").collect();
        assert_eq!(g, vec!["h", "e", "l", "l", "o"]);
    }

    #[test]
    fn test_graphemes_emoji() {
        assert_eq!(graphemes(ZWJ_FAMILY).count(), 1);
    }

    #[test]
    fn test_graphemes_combining() {
        // 'e' followed by a combining acute accent is one cluster.
        assert_eq!(graphemes("e\u{0301}").count(), 1);
    }

    #[test]
    fn test_grapheme_info_basic() {
        let infos = grapheme_info("ab\tc", 4, WidthMethod::WcWidth);
        assert!(!infos.is_empty());
        assert_eq!(infos[0].byte_offset, 0);
        assert_eq!(infos[0].width, 1);

        // One record per grapheme: 'a', 'b', tab, 'c'.
        assert_eq!(infos.len(), 4);
        // The tab starts at column 2 and pads up to the next stop at column 4.
        assert_eq!(infos[2].byte_offset, 2);
        assert_eq!(infos[2].col_offset, 2);
        assert_eq!(infos[2].width, 2);
        assert_eq!(infos[3].col_offset, 4);
    }

    #[test]
    fn test_grapheme_info_clamping() {
        // One base letter plus 300 combining marks: a single grapheme of 601 bytes.
        let mut huge_grapheme = String::from("a");
        for _ in 0..300 {
            huge_grapheme.push('\u{0301}');
        }
        let infos = grapheme_info(&huge_grapheme, 4, WidthMethod::WcWidth);
        assert_eq!(infos.len(), 1);
        // byte_len saturates at u8::MAX.
        assert_eq!(infos[0].byte_len, 255);

        // A tab stop 300 columns away also saturates at u8::MAX.
        let infos_tab = grapheme_info("\t", 300, WidthMethod::WcWidth);
        assert_eq!(infos_tab.len(), 1);
        assert_eq!(infos_tab[0].width, 255);
    }

    #[test]
    fn test_split_graphemes_with_widths_empty() {
        let pairs = split_graphemes_with_widths("");
        assert!(pairs.is_empty());
    }

    #[test]
    fn test_split_graphemes_with_widths_ascii() {
        let pairs = split_graphemes_with_widths("hello");
        assert_eq!(pairs.len(), 5);
        assert_eq!(pairs[0], ("h", 1));
        assert_eq!(pairs[1], ("e", 1));
        assert_eq!(pairs[2], ("l", 1));
        assert_eq!(pairs[3], ("l", 1));
        assert_eq!(pairs[4], ("o", 1));
    }

    #[test]
    fn test_split_graphemes_with_widths_cjk() {
        let pairs = split_graphemes_with_widths("世界");
        assert_eq!(pairs.len(), 2);
        assert_eq!(pairs[0], ("世", 2));
        assert_eq!(pairs[1], ("界", 2));
    }

    #[test]
    fn test_split_graphemes_with_widths_emoji() {
        let pairs = split_graphemes_with_widths("👍");
        assert_eq!(pairs.len(), 1);
        assert_eq!(pairs[0].0, "👍");
        assert_eq!(pairs[0].1, 2);
    }

    #[test]
    fn test_split_graphemes_with_widths_zwj() {
        let pairs = split_graphemes_with_widths(ZWJ_FAMILY);
        assert_eq!(pairs.len(), 1);
        assert_eq!(pairs[0].0, ZWJ_FAMILY);
        assert_eq!(pairs[0].1, 2);
    }

    #[test]
    fn test_split_graphemes_with_widths_mixed() {
        let pairs = split_graphemes_with_widths("A世👍");
        assert_eq!(pairs.len(), 3);
        assert_eq!(pairs[0], ("A", 1));
        assert_eq!(pairs[1], ("世", 2));
        assert_eq!(pairs[2].0, "👍");
        assert_eq!(pairs[2].1, 2);
    }

    #[test]
    fn test_split_graphemes_with_widths_combining() {
        let pairs = split_graphemes_with_widths("e\u{0301}");
        assert_eq!(pairs.len(), 1);
        assert_eq!(pairs[0].0, "e\u{0301}");
        assert_eq!(pairs[0].1, 1);
    }

    #[test]
    fn test_find_grapheme_boundary_empty() {
        assert_eq!(find_grapheme_boundary("", 0), 0);
        assert_eq!(find_grapheme_boundary("", 5), 0);
    }

    #[test]
    fn test_find_grapheme_boundary_ascii() {
        let text = "hello";
        assert_eq!(find_grapheme_boundary(text, 0), 0);
        assert_eq!(find_grapheme_boundary(text, 1), 1);
        assert_eq!(find_grapheme_boundary(text, 2), 2);
        // End of string is itself a boundary.
        assert_eq!(find_grapheme_boundary(text, 5), 5);
    }

    #[test]
    fn test_find_grapheme_boundary_beyond_string() {
        assert_eq!(find_grapheme_boundary("abc", 10), 3);
    }

    #[test]
    fn test_find_grapheme_boundary_multibyte() {
        // Two CJK characters, three bytes each.
        let text = "世界";
        assert_eq!(find_grapheme_boundary(text, 0), 0);
        assert_eq!(find_grapheme_boundary(text, 1), 0);
        assert_eq!(find_grapheme_boundary(text, 2), 0);
        assert_eq!(find_grapheme_boundary(text, 3), 3);
        assert_eq!(find_grapheme_boundary(text, 4), 3);
        assert_eq!(find_grapheme_boundary(text, 5), 3);
        assert_eq!(find_grapheme_boundary(text, 6), 6);
    }

    #[test]
    fn test_find_grapheme_boundary_combining() {
        // 'e' (1 byte) + combining acute (2 bytes) form one cluster.
        let text = "e\u{0301}";
        assert_eq!(text.len(), 3);
        assert_eq!(find_grapheme_boundary(text, 0), 0);
        assert_eq!(find_grapheme_boundary(text, 1), 0);
        assert_eq!(find_grapheme_boundary(text, 2), 0);
        assert_eq!(find_grapheme_boundary(text, 3), 3);
    }

    #[test]
    fn test_find_grapheme_boundary_emoji() {
        // A single four-byte emoji.
        let text = "👍";
        assert_eq!(find_grapheme_boundary(text, 0), 0);
        assert_eq!(find_grapheme_boundary(text, 1), 0);
        assert_eq!(find_grapheme_boundary(text, 2), 0);
        assert_eq!(find_grapheme_boundary(text, 3), 0);
        assert_eq!(find_grapheme_boundary(text, 4), 4);
    }

    #[test]
    fn test_find_grapheme_boundary_zwj() {
        let text = ZWJ_FAMILY;
        let len = text.len();
        assert!(len > 4);
        assert_eq!(find_grapheme_boundary(text, 0), 0);
        // Every interior byte belongs to the same cluster, which starts at 0.
        for pos in 1..len {
            assert_eq!(
                find_grapheme_boundary(text, pos),
                0,
                "Position {pos} should return 0"
            );
        }
        assert_eq!(find_grapheme_boundary(text, len), len);
    }

    #[test]
    fn test_find_grapheme_boundary_mixed() {
        // Layout: 'A' at 0..1, '世' at 1..4, '👍' at 4..8.
        let text = "A世👍";
        assert_eq!(find_grapheme_boundary(text, 0), 0);
        assert_eq!(find_grapheme_boundary(text, 1), 1);
        assert_eq!(find_grapheme_boundary(text, 2), 1);
        assert_eq!(find_grapheme_boundary(text, 3), 1);
        assert_eq!(find_grapheme_boundary(text, 4), 4);
        assert_eq!(find_grapheme_boundary(text, 5), 4);
        assert_eq!(find_grapheme_boundary(text, 7), 4);
        assert_eq!(find_grapheme_boundary(text, 8), 8);
    }
}