grapheme_utils/lib.rs
1//! Extended Grapheme Cluster Utils
2//!
3//! Handy Grapheme Helper Utils
4//!
5//! ```rust
6//! use grapheme_utils::*;
7//!
8//! fn main() {
9//! let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
10//!
11//! println!("num_graphemes {}", num_graphemes(&st)); // Prints 12, the string has 12 grapheme clusters total
12//!
13//! println!("string_width {}", string_width(&st)); // Prints 18, the string uses 18 columns
14//!
15//! println!("nth_grapheme_idx {}", nth_grapheme_idx(&st, 1)); // Prints 6 (index 6)
16//! println!("nth_grapheme_idx {}", nth_grapheme_idx(&st, 2)); // Prints 18 (index 18)
17//!
//!     println!("nth_grapheme {}", nth_grapheme(&st, 1)); // Prints न्दी, the 2nd grapheme in the string (zero-based index 1)
//!     println!("nth_grapheme {}", nth_grapheme(&st, 2)); // Prints H, the 3rd grapheme in the string (zero-based index 2)
20//!
21//! println!("nth grapheme_width {}", nth_grapheme_width(&st, 1)); // Prints 3
22//! println!("nth grapheme_width {}", nth_grapheme_width(&st, 2)); // Prints 1
23//!
24//! // Grapheme Visual Column Width
25//! println!("grapheme_width_at_idx {}", grapheme_width_at_idx(&st, 18)); // Prints 1 (H is 1 column wide)
26//! let num = 7; // Anything between 6 and 17 inclusive
27//! println!(
28//! "grapheme_width_at_idx {}",
29//! grapheme_width_at_idx(&st, num)
30//! ); // Prints 3 (न्दी is 3 columns wide)
31//!
32//! // Grapheme utf8 byte count
33//! println!("grapheme_len {}", grapheme_len(&st, num)); // Prints 12 (न्दी uses 12 utf8 bytes)
34//! println!("grapheme_len {}", grapheme_len(&st, 18)); // Prints 1 (H uses 1 utf8 byte)
35//!
//!     // Matrix of grapheme functions:
//!     //   [previous, current, or next] grapheme given an index
//!     //
//!     // Outputting either the
//!     //   [extended grapheme cluster string, or its starting string index]
//!     //
//!     // Take special note of the graphemes immediately before and after the H:
//!     // the grapheme before it, न्दी, starts at string index 6, H is at 18, and 🧑 is at index 19
44//! //
45//!
46//! // Output string index
47//! println!(
48//! "prev_grapheme_idx_from_idx {}",
49//! prev_grapheme_idx_from_idx(&st, 18)
50//! ); // Prints 6
51//!
52//! println!("grapheme_idx_at_idx {}", grapheme_idx_at_idx(&st, 18)); // Prints 18
53//!
54//! println!(
55//! "next_grapheme_idx_from_idx {}",
56//! next_grapheme_idx_from_idx(&st, 18)
57//! ); // Prints 19
58//!
59//! // Output extended grapheme cluster
60//! println!("prev_grapheme_from_idx {}", prev_grapheme_from_idx(&st, 18)); // Prints न्दी
61//!
62//! println!("grapheme_at_idx {}", grapheme_at_idx(&st, 18)); // Prints H
63//!
64//! println!("next_grapheme_from_idx {}", next_grapheme_from_idx(&st, 18)); // Prints 🧑
65//!
//!     // Note: every function in the matrix above accepts any byte index inside a
//!     // grapheme (such as `num`), not just the exact start of each grapheme.
//!     // Examining the test matrix may be instructive.
69//! }
70//! ```
71
72
// Note: Ease of use over idiomatic Rust
//   This code will return a '' grapheme or an index past the end of the
//   string instead of None, etc.
//
//   When working with text, I find concatenating '' strings much more
//   convenient than checking for and handling None, for example. I can
//   still easily test whether a string is empty (len() == 0) when necessary.
//
// Note: This code is purposefully forgiving.
//   unicode_segmentation will panic if it is ever given an index in the middle
//   of a grapheme. This code is wastefully hardened against those errors.
//
// Note: UTF-8 can encode reversed text (right to left), probably downwards, etc.
//   This crate ignores ALL THAT.
//
87//
88use unicode_segmentation::{GraphemeCursor, UnicodeSegmentation};
89use unicode_width::UnicodeWidthStr;
90
// Notes on Graphemes
//   It's complicated... and with the way the world works, it keeps getting more complicated.
//   One comic suggested that the Unicode Consortium has the job of trying to direct a flooding
//   river with traffic signs!
//
//   A UTF-8 character can be encoded in 1 to 4 bytes. See the table below.
//
//   Extended Grapheme Clusters can consist of multiple UTF-8 characters, many of which modify
//   the initial character, making certain words 100 or more UTF-8 bytes long.
//
//   All together... it's complicated.
//
//
// - UTF-8 encoding follows a pattern (bytes shown as hex, `?` = any hex digit):
//
//     Sequence                                                        Lead byte   Follow bytes
//   - 1-byte sequence: `0xxxxxxx` (ASCII range, covers basic Latin)   0? - 7?     (none)
//   - 2-byte sequence: `110xxxxx 10xxxxxx`                            C? or D?    8?, 9?, A?, or B?
//   - 3-byte sequence: `1110xxxx 10xxxxxx 10xxxxxx`                   E?          8?, 9?, A?, or B?
//   - 4-byte sequence: `11110xxx 10xxxxxx 10xxxxxx 10xxxxxx`          F?          8?, 9?, A?, or B?
109//
// pub enum GraphemeCat - https://github.com/unicode-rs/unicode-segmentation/blob/master/src/grapheme.rs
//   Catch All                    GC_Any,
//   Carriage Return              GC_CR,
//   Control (e.g. < 20h)         GC_Control,
//   Extend                       GC_Extend,
//   Extended Pictographic        GC_Extended_Pictographic,
//   Indic Conjunct Break Cons.   GC_InCB_Consonant,
//   Hangul Leading Jamo (L)      GC_L,
//   Line Feed (LF)               GC_LF,
//   Hangul LV Syllable           GC_LV,
//   Hangul LVT Syllable          GC_LVT,
//   Prepend                      GC_Prepend,
//   Regional Indicator           GC_Regional_Indicator,
//   Spacing Mark                 GC_SpacingMark,
//   Hangul Trailing Jamo (T)     GC_T,
//   Hangul Vowel Jamo (V)        GC_V,
//   Zero Width Joiner            GC_ZWJ,
127//
128// Wide Chars Test
129// "I😂J你KツL한M😂N\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}O👨👩👧👦P\u{FF21}Q\u{20000}R"
130//
// Note:
//   Challenge case - \t (tab) is reported as a single character; its real width depends on
//   the current column and on the tab stops!
133
134
135/// Return the grapheme at the given string idx
136///
137/// ```rust
138/// use grapheme_utils::*;
139///
140/// fn main() {
141/// let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
142///
143/// println!("grapheme_at_idx {}", grapheme_at_idx(&st, 18)); // Prints H
144/// }
145/// ```
146pub fn grapheme_at_idx(st: &str, idx: usize) -> &str {
147 let pos = grapheme_idx_at_idx(&st, idx);
148 &st[pos..pos + st[pos..].graphemes(true).next().unwrap_or("").len()]
149}
150
151/// Grapheme length in Bytes
152///
153/// ```rust
154/// use grapheme_utils::*;
155///
156/// fn main() {
157/// let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
158///
159/// let num = 7; // Anything between 6 and 17 inclusive
160///
161/// // Grapheme utf8 byte count or length in bytes
162/// println!("grapheme_len {}", grapheme_len(&st, num)); // Prints 12 (न्दी uses 12 utf8 bytes)
163/// println!("grapheme_len {}", grapheme_len(&st, 18)); // Prints 1 (H uses 1 utf8 byte)
164/// }
165/// ```
166pub fn grapheme_len(st: &str, idx: usize) -> usize {
167 let pos = grapheme_idx_at_idx(&st, idx);
168 st[pos..].graphemes(true).next().unwrap_or("").len()
169}
170
171/// Starting idx of Grapheme
172///
173/// This returns the starting index for the grapheme given
174/// a string byte idx
175///
176/// Note: This is an example wasteful function, but it
177/// usually returns the same index you're providing.
178///
179///
180/// ```rust
181/// use grapheme_utils::*;
182///
183/// fn main() {
184/// let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
185///
186/// println!("grapheme_idx_at_idx {}", grapheme_idx_at_idx(&st, 18)); // Prints 18
187/// }
188/// ```
189pub fn grapheme_idx_at_idx(st: &str, idx: usize) -> usize {
190 if idx == 0 {
191 return 0;
192 }
193 let mut pos = idx;
194
195 if idx >= st.len() {
196 return st.len();
197 }
198 let mut cursor = GraphemeCursor::new(idx, st.len(), true);
199
200 loop {
201 while pos > 0 && (st.as_bytes()[pos] & 0xc0) == 0x80 {
202 pos -= 1;
203 }
204 cursor.set_cursor(pos);
205 if cursor.is_boundary(st, 0).unwrap_or(false) {
206 break;
207 }
208 pos -= 1;
209 }
210 pos
211}
212
213/// Return the grapheme starting at or after the given byte index in a string.
214///
215/// ```rust
216/// use grapheme_utils::*;
217///
218/// fn main() {
219/// let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
220///
221/// // Grapheme Visual Column Width
222/// println!("grapheme_width_at_idx {}", grapheme_width_at_idx(&st, 18)); // Prints 1 (H is 1 column wide)
223///
224/// let num = 7; // Anything between 6 and 17 inclusive
225/// println!(
226/// "grapheme_width_at_idx {}",
227/// grapheme_width_at_idx(&st, num)
228/// ); // Prints 3 (न्दी is 3 columns wide)
229/// }
230/// ```
231pub fn grapheme_width_at_idx(st: &str, idx: usize) -> usize {
232 let pos = grapheme_idx_at_idx(&st, idx);
233 st[pos..].graphemes(true).next().unwrap_or("").width()
234}
235
236/// Next Grapheme from Current Index
237///
238/// ```rust
239/// use grapheme_utils::*;
240///
241/// fn main() {
242/// let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
243///
244/// println!("next_grapheme_from_idx {}", next_grapheme_from_idx(&st, 18)); // Prints 🧑
245/// }
246/// ```
247pub fn next_grapheme_from_idx(st: &str, idx: usize) -> &str {
248 let st_len = st.len();
249 if idx >= st_len {
250 return "";
251 }
252 let pos = next_grapheme_idx_from_idx(&st, idx);
253 st[pos..].graphemes(true).next().unwrap_or("")
254}
255
256/// Byte Index of the Next Extended Grapheme from Current Index
257///
258/// NOTE: This can return the st.len(), meaning an illegal index
259/// if this is the last Grapheme in the string!
260///
261/// Note: In testing, The Rust library currently divides
262/// some characters that should be singular "🧑🌾"
263///
264/// Note: This code is inefficient... small, but inefficient.
265///
266/// Note: This function has been modified to be panic proof.
267/// The underlying library can panic:
268/// unicode-segmentation-1.12.0/src/grapheme.rs:787:29:
269///
270/// ```rust
271/// use grapheme_utils::*;
272///
273/// fn main() {
274/// let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
275///
276/// println!("next_grapheme_from_idx {}", next_grapheme_from_idx(&st, 18)); // Prints 🧑
277/// }
278/// ```
279// Note: next_boundary is written to only require
280// a short chunk of the string instead of the
281// whole thing:
282// next_boundary(st[beg..beg+ch.len_utf8+etc], beg)
283//
284// next_boundary can then send an error back saying if
285// it needs more (GraphemeIncomplete::NextChunk)
286//
287// It may or may not to land on exact utf8 char
288// boundaries, but it's really hard for prev_grapheme_idx
289// where you need to know the exact info we're wanting.
290//
291pub fn next_grapheme_idx_from_idx(st: &str, idx: usize) -> usize {
292 let st_len = st.len();
293 if idx >= st_len {
294 return st_len;
295 }
296 let mut pos = idx;
297 while pos > 0 && (st.as_bytes()[pos] & 0xc0) == 0x80 {
298 pos -= 1;
299 }
300 let mut cursor = GraphemeCursor::new(pos, st_len, true);
301 cursor
302 .next_boundary(st, 0)
303 .ok()
304 .flatten()
305 .unwrap_or_else(|| st_len)
306}
307
308/// nth Grapheme
309///
310/// Note, this will return the st.len() index if it would be
311/// past the end of the string even if the string
312/// is empty.
313///
314/// ```rust
315/// use grapheme_utils::*;
316///
317/// fn main() {
318/// let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
319///
320/// println!("nth_grapheme {}", nth_grapheme(&st, 1)); // Prints न्दी, the 2nd byte in the string, base 0
321/// println!("nth_grapheme {}", nth_grapheme(&st, 2)); // Prints H, the 3rd bytes in the string, base 0
322/// }
323/// ```
324// UUGH - Full Iter to nth!
325//
326pub fn nth_grapheme(st: &str, nth: usize) -> &str {
327 UnicodeSegmentation::grapheme_indices(st, true)
328 .nth(nth)
329 .map(|(_, g)| g)
330 .unwrap_or_else(|| "")
331}
332
333/// nth Grapheme Index from nth
334///
335/// Note, this will return the st.len() index if it would be
336/// past the end of the string even if the string
337/// is empty.
338///
339/// ```rust
340/// use grapheme_utils::*;
341///
342/// fn main() {
343/// let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
344///
345/// println!("nth_grapheme_idx {}", nth_grapheme_idx(&st, 1)); // Prints 6 (index 6)
346/// println!("nth_grapheme_idx {}", nth_grapheme_idx(&st, 2)); // Prints 18 (index 18)
347/// }
348/// ```
349// Uugh - Full Iter!
350//
351pub fn nth_grapheme_idx(st: &str, nth: usize) -> usize {
352 UnicodeSegmentation::grapheme_indices(st, true)
353 .nth(nth)
354 .map(|(idx, _)| idx)
355 .unwrap_or_else(|| st.len())
356}
357
358/// nth Grapheme Width
359///
360/// ```rust
361/// use grapheme_utils::*;
362///
363/// fn main() {
364/// let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
365///
366/// println!("nth grapheme_width {}", nth_grapheme_width(&st, 1)); // Prints 3
367/// println!("nth grapheme_width {}", nth_grapheme_width(&st, 2)); // Prints 1
368/// }
369/// ```
370pub fn nth_grapheme_width(st: &str, nth: usize) -> usize {
371 UnicodeSegmentation::grapheme_indices(st, true)
372 .nth(nth)
373 .map(|(_, g)| g)
374 .unwrap_or_else(|| "")
375 .width()
376}
377
378/// Num Graphemes In &str
379///
380/// Note, this will return the st.len() index if it would be
381/// past the end of the string even if the string
382/// is empty.
383/// ```rust
384/// use grapheme_utils::*;
385///
386/// fn main() {
387/// let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
388///
389/// println!("num_graphemes {}", num_graphemes(&st)); // Prints 12, the string has 12 grapheme clusters total
390/// }
391/// ```
392pub fn num_graphemes(st: &str) -> usize {
393 UnicodeSegmentation::grapheme_indices(st, true).count()
394}
395
396/// Previoius Grapheme from current idx
397///
398/// ```rust
399/// use grapheme_utils::*;
400///
401/// fn main() {
402/// let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
403///
404/// println!("prev_grapheme_from_idx {}", prev_grapheme_from_idx(&st, 18)); // Prints न्दी
405/// }
406/// ```
407pub fn prev_grapheme_from_idx(st: &str, idx: usize) -> &str {
408 if idx == 0 {
409 return "";
410 }
411 let pos = prev_grapheme_idx_from_idx(&st, idx);
412 grapheme_at_idx(&st, pos)
413}
414
415/// Byte Index of the Previous Extended Grapheme from Current Idx
416///
417/// NOTE: This will return 0, even when the string is empty.
418///
419/// Note: In testing, The Rust library currently divides
420/// some characters that should be singular "🧑🌾"
421///
422/// Note: This function has been modified to be panic proof.
423/// The underlying library can panic:
424/// unicode-segmentation-1.12.0/src/grapheme.rs:787:29:
425///
426/// ```rust
427/// use grapheme_utils::*;
428///
429/// fn main() {
430/// let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
431///
432/// println!(
433/// "prev_grapheme_idx_from_idx {}",
434/// prev_grapheme_idx_from_idx(&st, 18)
435/// ); // Prints 6
436/// }
437/// ```
438// Note: prev_boundary is written to only require
439// a short chunk of the string instead of the
440// whole thing:
441// let beg = idx - prev-ch.len_utf8 - etc
442// prev_boundary(st[beg..idx], beg)
443//
444// This Code could be IMPROVED by implementing PrevChunk
445//
446// next_boundary can then send an error back saying if
447// it needs more (GraphemeIncomplete::PrevChunk)
448//
449// beg MUST land on exact utf8 char
450// boundaries, but it's really hard for prev_grapheme_idx
451// where you need to know the exact info we're wanting.
452pub fn prev_grapheme_idx_from_idx(st: &str, idx: usize) -> usize {
453 let st_len = st.len();
454 if st_len == 0 {
455 return 0;
456 }
457
458 let max_len = st_len.saturating_sub(1);
459
460 let mut pos = idx;
461 while pos <= max_len && (st.as_bytes()[pos] & 0xc0) == 0x80 {
462 pos += 1;
463 }
464 if pos > st_len {
465 pos = st_len;
466 }
467
468 let mut cursor = GraphemeCursor::new(pos, st_len, true);
469 let pos = match cursor.prev_boundary(st, 0) {
470 Ok(Some(prev)) => prev,
471 _ => 0, // If we can't find a valid breakpoint or are at the start, return 0
472 };
473 pos
474}
475
476/// Return the string_width
477///
478/// ```rust
479/// use grapheme_utils::*;
480///
481/// fn main() {
482/// let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
483///
484/// println!("string_width {}", string_width(&st)); // Prints 18, the string uses 18 columns
485/// }
486/// ```
487pub fn string_width(st: &str) -> usize {
488 let mut total = 0;
489 for (_, grapheme) in st.grapheme_indices(true) {
490 total += grapheme.width();
491 }
492 total
493}
494
495
#[cfg(test)]
mod tests {
    use super::*;

    /// One row of an expectation table: the result every public function
    /// should produce for a single probe value.
    ///
    /// The first field is the probe value itself. It is used both as the byte
    /// index handed to the `*_idx` functions and as `nth` for the `nth_*`
    /// functions.
    type TestData = (
        usize,        // testnum
        usize,        // pgifi: prev_grapheme_idx_from_idx
        usize,        // giati: grapheme_idx_at_idx
        usize,        // nxgifi: next_grapheme_idx_from_idx
        &'static str, // pgfi: prev_grapheme_from_idx
        &'static str, // gati: grapheme_at_idx
        &'static str, // nxgfi: next_grapheme_from_idx
        usize,        // gwi: grapheme width_idx
        usize,        // glen: grapheme_len
        &'static str, // nthg: nth_grapheme
        usize,        // ngw: nth_grapheme width
        usize,        // nthgi: nth_grapheme_idx
        usize,        // numg: num_graphemes
        usize,        // sw: string_width
    );

    /// Runs every public function against `expected`, one row per probe
    /// value `0..st.len() + 2`: every byte index of `st`, the end of the
    /// string, and one index past the end.
    fn run_grapheme_test(st: &str, expected: Vec<TestData>) {
        // Note: Testing a known oddity:
        //   The author expected "🧑🌾" to be 1 character (look at it in a
        //   real editor). The library currently reports two clusters and the
        //   tables record that; this is expected to change eventually.
        //
        // Note: the 2 é in "réé" are different.
        //   First one from a French AZERTY keyboard (utf8 bytes c3 a9, or codepoint e9)
        //   Second from the standard linux Ctrl-U character entry
        //   (utf8 bytes 65 cc 81, or codepoint + modifier 65 301)
        let byte_len = st.len();
        println!("Testing grapheme vector for \"{}\"", st);
        // The table must hold two more rows than the input has bytes.
        assert_eq!(expected.len(), byte_len + 2);

        for (i, row) in expected.iter().enumerate() {
            let &(
                testnum,
                want_pgifi,
                want_giati,
                want_nxgifi,
                want_pgfi,
                want_gati,
                want_nxgfi,
                want_gwi,
                want_glen,
                want_nthg,
                want_ngw,
                want_nthgi,
                want_numg,
                want_sw,
            ) = row;

            print!("Testing String: \"{}\", at byte index: {} \n", st, i);
            assert_eq!(testnum, i); // Position index
            print!("i:{} ok, \n", i);

            // Index-returning functions.
            let pgifi = prev_grapheme_idx_from_idx(st, i);
            print!("prev_grapheme_idx_from_idx");
            assert_eq!(want_pgifi, pgifi);
            println!(" ok:{}", pgifi);

            let giati = grapheme_idx_at_idx(st, i);
            print!("grapheme_idx_at_idx");
            assert_eq!(want_giati, giati);
            println!(" ok:{}", giati);

            let nxgifi = next_grapheme_idx_from_idx(st, i);
            print!("next_grapheme_idx_from_idx");
            assert_eq!(want_nxgifi, nxgifi);
            println!(" ok:{}\n", nxgifi);

            // Cluster-returning functions.
            let pgfi = prev_grapheme_from_idx(st, i);
            print!("prev_grapheme_from_idx");
            assert_eq!(want_pgfi, pgfi);
            println!(" ok:{}", pgfi);

            let gati = grapheme_at_idx(st, i);
            print!("grapheme_at_idx");
            assert_eq!(want_gati, gati);
            println!(" ok:{}", gati);

            let nxgfi = next_grapheme_from_idx(st, i);
            print!("next_grapheme_from_idx");
            assert_eq!(want_nxgfi, nxgfi);
            println!(" ok:{}\n", nxgfi);

            // Width and byte length of the cluster at the probe index.
            let gwi = grapheme_width_at_idx(st, i);
            print!("grapheme_width_from_idx");
            assert_eq!(want_gwi, gwi);
            println!(" ok:{}", gwi);

            let glen = grapheme_len(st, i);
            print!("grapheme_len");
            assert_eq!(want_glen, glen);
            println!(" ok:{}", glen);

            // The probe value doubles as `nth` for the nth_* functions.
            let nthg = nth_grapheme(st, i);
            print!("nth_grapheme");
            assert_eq!(want_nthg, nthg);
            println!(" ok:{}", nthg);

            let ngw = nth_grapheme_width(st, i);
            print!("nth grapheme_width");
            assert_eq!(want_ngw, ngw);
            println!(" ok:{}", ngw);

            let nthgi = nth_grapheme_idx(st, i);
            print!("nth_grapheme_idx");
            assert_eq!(want_nthgi, nthgi);
            println!(" ok:{}", nthgi);

            // Whole-string totals; identical in every row of a table.
            let numg = num_graphemes(st);
            print!("num_graphemes");
            assert_eq!(want_numg, numg);
            println!(" ok:{}", numg);

            let sw = string_width(st);
            print!("string_width");
            assert_eq!(want_sw, sw);
            println!(" ok:{}", sw);
        }
    }

    #[test]
    fn test_grapheme_vectors() {
        // Each entry pairs an input string with one expected row per probe
        // value. Several inputs look identical on screen and differ only in
        // how é is encoded; the byte lengths in the rows tell them apart.
        let test_cases: Vec<(&str, Vec<TestData>)> = vec![
            // Empty string.
            (
                "",
                vec![
                    (0, 0, 0, 0, "", "", "", 0, 0, "", 0, 0, 0, 0),
                    (1, 0, 0, 0, "", "", "", 0, 0, "", 0, 0, 0, 0),
                ],
            ),
            // Precomposed é (codepoint e9, 2 bytes).
            (
                "é",
                vec![
                    (0, 0, 0, 2, "", "é", "", 1, 2, "é", 1, 0, 1, 1),
                    (1, 0, 0, 2, "é", "é", "", 1, 2, "", 0, 2, 1, 1),
                    (2, 0, 2, 2, "é", "", "", 0, 0, "", 0, 2, 1, 1),
                    (3, 0, 2, 2, "é", "", "", 0, 0, "", 0, 2, 1, 1),
                ],
            ),
            // Decomposed é (e + combining 301, 3 bytes).
            (
                "é",
                vec![
                    (0, 0, 0, 3, "", "é", "", 1, 3, "é", 1, 0, 1, 1),
                    (1, 0, 0, 3, "é", "é", "", 1, 3, "", 0, 3, 1, 1),
                    (2, 0, 0, 3, "é", "é", "", 1, 3, "", 0, 3, 1, 1),
                    (3, 0, 3, 3, "é", "", "", 0, 0, "", 0, 3, 1, 1),
                    (4, 0, 3, 3, "é", "", "", 0, 0, "", 0, 3, 1, 1),
                ],
            ),
            // a + decomposed é (3 bytes).
            (
                "aé",
                vec![
                    (0, 0, 0, 1, "", "a", "é", 1, 1, "a", 1, 0, 2, 2),
                    (1, 0, 1, 4, "a", "é", "", 1, 3, "é", 1, 1, 2, 2),
                    (2, 1, 1, 4, "é", "é", "", 1, 3, "", 0, 4, 2, 2),
                    (3, 1, 1, 4, "é", "é", "", 1, 3, "", 0, 4, 2, 2),
                    (4, 1, 4, 4, "é", "", "", 0, 0, "", 0, 4, 2, 2),
                    (5, 1, 4, 4, "é", "", "", 0, 0, "", 0, 4, 2, 2),
                ],
            ),
            // a + precomposed é (2 bytes).
            (
                "aé",
                vec![
                    (0, 0, 0, 1, "", "a", "é", 1, 1, "a", 1, 0, 2, 2),
                    (1, 0, 1, 3, "a", "é", "", 1, 2, "é", 1, 1, 2, 2),
                    (2, 1, 1, 3, "é", "é", "", 1, 2, "", 0, 3, 2, 2),
                    (3, 1, 3, 3, "é", "", "", 0, 0, "", 0, 3, 2, 2),
                    (4, 1, 3, 3, "é", "", "", 0, 0, "", 0, 3, 2, 2),
                ],
            ),
            // a + decomposed é (3 bytes), a second time.
            (
                "aé",
                vec![
                    (0, 0, 0, 1, "", "a", "é", 1, 1, "a", 1, 0, 2, 2),
                    (1, 0, 1, 4, "a", "é", "", 1, 3, "é", 1, 1, 2, 2),
                    (2, 1, 1, 4, "é", "é", "", 1, 3, "", 0, 4, 2, 2),
                    (3, 1, 1, 4, "é", "é", "", 1, 3, "", 0, 4, 2, 2),
                    (4, 1, 4, 4, "é", "", "", 0, 0, "", 0, 4, 2, 2),
                    (5, 1, 4, 4, "é", "", "", 0, 0, "", 0, 4, 2, 2),
                ],
            ),
            // Precomposed é (2 bytes) + a.
            (
                "éa",
                vec![
                    (0, 0, 0, 2, "", "é", "a", 1, 2, "é", 1, 0, 2, 2),
                    (1, 0, 0, 2, "é", "é", "a", 1, 2, "a", 1, 2, 2, 2),
                    (2, 0, 2, 3, "é", "a", "", 1, 1, "", 0, 3, 2, 2),
                    (3, 2, 3, 3, "a", "", "", 0, 0, "", 0, 3, 2, 2),
                    (4, 2, 3, 3, "a", "", "", 0, 0, "", 0, 3, 2, 2),
                ],
            ),
            // Decomposed é (3 bytes) + a.
            (
                "éa",
                vec![
                    (0, 0, 0, 3, "", "é", "a", 1, 3, "é", 1, 0, 2, 2),
                    (1, 0, 0, 3, "é", "é", "a", 1, 3, "a", 1, 3, 2, 2),
                    (2, 0, 0, 3, "é", "é", "a", 1, 3, "", 0, 4, 2, 2),
                    (3, 0, 3, 4, "é", "a", "", 1, 1, "", 0, 4, 2, 2),
                    (4, 3, 4, 4, "a", "", "", 0, 0, "", 0, 4, 2, 2),
                    (5, 3, 4, 4, "a", "", "", 0, 0, "", 0, 4, 2, 2),
                ],
            ),
            // Plain ASCII.
            (
                "abcd",
                vec![
                    (0, 0, 0, 1, "", "a", "b", 1, 1, "a", 1, 0, 4, 4),
                    (1, 0, 1, 2, "a", "b", "c", 1, 1, "b", 1, 1, 4, 4),
                    (2, 1, 2, 3, "b", "c", "d", 1, 1, "c", 1, 2, 4, 4),
                    (3, 2, 3, 4, "c", "d", "", 1, 1, "d", 1, 3, 4, 4),
                    (4, 3, 4, 4, "d", "", "", 0, 0, "", 0, 4, 4, 4),
                    (5, 3, 4, 4, "d", "", "", 0, 0, "", 0, 4, 4, 4),
                ],
            ),
            // ASCII followed by one 6-byte, 2-column cluster.
            (
                "abcहि",
                vec![
                    (0, 0, 0, 1, "", "a", "b", 1, 1, "a", 1, 0, 4, 5),
                    (1, 0, 1, 2, "a", "b", "c", 1, 1, "b", 1, 1, 4, 5),
                    (2, 1, 2, 3, "b", "c", "हि", 1, 1, "c", 1, 2, 4, 5),
                    (3, 2, 3, 9, "c", "हि", "", 2, 6, "हि", 2, 3, 4, 5),
                    (4, 3, 3, 9, "हि", "हि", "", 2, 6, "", 0, 9, 4, 5),
                    (5, 3, 3, 9, "हि", "हि", "", 2, 6, "", 0, 9, 4, 5),
                    (6, 3, 3, 9, "हि", "हि", "", 2, 6, "", 0, 9, 4, 5),
                    (7, 3, 3, 9, "हि", "हि", "", 2, 6, "", 0, 9, 4, 5),
                    (8, 3, 3, 9, "हि", "हि", "", 2, 6, "", 0, 9, 4, 5),
                    (9, 3, 9, 9, "हि", "", "", 0, 0, "", 0, 9, 4, 5),
                    (10, 3, 9, 9, "हि", "", "", 0, 0, "", 0, 9, 4, 5),
                ],
            ),
            // The sample string used throughout the documentation (47 bytes,
            // 12 clusters, 18 columns).
            (
                "हिन्दीH🧑🌾e‘︀o‘︁réé",
                vec![
                    (0, 0, 0, 6, "", "हि", "न्दी", 2, 6, "हि", 2, 0, 12, 18),
                    (1, 0, 0, 6, "हि", "हि", "न्दी", 2, 6, "न्दी", 3, 6, 12, 18),
                    (2, 0, 0, 6, "हि", "हि", "न्दी", 2, 6, "H", 1, 18, 12, 18),
                    (3, 0, 0, 6, "हि", "हि", "न्दी", 2, 6, "🧑", 2, 19, 12, 18),
                    (4, 0, 0, 6, "हि", "हि", "न्दी", 2, 6, "🌾", 2, 23, 12, 18),
                    (5, 0, 0, 6, "हि", "हि", "न्दी", 2, 6, "e", 1, 27, 12, 18),
                    (6, 0, 6, 18, "हि", "न्दी", "H", 3, 12, "‘︀", 1, 28, 12, 18),
                    (7, 6, 6, 18, "न्दी", "न्दी", "H", 3, 12, "o", 1, 34, 12, 18),
                    (8, 6, 6, 18, "न्दी", "न्दी", "H", 3, 12, "‘︁", 2, 35, 12, 18),
                    (9, 6, 6, 18, "न्दी", "न्दी", "H", 3, 12, "r", 1, 41, 12, 18),
                    (10, 6, 6, 18, "न्दी", "न्दी", "H", 3, 12, "é", 1, 42, 12, 18),
                    (11, 6, 6, 18, "न्दी", "न्दी", "H", 3, 12, "é", 1, 44, 12, 18),
                    (12, 6, 6, 18, "न्दी", "न्दी", "H", 3, 12, "", 0, 47, 12, 18),
                    (13, 6, 6, 18, "न्दी", "न्दी", "H", 3, 12, "", 0, 47, 12, 18),
                    (14, 6, 6, 18, "न्दी", "न्दी", "H", 3, 12, "", 0, 47, 12, 18),
                    (15, 6, 6, 18, "न्दी", "न्दी", "H", 3, 12, "", 0, 47, 12, 18),
                    (16, 6, 6, 18, "न्दी", "न्दी", "H", 3, 12, "", 0, 47, 12, 18),
                    (17, 6, 6, 18, "न्दी", "न्दी", "H", 3, 12, "", 0, 47, 12, 18),
                    (18, 6, 18, 19, "न्दी", "H", "🧑", 1, 1, "", 0, 47, 12, 18),
                    (19, 18, 19, 23, "H", "🧑", "🌾", 2, 4, "", 0, 47, 12, 18),
                    (20, 19, 19, 23, "🧑", "🧑", "🌾", 2, 4, "", 0, 47, 12, 18),
                    (21, 19, 19, 23, "🧑", "🧑", "🌾", 2, 4, "", 0, 47, 12, 18),
                    (22, 19, 19, 23, "🧑", "🧑", "🌾", 2, 4, "", 0, 47, 12, 18),
                    (23, 19, 23, 27, "🧑", "🌾", "e", 2, 4, "", 0, 47, 12, 18),
                    (24, 23, 23, 27, "🌾", "🌾", "e", 2, 4, "", 0, 47, 12, 18),
                    (25, 23, 23, 27, "🌾", "🌾", "e", 2, 4, "", 0, 47, 12, 18),
                    (26, 23, 23, 27, "🌾", "🌾", "e", 2, 4, "", 0, 47, 12, 18),
                    (27, 23, 27, 28, "🌾", "e", "‘︀", 1, 1, "", 0, 47, 12, 18),
                    (28, 27, 28, 34, "e", "‘︀", "o", 1, 6, "", 0, 47, 12, 18),
                    (29, 28, 28, 34, "‘︀", "‘︀", "o", 1, 6, "", 0, 47, 12, 18),
                    (30, 28, 28, 34, "‘︀", "‘︀", "o", 1, 6, "", 0, 47, 12, 18),
                    (31, 28, 28, 34, "‘︀", "‘︀", "o", 1, 6, "", 0, 47, 12, 18),
                    (32, 28, 28, 34, "‘︀", "‘︀", "o", 1, 6, "", 0, 47, 12, 18),
                    (33, 28, 28, 34, "‘︀", "‘︀", "o", 1, 6, "", 0, 47, 12, 18),
                    (34, 28, 34, 35, "‘︀", "o", "‘︁", 1, 1, "", 0, 47, 12, 18),
                    (35, 34, 35, 41, "o", "‘︁", "r", 2, 6, "", 0, 47, 12, 18),
                    (36, 35, 35, 41, "‘︁", "‘︁", "r", 2, 6, "", 0, 47, 12, 18),
                    (37, 35, 35, 41, "‘︁", "‘︁", "r", 2, 6, "", 0, 47, 12, 18),
                    (38, 35, 35, 41, "‘︁", "‘︁", "r", 2, 6, "", 0, 47, 12, 18),
                    (39, 35, 35, 41, "‘︁", "‘︁", "r", 2, 6, "", 0, 47, 12, 18),
                    (40, 35, 35, 41, "‘︁", "‘︁", "r", 2, 6, "", 0, 47, 12, 18),
                    (41, 35, 41, 42, "‘︁", "r", "é", 1, 1, "", 0, 47, 12, 18),
                    (42, 41, 42, 44, "r", "é", "é", 1, 2, "", 0, 47, 12, 18),
                    (43, 42, 42, 44, "é", "é", "é", 1, 2, "", 0, 47, 12, 18),
                    (44, 42, 44, 47, "é", "é", "", 1, 3, "", 0, 47, 12, 18),
                    (45, 44, 44, 47, "é", "é", "", 1, 3, "", 0, 47, 12, 18),
                    (46, 44, 44, 47, "é", "é", "", 1, 3, "", 0, 47, 12, 18),
                    (47, 44, 47, 47, "é", "", "", 0, 0, "", 0, 47, 12, 18),
                    (48, 44, 47, 47, "é", "", "", 0, 0, "", 0, 47, 12, 18),
                ],
            ),
        ];

        for (st, expected) in test_cases {
            run_grapheme_test(st, expected);
        }
    }

    #[test]
    fn test_num_graphemes() {
        // (input, expected number of grapheme clusters)
        let cases: [(&str, usize); 10] = [
            ("", 0),
            ("hello", 5),
            ("😊", 1),
            ("😊b", 2),
            ("a😊", 2),
            ("😊😊", 2),
            ("hello 😊 world", 13),
            ("é", 1),
            ("áb̌c̃d̄", 4), // four accented letters
            ("🇫🇷", 1),    // French flag
        ];

        for &(text, want) in cases.iter() {
            assert_eq!(num_graphemes(text), want);
        }
    }
}
786}