grapheme_utils/
lib.rs

1//! Extended Grapheme Cluster Utils
2//!
3//! Handy Grapheme Helper Utils
4//!
5//! ```rust
6//! use grapheme_utils::*;
7//! 
8//! fn main() {
9//!     let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
10//! 
11//!     println!("num_graphemes {}", num_graphemes(&st)); // Prints 12, the string has 12 grapheme clusters total
12//! 
13//!     println!("string_width {}", string_width(&st)); // Prints 18, the string uses 18 columns
14//! 
15//!     println!("nth_grapheme_idx {}", nth_grapheme_idx(&st, 1)); // Prints 6 (index 6)
16//!     println!("nth_grapheme_idx {}", nth_grapheme_idx(&st, 2)); // Prints 18 (index 18)
17//! 
//!     println!("nth_grapheme {}", nth_grapheme(&st, 1)); // Prints न्दी, the 2nd grapheme in the string (nth is base 0)
//!     println!("nth_grapheme {}", nth_grapheme(&st, 2)); // Prints H, the 3rd grapheme in the string (nth is base 0)
20//! 
21//!     println!("nth grapheme_width {}", nth_grapheme_width(&st, 1)); // Prints 3
22//!     println!("nth grapheme_width {}", nth_grapheme_width(&st, 2)); // Prints 1
23//! 
24//!     // Grapheme Visual Column Width
25//!     println!("grapheme_width_at_idx {}", grapheme_width_at_idx(&st, 18)); // Prints 1 (H is 1 column wide)
26//!     let num = 7; // Anything between 6 and 17 inclusive
27//!     println!(
28//!         "grapheme_width_at_idx {}",
29//!         grapheme_width_at_idx(&st, num)
30//!     ); // Prints 3 (न्दी is 3 columns wide)
31//! 
32//!     // Grapheme utf8 byte count
33//!     println!("grapheme_len {}", grapheme_len(&st, num)); // Prints 12 (न्दी uses 12 utf8 bytes)
34//!     println!("grapheme_len {}", grapheme_len(&st, 18)); // Prints 1 (H uses 1 utf8 byte)
35//! 
36//!     // Matrix of grapheme functions:
37//!     // [previous, current or next] grapheme given an index
38//!     //
//!     // Outputting either the
40//!     //  [extended grapheme cluster string, or starting string index]
41//!     //
//!     // Take special note of the graphemes immediately before and after the H.
//!     // The grapheme before H (न्दी) starts at string index 6, H is at 18, and the grapheme after H (🧑) is at index 19
44//!     //
45//! 
46//!     // Output string index
47//!     println!(
48//!         "prev_grapheme_idx_from_idx {}",
49//!         prev_grapheme_idx_from_idx(&st, 18)
50//!     ); // Prints 6
51//! 
52//!     println!("grapheme_idx_at_idx {}", grapheme_idx_at_idx(&st, 18)); // Prints 18
53//! 
54//!     println!(
55//!         "next_grapheme_idx_from_idx {}",
56//!         next_grapheme_idx_from_idx(&st, 18)
57//!     ); // Prints 19
58//! 
59//!     // Output extended grapheme cluster
60//!     println!("prev_grapheme_from_idx {}", prev_grapheme_from_idx(&st, 18)); // Prints न्दी
61//! 
62//!     println!("grapheme_at_idx {}", grapheme_at_idx(&st, 18)); // Prints H
63//! 
64//!     println!("next_grapheme_from_idx {}", next_grapheme_from_idx(&st, 18)); // Prints 🧑
65//! 
66//!     // Note, all of the above matrix of functions work with num, the range of inputs
67//!     //       instead of requiring the exact start to each grapheme.
68//!     // Examining the testing matrix may be instructive
69//! }
70//! ```
71
72
// Note: Ease of use over idiomatic Rust
//       This code will return a "" grapheme or an index past the end of the
//       string instead of None, etc.
//
//       When working with text, I find concatenating "" strings much more
//       convenient than checking for and handling None, for example. I can
//       easily test whether a string is empty (len() == 0) when necessary.
80//
// Note: This code is purposefully forgiving.
//       unicode_segmentation will panic if it is ever given an index in the middle of a grapheme.
//       This code is wastefully hardened against those errors.
84//
85// Note: Utf-8 Can encode reverse text (right to left), probably downwards, etc.  
86//       This crate ignores ALL THAT.
87//
88use unicode_segmentation::{GraphemeCursor, UnicodeSegmentation};
89use unicode_width::UnicodeWidthStr;
90
91//  Notes on Graphemes
92//	It's complicated...  and with the way the world works, it keeps getting more complicated.
93//	One comic suggested that the unicode foundation has the job of trying to direct a flooding
94//	river with traffic signs!
95//
96//	A utf-8 character can be encoded in 1 to 4 bytes. See table below.
97//
98//	Extended Graphemes Clusters can consist of multiple utf-8 characters, many of which modify
99//	the initial character, making certain works 100 or more utf-8 bytes long.
100//
101//	All together...  it's complicated.
102//
103//
104//    - UTF-8 encoding follows a pattern:
105//     - 1-byte sequence: `0xxxxxxx` (ASCII range, covers basic Latin)  Follow Bytes are:
106//     - 2-byte sequence: `110xxxxx 10xxxxxx`                   CD      8?, 9?, A?, or B?
107//     - 3-byte sequence: `1110xxxx 10xxxxxx 10xxxxxx`          E?
108//     - 4-byte sequence: `11110xxx 10xxxxxx 10xxxxxx 10xxxxxx` F?
109//
110//  pub enum GraphemeCat - https://github.com/unicode-rs/unicode-segmentation/blob/master/src/grapheme.rs
111//	Catch All	GC_Any,
112//	Carriage Return	GC_CR,
113//	Control < 20h	GC_Control,
114//	Extended	GC_Extend,
115//	Ext Pic		GC_Extended_Pictographic,
//	Indic Conj Brk	GC_InCB_Consonant,
//	Left Part	GC_L,
//	Line Feed (LF)	GC_LF,
//	Letter/Vowel	GC_LV,
//	Let/Vowel/Tone	GC_LVT,
//	Prepend		GC_Prepend,
//	Regional	GC_Regional_Indicator,
//	Visible Join	GC_SpacingMark,
124//	Trailing	GC_T,
125//	Vowel		GC_V,
126//	Zero Width Join	GC_ZWJ,
127//
128//  Wide Chars Test
129//  "I😂J你KツL한M😂N\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}O👨👩👧👦P\u{FF21}Q\u{20000}R"
130//
131//  Note:
//  Challenge Case - \t is reported as a single character, but its real width depends on the column and the tab stops!
133
134
135/// Return the grapheme at the given string idx
136///
137/// ```rust
138/// use grapheme_utils::*;
139/// 
140/// fn main() {
141///     let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
142/// 
143///     println!("grapheme_at_idx {}", grapheme_at_idx(&st, 18)); // Prints H
144/// }
145/// ```
146pub fn grapheme_at_idx(st: &str, idx: usize) -> &str {
147    let pos = grapheme_idx_at_idx(&st, idx);
148    &st[pos..pos + st[pos..].graphemes(true).next().unwrap_or("").len()]
149}
150
151/// Grapheme length in Bytes
152///
153/// ```rust
154/// use grapheme_utils::*;
155/// 
156/// fn main() {
157///     let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
158/// 
159///     let num = 7; // Anything between 6 and 17 inclusive
160/// 
161///     // Grapheme utf8 byte count or length in bytes
162///     println!("grapheme_len {}", grapheme_len(&st, num)); // Prints 12 (न्दी uses 12 utf8 bytes)
163///     println!("grapheme_len {}", grapheme_len(&st, 18)); // Prints 1 (H uses 1 utf8 byte)
164/// }
165/// ```
166pub fn grapheme_len(st: &str, idx: usize) -> usize {
167    let pos = grapheme_idx_at_idx(&st, idx);
168    st[pos..].graphemes(true).next().unwrap_or("").len()
169}
170
171/// Starting idx of Grapheme
172///
173/// This returns the starting index for the grapheme given
174/// a string byte idx
175///
176/// Note: This is an example wasteful function, but it
177///       usually returns the same index you're providing.
178///       
179///   
180/// ```rust
181/// use grapheme_utils::*;
182/// 
183/// fn main() {
184///     let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
185/// 
186///     println!("grapheme_idx_at_idx {}", grapheme_idx_at_idx(&st, 18)); // Prints 18
187/// }
188/// ```
189pub fn grapheme_idx_at_idx(st: &str, idx: usize) -> usize {
190    if idx == 0 {
191        return 0;
192    }
193    let mut pos = idx;
194
195    if idx >= st.len() {
196        return st.len();
197    }
198    let mut cursor = GraphemeCursor::new(idx, st.len(), true);
199
200    loop {
201        while pos > 0 && (st.as_bytes()[pos] & 0xc0) == 0x80 {
202            pos -= 1;
203        }
204        cursor.set_cursor(pos);
205        if cursor.is_boundary(st, 0).unwrap_or(false) {
206            break;
207        }
208        pos -= 1;
209    }
210    pos
211}
212
/// Return the display width (in columns) of the grapheme containing the given byte index in a string.
214///
215/// ```rust
216/// use grapheme_utils::*;
217/// 
218/// fn main() {
219///     let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
220/// 
221///     // Grapheme Visual Column Width
222///     println!("grapheme_width_at_idx {}", grapheme_width_at_idx(&st, 18)); // Prints 1 (H is 1 column wide)
223///
224///     let num = 7; // Anything between 6 and 17 inclusive
225///     println!(
226///         "grapheme_width_at_idx {}",
227///         grapheme_width_at_idx(&st, num)
228///     ); // Prints 3 (न्दी is 3 columns wide)
229/// }
230/// ```
231pub fn grapheme_width_at_idx(st: &str, idx: usize) -> usize {
232    let pos = grapheme_idx_at_idx(&st, idx);
233    st[pos..].graphemes(true).next().unwrap_or("").width()
234}
235
236/// Next Grapheme from Current Index
237///
238/// ```rust
239/// use grapheme_utils::*;
240/// 
241/// fn main() {
242///     let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
243/// 
244///     println!("next_grapheme_from_idx {}", next_grapheme_from_idx(&st, 18)); // Prints 🧑
245/// }
246/// ```
247pub fn next_grapheme_from_idx(st: &str, idx: usize) -> &str {
248    let st_len = st.len();
249    if idx >= st_len {
250        return "";
251    }
252    let pos = next_grapheme_idx_from_idx(&st, idx);
253    st[pos..].graphemes(true).next().unwrap_or("")
254}
255
256/// Byte Index of the Next Extended Grapheme from Current Index
257///
258/// NOTE: This can return the st.len(), meaning an illegal index
259///       if this is the last Grapheme in the string!
260///
261/// Note: In testing, The Rust library currently divides
262///       some characters that should be singular "🧑🌾"
263///
264/// Note: This code is inefficient...  small, but inefficient.
265///
266/// Note: This function has been modified to be panic proof.
267///       The underlying library can panic:
268///       unicode-segmentation-1.12.0/src/grapheme.rs:787:29:
269///
270/// ```rust
271/// use grapheme_utils::*;
272/// 
273/// fn main() {
274///     let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
275/// 
///     println!("next_grapheme_idx_from_idx {}", next_grapheme_idx_from_idx(&st, 18)); // Prints 19
277/// }
278/// ```
279// Note: next_boundary is written to only require
280//       a short chunk of the string instead of the
281//       whole thing:
282//           next_boundary(st[beg..beg+ch.len_utf8+etc], beg)
283//
284//       next_boundary can then send an error back saying if
285//       it needs more (GraphemeIncomplete::NextChunk)
286//
287//       It may or may not to land on exact utf8 char
288//       boundaries, but it's really hard for prev_grapheme_idx
289//       where you need to know the exact info we're wanting.
290//
291pub fn next_grapheme_idx_from_idx(st: &str, idx: usize) -> usize {
292    let st_len = st.len();
293    if idx >= st_len {
294        return st_len;
295    }
296    let mut pos = idx;
297    while pos > 0 && (st.as_bytes()[pos] & 0xc0) == 0x80 {
298        pos -= 1;
299    }
300    let mut cursor = GraphemeCursor::new(pos, st_len, true);
301    cursor
302        .next_boundary(st, 0)
303        .ok()
304        .flatten()
305        .unwrap_or_else(|| st_len)
306}
307
308/// nth Grapheme
309///
/// Note, this will return an empty string ("") if nth is
///       past the last grapheme, even if the string
///       is empty.
313///
314/// ```rust
315/// use grapheme_utils::*;
316/// 
317/// fn main() {
318///     let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
319/// 
///     println!("nth_grapheme {}", nth_grapheme(&st, 1)); // Prints न्दी, the 2nd grapheme in the string (nth is base 0)
///     println!("nth_grapheme {}", nth_grapheme(&st, 2)); // Prints H, the 3rd grapheme in the string (nth is base 0)
322/// }
323/// ```
324// UUGH - Full Iter to nth!
325//
326pub fn nth_grapheme(st: &str, nth: usize) -> &str {
327    UnicodeSegmentation::grapheme_indices(st, true)
328        .nth(nth)
329        .map(|(_, g)| g)
330        .unwrap_or_else(|| "")
331}
332
333/// nth Grapheme Index from nth
334///
335/// Note, this will return the st.len() index if it would be
336///       past the end of the string even if the string
337///       is empty.
338///
339/// ```rust
340/// use grapheme_utils::*;
341/// 
342/// fn main() {
343///     let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
344/// 
345///     println!("nth_grapheme_idx {}", nth_grapheme_idx(&st, 1)); // Prints 6 (index 6)
346///     println!("nth_grapheme_idx {}", nth_grapheme_idx(&st, 2)); // Prints 18 (index 18)
347/// }
348/// ```
349// Uugh - Full Iter!
350//
351pub fn nth_grapheme_idx(st: &str, nth: usize) -> usize {
352    UnicodeSegmentation::grapheme_indices(st, true)
353        .nth(nth)
354        .map(|(idx, _)| idx)
355        .unwrap_or_else(|| st.len())
356}
357
358/// nth Grapheme Width
359///
360/// ```rust
361/// use grapheme_utils::*;
362/// 
363/// fn main() {
364///     let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
365/// 
366///     println!("nth grapheme_width {}", nth_grapheme_width(&st, 1)); // Prints 3
367///     println!("nth grapheme_width {}", nth_grapheme_width(&st, 2)); // Prints 1
368/// }
369/// ```
370pub fn nth_grapheme_width(st: &str, nth: usize) -> usize {
371    UnicodeSegmentation::grapheme_indices(st, true)
372        .nth(nth)
373        .map(|(_, g)| g)
374        .unwrap_or_else(|| "")
375        .width()
376}
377
378/// Num Graphemes In &str
379///
/// Note, this counts extended grapheme clusters, not bytes or chars;
///       an empty string has 0 graphemes.
///
383/// ```rust
384/// use grapheme_utils::*;
385/// 
386/// fn main() {
387///     let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
388/// 
389///     println!("num_graphemes {}", num_graphemes(&st)); // Prints 12, the string has 12 grapheme clusters total
390/// }
391/// ```
392pub fn num_graphemes(st: &str) -> usize {
393    UnicodeSegmentation::grapheme_indices(st, true).count()
394}
395
/// Previous Grapheme from current idx
397///
398/// ```rust
399/// use grapheme_utils::*;
400/// 
401/// fn main() {
402///     let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
403/// 
404///     println!("prev_grapheme_from_idx {}", prev_grapheme_from_idx(&st, 18)); // Prints न्दी
405/// }
406/// ```
407pub fn prev_grapheme_from_idx(st: &str, idx: usize) -> &str {
408    if idx == 0 {
409        return "";
410    }
411    let pos = prev_grapheme_idx_from_idx(&st, idx);
412    grapheme_at_idx(&st, pos)
413}
414
415/// Byte Index of the Previous Extended Grapheme from Current Idx
416///
417/// NOTE: This will return 0, even when the string is empty.
418///
419/// Note: In testing, The Rust library currently divides
420///       some characters that should be singular "🧑🌾"
421///
422/// Note: This function has been modified to be panic proof.
423///       The underlying library can panic:
424///       unicode-segmentation-1.12.0/src/grapheme.rs:787:29:
425///
426/// ```rust
427/// use grapheme_utils::*;
428/// 
429/// fn main() {
430///     let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
431/// 
432///     println!(
433///         "prev_grapheme_idx_from_idx {}",
434///         prev_grapheme_idx_from_idx(&st, 18)
435///     ); // Prints 6
436/// }
437/// ```
438// Note: prev_boundary is written to only require
439//       a short chunk of the string instead of the
440//       whole thing:
441//           let beg = idx - prev-ch.len_utf8 - etc
442//           prev_boundary(st[beg..idx], beg)
443//
444//       This Code could be IMPROVED by implementing PrevChunk
445//
//       prev_boundary can then send an error back saying if
//       it needs more (GraphemeIncomplete::PrevChunk)
448//
449//       beg MUST land on exact utf8 char
450//       boundaries, but it's really hard for prev_grapheme_idx
451//       where you need to know the exact info we're wanting.
452pub fn prev_grapheme_idx_from_idx(st: &str, idx: usize) -> usize {
453    let st_len = st.len();
454    if st_len == 0 {
455        return 0;
456    }
457
458    let max_len = st_len.saturating_sub(1);
459
460    let mut pos = idx;
461    while pos <= max_len && (st.as_bytes()[pos] & 0xc0) == 0x80 {
462        pos += 1;
463    }
464    if pos > st_len {
465        pos = st_len;
466    }
467
468    let mut cursor = GraphemeCursor::new(pos, st_len, true);
469    let pos = match cursor.prev_boundary(st, 0) {
470        Ok(Some(prev)) => prev,
471        _ => 0, // If we can't find a valid breakpoint or are at the start, return 0
472    };
473    pos
474}
475
476/// Return the string_width
477///
478/// ```rust
479/// use grapheme_utils::*;
480/// 
481/// fn main() {
482///     let st = "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string();
483/// 
484///     println!("string_width {}", string_width(&st)); // Prints 18, the string uses 18 columns
485/// }
486/// ```
487pub fn string_width(st: &str) -> usize {
488    let mut total = 0;
489    for (_, grapheme) in st.grapheme_indices(true) {
490        total += grapheme.width();
491    }
492    total
493}
494
495
496#[cfg(test)]
497mod tests {
498    use super::*;
    // One row of expected results for a single probe value `i`.
    // run_grapheme_test uses `i` both as a byte index (for the *_idx
    // functions) and as the zero-based grapheme number (for the nth_*
    // functions), so each row holds the expected answer for both uses.
    type TestData = (
        usize, // testnum: the probe value i; must equal the row's position
        usize, // pgifi: prev_grapheme_idx_from_idx(st, i)
        usize, // giati: grapheme_idx_at_idx(st, i)
        usize, // nxgifi: next_grapheme_idx_from_idx(st, i)
        &'static str, // pgfi: prev_grapheme_from_idx(st, i)
        &'static str, // gati: grapheme_at_idx(st, i)
        &'static str, // nxgfi: next_grapheme_from_idx(st, i)
        usize, // gwi: grapheme_width_at_idx(st, i)
        usize, // glen: grapheme_len(st, i)
        &'static str, // nthg: nth_grapheme(st, i)
        usize, // ngw: nth_grapheme_width(st, i)
        usize, // nthgi: nth_grapheme_idx(st, i)
        usize, // numg: num_graphemes(st), same on every row
        usize, // sw: string_width(st), same on every row
    );
515
516    fn run_grapheme_test(st: &str, expected: Vec<TestData>) {
517        // Note:  Testing An Error:
518        // The Character 🧑🌾 is supposed to be 1 Character.
519        // (Look at it in a real editor)
520        // I expect this to be fixed eventually, but it's here for now.
521        //
522        // Note, the 2 éé are differnt.
523        // First one from a French AZERTY keyboard ( utf8 bytes c3a9, or codepoint e9)
524        // Second from the standard linux Ctrl-U character entry: (utf8 bytes 65 cc 81, or codepoint-modifier 65 301)
525        let string_len = st.len();
526        println!("Testing grapheme vector for \"{}\"", st);
527        assert_eq!(expected.len(), string_len + 2); // Ensure there's one more expected result than the length of the input
528
529        for i in 0..string_len + 2 {
530            print!("Testing String: \"{}\", at byte index: {} \n", st, i);
531            assert_eq!(expected[i].0, i); // Position index
532            print!("i:{} ok, \n", i);
533
534            let pgifi = prev_grapheme_idx_from_idx(st, i);
535            print!("prev_grapheme_idx_from_idx");
536            assert_eq!(expected[i].1, pgifi);
537            println!("  ok:{}", pgifi);
538
539            let giati = grapheme_idx_at_idx(st, i);
540            print!("grapheme_idx_at_idx");
541            assert_eq!(expected[i].2, giati);
542            println!("  ok:{}", giati);
543
544            let nxgifi = next_grapheme_idx_from_idx(st, i);
545            print!("next_grapheme_idx_from_idx");
546            assert_eq!(expected[i].3, nxgifi);
547            println!("  ok:{}\n", nxgifi);
548
549            let pgfi = prev_grapheme_from_idx(st, i);
550            print!("prev_grapheme_from_idx");
551            assert_eq!(expected[i].4, pgfi);
552            println!("  ok:{}", pgfi);
553
554            let gati = grapheme_at_idx(st, i);
555            print!("grapheme_at_idx");
556            assert_eq!(expected[i].5, gati);
557            println!("  ok:{}", gati);
558
559            let nxgfi = next_grapheme_from_idx(st, i);
560            print!("next_grapheme_from_idx");
561            assert_eq!(expected[i].6, nxgfi);
562            println!("  ok:{}\n", nxgfi);
563
564            let gwi = grapheme_width_at_idx(st, i);
565            print!("grapheme_width_from_idx");
566            assert_eq!(expected[i].7, gwi);
567            println!("  ok:{}", gwi);
568
569            let glen = grapheme_len(st, i);
570            print!("grapheme_len");
571            assert_eq!(expected[i].8, glen);
572            println!("  ok:{}", glen);
573
574            let nthg = nth_grapheme(st, i);
575            print!("nth_grapheme");
576            assert_eq!(expected[i].9, nthg);
577            println!("  ok:{}", nthg);
578
579            let ngw = nth_grapheme_width(st, i);
580            print!("nth grapheme_width");
581            assert_eq!(expected[i].10, ngw);
582            println!("  ok:{}", ngw);
583
584            let nthgi = nth_grapheme_idx(st, i);
585            print!("nth_grapheme_idx");
586            assert_eq!(expected[i].11, nthgi);
587            println!("  ok:{}", nthgi);
588
589            let numg = num_graphemes(st);
590            print!("num_graphemes");
591            assert_eq!(expected[i].12, numg);
592            println!("  ok:{}", numg);
593
594            let sw = string_width(st);
595            print!("string_width");
596            assert_eq!(expected[i].13, sw);
597            println!("  ok:{}", sw);
598        }
599    }
600
601    #[test]
602    fn test_grapheme_vectors() {
603        let test_cases: Vec<(String, Vec<TestData>)> = vec![
604            (
605                "".to_string(),
606                vec![
607                    (0, 0, 0, 0, "", "", "", 0, 0, "", 0, 0, 0, 0),
608                    (1, 0, 0, 0, "", "", "", 0, 0, "", 0, 0, 0, 0),
609                ],
610            ),
611            (
612                "é".to_string(),
613                vec![
614                    (0, 0, 0, 2, "", "é", "", 1, 2, "é", 1, 0, 1, 1),
615                    (1, 0, 0, 2, "é", "é", "", 1, 2, "", 0, 2, 1, 1),
616                    (2, 0, 2, 2, "é", "", "", 0, 0, "", 0, 2, 1, 1),
617                    (3, 0, 2, 2, "é", "", "", 0, 0, "", 0, 2, 1, 1),
618                ],
619            ),
620            (
621                "é".to_string(),
622                vec![
623                    (0, 0, 0, 3, "", "é", "", 1, 3, "é", 1, 0, 1, 1),
624                    (1, 0, 0, 3, "é", "é", "", 1, 3, "", 0, 3, 1, 1),
625                    (2, 0, 0, 3, "é", "é", "", 1, 3, "", 0, 3, 1, 1),
626                    (3, 0, 3, 3, "é", "", "", 0, 0, "", 0, 3, 1, 1),
627                    (4, 0, 3, 3, "é", "", "", 0, 0, "", 0, 3, 1, 1),
628                ],
629            ),
630            (
631                "aé".to_string(),
632                vec![
633                    (0, 0, 0, 1, "", "a", "é", 1, 1, "a", 1, 0, 2, 2),
634                    (1, 0, 1, 4, "a", "é", "", 1, 3, "é", 1, 1, 2, 2),
635                    (2, 1, 1, 4, "é", "é", "", 1, 3, "", 0, 4, 2, 2),
636                    (3, 1, 1, 4, "é", "é", "", 1, 3, "", 0, 4, 2, 2),
637                    (4, 1, 4, 4, "é", "", "", 0, 0, "", 0, 4, 2, 2),
638                    (5, 1, 4, 4, "é", "", "", 0, 0, "", 0, 4, 2, 2),
639                ],
640            ),
641            (
642                "aé".to_string(),
643                vec![
644                    (0, 0, 0, 1, "", "a", "é", 1, 1, "a", 1, 0, 2, 2),
645                    (1, 0, 1, 3, "a", "é", "", 1, 2, "é", 1, 1, 2, 2),
646                    (2, 1, 1, 3, "é", "é", "", 1, 2, "", 0, 3, 2, 2),
647                    (3, 1, 3, 3, "é", "", "", 0, 0, "", 0, 3, 2, 2),
648                    (4, 1, 3, 3, "é", "", "", 0, 0, "", 0, 3, 2, 2),
649                ],
650            ),
651            (
652                "aé".to_string(),
653                vec![
654                    (0, 0, 0, 1, "", "a", "é", 1, 1, "a", 1, 0, 2, 2),
655                    (1, 0, 1, 4, "a", "é", "", 1, 3, "é", 1, 1, 2, 2),
656                    (2, 1, 1, 4, "é", "é", "", 1, 3, "", 0, 4, 2, 2),
657                    (3, 1, 1, 4, "é", "é", "", 1, 3, "", 0, 4, 2, 2),
658                    (4, 1, 4, 4, "é", "", "", 0, 0, "", 0, 4, 2, 2),
659                    (5, 1, 4, 4, "é", "", "", 0, 0, "", 0, 4, 2, 2),
660                ],
661            ),
662            (
663                "éa".to_string(),
664                vec![
665                    (0, 0, 0, 2, "", "é", "a", 1, 2, "é", 1, 0, 2, 2),
666                    (1, 0, 0, 2, "é", "é", "a", 1, 2, "a", 1, 2, 2, 2),
667                    (2, 0, 2, 3, "é", "a", "", 1, 1, "", 0, 3, 2, 2),
668                    (3, 2, 3, 3, "a", "", "", 0, 0, "", 0, 3, 2, 2),
669                    (4, 2, 3, 3, "a", "", "", 0, 0, "", 0, 3, 2, 2),
670                ],
671            ),
672            (
673                "éa".to_string(),
674                vec![
675                    (0, 0, 0, 3, "", "é", "a", 1, 3, "é", 1, 0, 2, 2),
676                    (1, 0, 0, 3, "é", "é", "a", 1, 3, "a", 1, 3, 2, 2),
677                    (2, 0, 0, 3, "é", "é", "a", 1, 3, "", 0, 4, 2, 2),
678                    (3, 0, 3, 4, "é", "a", "", 1, 1, "", 0, 4, 2, 2),
679                    (4, 3, 4, 4, "a", "", "", 0, 0, "", 0, 4, 2, 2),
680                    (5, 3, 4, 4, "a", "", "", 0, 0, "", 0, 4, 2, 2),
681                ],
682            ),
683            (
684                "abcd".to_string(),
685                vec![
686                    (0, 0, 0, 1, "", "a", "b", 1, 1, "a", 1, 0, 4, 4),
687                    (1, 0, 1, 2, "a", "b", "c", 1, 1, "b", 1, 1, 4, 4),
688                    (2, 1, 2, 3, "b", "c", "d", 1, 1, "c", 1, 2, 4, 4),
689                    (3, 2, 3, 4, "c", "d", "", 1, 1, "d", 1, 3, 4, 4),
690                    (4, 3, 4, 4, "d", "", "", 0, 0, "", 0, 4, 4, 4),
691                    (5, 3, 4, 4, "d", "", "", 0, 0, "", 0, 4, 4, 4),
692                ],
693            ),
694            (
695                "abcहि".to_string(),
696                vec![
697                    (0, 0, 0, 1, "", "a", "b", 1, 1, "a", 1, 0, 4, 5),
698                    (1, 0, 1, 2, "a", "b", "c", 1, 1, "b", 1, 1, 4, 5),
699                    (2, 1, 2, 3, "b", "c", "हि", 1, 1, "c", 1, 2, 4, 5),
700                    (3, 2, 3, 9, "c", "हि", "", 2, 6, "हि", 2, 3, 4, 5),
701                    (4, 3, 3, 9, "हि", "हि", "", 2, 6, "", 0, 9, 4, 5),
702                    (5, 3, 3, 9, "हि", "हि", "", 2, 6, "", 0, 9, 4, 5),
703                    (6, 3, 3, 9, "हि", "हि", "", 2, 6, "", 0, 9, 4, 5),
704                    (7, 3, 3, 9, "हि", "हि", "", 2, 6, "", 0, 9, 4, 5),
705                    (8, 3, 3, 9, "हि", "हि", "", 2, 6, "", 0, 9, 4, 5),
706                    (9, 3, 9, 9, "हि", "", "", 0, 0, "", 0, 9, 4, 5),
707                    (10, 3, 9, 9, "हि", "", "", 0, 0, "", 0, 9, 4, 5),
708                ],
709            ),
710            (
711                "हिन्दीH🧑🌾e‘︀o‘︁réé".to_string(),
712                vec![
713                    (0, 0, 0, 6, "", "हि", "न्दी", 2, 6, "हि", 2, 0, 12, 18),
714                    (1, 0, 0, 6, "हि", "हि", "न्दी", 2, 6, "न्दी", 3, 6, 12, 18),
715                    (2, 0, 0, 6, "हि", "हि", "न्दी", 2, 6, "H", 1, 18, 12, 18),
716                    (3, 0, 0, 6, "हि", "हि", "न्दी", 2, 6, "🧑", 2, 19, 12, 18),
717                    (4, 0, 0, 6, "हि", "हि", "न्दी", 2, 6, "🌾", 2, 23, 12, 18),
718                    (5, 0, 0, 6, "हि", "हि", "न्दी", 2, 6, "e", 1, 27, 12, 18),
719                    (6, 0, 6, 18, "हि", "न्दी", "H", 3, 12, "‘︀", 1, 28, 12, 18),
720                    (7, 6, 6, 18, "न्दी", "न्दी", "H", 3, 12, "o", 1, 34, 12, 18),
721                    (8, 6, 6, 18, "न्दी", "न्दी", "H", 3, 12, "‘︁", 2, 35, 12, 18),
722                    (9, 6, 6, 18, "न्दी", "न्दी", "H", 3, 12, "r", 1, 41, 12, 18),
723                    (10, 6, 6, 18, "न्दी", "न्दी", "H", 3, 12, "é", 1, 42, 12, 18),
724                    (11, 6, 6, 18, "न्दी", "न्दी", "H", 3, 12, "é", 1, 44, 12, 18),
725                    (12, 6, 6, 18, "न्दी", "न्दी", "H", 3, 12, "", 0, 47, 12, 18),
726                    (13, 6, 6, 18, "न्दी", "न्दी", "H", 3, 12, "", 0, 47, 12, 18),
727                    (14, 6, 6, 18, "न्दी", "न्दी", "H", 3, 12, "", 0, 47, 12, 18),
728                    (15, 6, 6, 18, "न्दी", "न्दी", "H", 3, 12, "", 0, 47, 12, 18),
729                    (16, 6, 6, 18, "न्दी", "न्दी", "H", 3, 12, "", 0, 47, 12, 18),
730                    (17, 6, 6, 18, "न्दी", "न्दी", "H", 3, 12, "", 0, 47, 12, 18),
731                    (18, 6, 18, 19, "न्दी", "H", "🧑", 1, 1, "", 0, 47, 12, 18),
732                    (19, 18, 19, 23, "H", "🧑", "🌾", 2, 4, "", 0, 47, 12, 18),
733                    (20, 19, 19, 23, "🧑", "🧑", "🌾", 2, 4, "", 0, 47, 12, 18),
734                    (21, 19, 19, 23, "🧑", "🧑", "🌾", 2, 4, "", 0, 47, 12, 18),
735                    (22, 19, 19, 23, "🧑", "🧑", "🌾", 2, 4, "", 0, 47, 12, 18),
736                    (23, 19, 23, 27, "🧑", "🌾", "e", 2, 4, "", 0, 47, 12, 18),
737                    (24, 23, 23, 27, "🌾", "🌾", "e", 2, 4, "", 0, 47, 12, 18),
738                    (25, 23, 23, 27, "🌾", "🌾", "e", 2, 4, "", 0, 47, 12, 18),
739                    (26, 23, 23, 27, "🌾", "🌾", "e", 2, 4, "", 0, 47, 12, 18),
740                    (27, 23, 27, 28, "🌾", "e", "‘︀", 1, 1, "", 0, 47, 12, 18),
741                    (28, 27, 28, 34, "e", "‘︀", "o", 1, 6, "", 0, 47, 12, 18),
742                    (29, 28, 28, 34, "‘︀", "‘︀", "o", 1, 6, "", 0, 47, 12, 18),
743                    (30, 28, 28, 34, "‘︀", "‘︀", "o", 1, 6, "", 0, 47, 12, 18),
744                    (31, 28, 28, 34, "‘︀", "‘︀", "o", 1, 6, "", 0, 47, 12, 18),
745                    (32, 28, 28, 34, "‘︀", "‘︀", "o", 1, 6, "", 0, 47, 12, 18),
746                    (33, 28, 28, 34, "‘︀", "‘︀", "o", 1, 6, "", 0, 47, 12, 18),
747                    (34, 28, 34, 35, "‘︀", "o", "‘︁", 1, 1, "", 0, 47, 12, 18),
748                    (35, 34, 35, 41, "o", "‘︁", "r", 2, 6, "", 0, 47, 12, 18),
749                    (36, 35, 35, 41, "‘︁", "‘︁", "r", 2, 6, "", 0, 47, 12, 18),
750                    (37, 35, 35, 41, "‘︁", "‘︁", "r", 2, 6, "", 0, 47, 12, 18),
751                    (38, 35, 35, 41, "‘︁", "‘︁", "r", 2, 6, "", 0, 47, 12, 18),
752                    (39, 35, 35, 41, "‘︁", "‘︁", "r", 2, 6, "", 0, 47, 12, 18),
753                    (40, 35, 35, 41, "‘︁", "‘︁", "r", 2, 6, "", 0, 47, 12, 18),
754                    (41, 35, 41, 42, "‘︁", "r", "é", 1, 1, "", 0, 47, 12, 18),
755                    (42, 41, 42, 44, "r", "é", "é", 1, 2, "", 0, 47, 12, 18),
756                    (43, 42, 42, 44, "é", "é", "é", 1, 2, "", 0, 47, 12, 18),
757                    (44, 42, 44, 47, "é", "é", "", 1, 3, "", 0, 47, 12, 18),
758                    (45, 44, 44, 47, "é", "é", "", 1, 3, "", 0, 47, 12, 18),
759                    (46, 44, 44, 47, "é", "é", "", 1, 3, "", 0, 47, 12, 18),
760                    (47, 44, 47, 47, "é", "", "", 0, 0, "", 0, 47, 12, 18),
761                    (48, 44, 47, 47, "é", "", "", 0, 0, "", 0, 47, 12, 18),
762                ],
763            ),
764        ];
765
766        for (st, expected) in test_cases {
767            run_grapheme_test(&st, expected);
768        }
769    }
770
771    #[test]
772    fn test_num_graphemes() {
773        assert_eq!(num_graphemes(""), 0);
774        assert_eq!(num_graphemes("hello"), 5);
775        assert_eq!(num_graphemes("😊"), 1);
776        assert_eq!(num_graphemes("😊b"), 2);
777        assert_eq!(num_graphemes("a😊"), 2);
778        assert_eq!(num_graphemes("😊😊"), 2);
779        assert_eq!(num_graphemes("hello 😊 world"), 13);
780        assert_eq!(num_graphemes("é"), 1);
781        let complex_str = "áb̌c̃d̄";
782        assert_eq!(num_graphemes(complex_str), 4);
783        let flag_str = "🇫🇷"; // French flag
784        assert_eq!(num_graphemes(flag_str), 1);
785    }
786}
grapheme_utils/lib.rs

grapheme_utils/
lib.rs