seshat/
unicode.rs

1pub mod props;
2
3pub(crate) mod ucd;
4
5pub(crate) mod hangul;
6
7pub(crate) mod seg;
8
9pub(crate) mod normalization;
10
11use self::props::*;
12
/// Current Unicode version, stored as `major.minor.update` (16.0.0).
///
/// NOTE(review): presumably this is the version of the Unicode data the
/// property tables were generated from; confirm against the `ucd` module.
pub const UNICODE_VERSION: UnicodeVersion = UnicodeVersion {
    major: 16,
    minor: 0,
    update: 0,
};
19
/// A Unicode code point.
///
/// Valid code points range from `0x0` to `0x10FFFF` inclusive. Only the upper
/// bound is enforced by [`CodePoint::new`], so surrogate code points
/// (`0xD800..=0xDFFF`) are accepted as well.
///
/// The type is a thin wrapper around `u32`; it compares, orders and hashes
/// exactly like the wrapped value.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct CodePoint {
    code_point: u32,
}

impl CodePoint {
    /// Creates a new `CodePoint` with the value `cp`.
    ///
    /// # Errors
    ///
    /// Returns `Err` with a static message if `cp` is greater than `0x10FFFF`.
    pub fn new(cp: u32) -> Result<CodePoint, &'static str> {
        if cp > 0x10FFFF {
            return Err("IllegalCodePoint: Code point cannot be over U+10FFFF.");
        }
        Ok(CodePoint { code_point: cp })
    }

    /// Converts this `CodePoint` to its `u32` value.
    pub fn to_u32(&self) -> u32 {
        self.code_point
    }
}
43
/// A Unicode version number made of `major`, `minor` and `update` parts.
///
/// The derived ordering compares `major` first, then `minor`, then `update`.
#[derive(Debug, PartialEq, PartialOrd, Clone, Copy)]
pub struct UnicodeVersion {
    /// Major version component.
    pub major: u32,
    /// Minor version component.
    pub minor: u32,
    /// Update version component.
    pub update: u32,
}

/// Formats the version as `major.minor.update`, e.g. `16.0.0`.
///
/// `Display` is implemented rather than `ToString` so that the type also works
/// with `format!("{}", ..)`; `to_string()` stays available through the
/// standard library's blanket `ToString` implementation.
impl std::fmt::Display for UnicodeVersion {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}.{}.{}", self.major, self.minor, self.update)
    }
}
57
/// Trait that converts a value to a `CodePoint`.
///
/// The source value should always be inside the valid code point range,
/// because the conversion has no way to report a failure.
pub trait ToCodePoint {
    /// Creates a new `CodePoint` from the source value.
    fn to_code_point(&self) -> CodePoint;
}
65
66impl ToCodePoint for char {
67    fn to_code_point(&self) -> CodePoint {
68        CodePoint::new(*self as u32).unwrap()
69    }
70}
71
72impl std::fmt::Display for CodePoint {
73    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
74        write!(f, "U+{:04X}", self.code_point)
75    }
76}
77
/// Trait `Ucd` is a collection of Unicode UCD property methods.
///
/// Every method is named after the short alias of the property it returns.
/// Enumerated properties return one of the types from the `props` module,
/// binary properties return `bool`.
pub trait Ucd {
    /// Get the `dm` property as a `String`.
    ///
    /// NOTE(review): presumably Decomposition_Mapping, whose UCD short alias
    /// is `dm`; confirm against the `ucd::dm` module.
    fn dm(&self) -> String;

    /// Get the Unicode Name(na) property. If there is no Name property, then
    /// returns empty string.
    fn na(&self) -> String;
    /// Get the Unicode Bidi_Class(bc) property.
    fn bc(&self) -> Bc;
    /// Get the Unicode Canonical_Combining_Class(ccc) property.
    fn ccc(&self) -> Ccc;
    /// Get the Unicode Decomposition_Type(dt) property.
    fn dt(&self) -> Dt;
    /// Get the Unicode General_Category(gc) property.
    fn gc(&self) -> Gc;
    /// Get the Unicode Grapheme_Cluster_Break(GCB) property.
    fn gcb(&self) -> Gcb;
    /// Get the Unicode Word_Break(WB) property.
    fn wb(&self) -> Wb;
    /// Get the Unicode Hangul_Syllable_Type(hst) property.
    fn hst(&self) -> Hst;
    /// Get the Unicode Indic_Syllabic_Category (InSC) property.
    fn insc(&self) -> Insc;
    /// Get the Unicode Indic_Conjunct_Break (InCB) property.
    fn incb(&self) -> Incb;
    /// Get the Unicode White_Space(WSpace) property.
    fn wspace(&self) -> bool;
    /// Get the Unicode Bidi_Control(Bidi_C) property.
    fn bidi_c(&self) -> bool;
    /// Get the Unicode Join_Control(Join_C) property.
    fn join_c(&self) -> bool;
    /// Get the Unicode Dash(Dash) property.
    fn dash(&self) -> bool;
    /// Get the Unicode Hyphen(Hyphen) property.
    fn hyphen(&self) -> bool;
    /// Get the Unicode Quotation_Mark(QMark) property.
    fn qmark(&self) -> bool;
    /// Get the Unicode Terminal_Punctuation(Term) property.
    fn term(&self) -> bool;
    /// Get the Unicode Other_Math(OMath) property.
    fn omath(&self) -> bool;
    /// Get the Unicode Hex_Digit(Hex) property.
    fn hex(&self) -> bool;
    /// Get the Unicode ASCII_Hex_Digit(AHex) property.
    fn ahex(&self) -> bool;
    /// Get the Unicode Other_Alphabetic(OAlpha) property.
    fn oalpha(&self) -> bool;
    /// Get the Unicode Ideographic(Ideo) property.
    fn ideo(&self) -> bool;
    /// Get the Unicode Diacritic(Dia) property.
    fn dia(&self) -> bool;
    /// Get the Unicode Extender(Ext) property.
    fn ext(&self) -> bool;
    /// Get the Unicode Other_Lowercase(OLower) property.
    fn olower(&self) -> bool;
    /// Get the Unicode Other_Uppercase(OUpper) property.
    fn oupper(&self) -> bool;
    /// Get the Unicode Noncharacter_Code_Point(NChar) property.
    fn nchar(&self) -> bool;
    /// Get the Unicode Other_Grapheme_Extend(OGr_Ext) property.
    fn ogr_ext(&self) -> bool;
    /// Get the Unicode IDS_Binary_Operator(IDSB) property.
    fn idsb(&self) -> bool;
    /// Get the Unicode IDS_Trinary_Operator(IDST) property.
    fn idst(&self) -> bool;
    /// Get the Unicode Radical(Radical) property.
    fn radical(&self) -> bool;
    /// Get the Unicode Unified_Ideograph(UIdeo) property.
    fn uideo(&self) -> bool;
    /// Get the Unicode Other_Default_Ignorable_Code_Point(ODI) property.
    fn odi(&self) -> bool;
    /// Get the Unicode Deprecated(Dep) property.
    fn dep(&self) -> bool;
    /// Get the Unicode Soft_Dotted(SD) property.
    fn sd(&self) -> bool;
    /// Get the Unicode Logical_Order_Exception(LOE) property.
    fn loe(&self) -> bool;
    /// Get the Unicode Other_ID_Start(OIDS) property.
    fn oids(&self) -> bool;
    /// Get the Unicode Other_ID_Continue(OIDC) property.
    fn oidc(&self) -> bool;
    /// Get the Unicode Sentence_Terminal(STerm) property.
    fn sterm(&self) -> bool;
    /// Get the Unicode Variation_Selector(VS) property.
    fn vs(&self) -> bool;
    /// Get the Unicode Pattern_White_Space(Pat_WS) property.
    fn pat_ws(&self) -> bool;
    /// Get the Unicode Pattern_Syntax(Pat_Syn) property.
    fn pat_syn(&self) -> bool;
    /// Get the Unicode Prepended_Concatenation_Mark(PCM) property.
    fn pcm(&self) -> bool;
    /// Get the Unicode Regional_Indicator(RI) property.
    fn ri(&self) -> bool;
    /// Get the Unicode Composition_Exclusion(CE) property.
    fn ce(&self) -> bool;
    /// Get the Unicode Case_Ignorable(CI) property.
    fn ci(&self) -> bool;
    /// Get the Unicode Full_Composition_Exclusion(Comp_Ex) property.
    fn comp_ex(&self) -> bool;

    /// Get the Unicode Block(blk) property.
    fn blk(&self) -> Blk;
    /// Get the Unicode Script(sc) property.
    fn sc(&self) -> Sc;
    /// Get the Unicode Age(age) property.
    fn age(&self) -> Age;

    /// Get the Unicode Math(Math) property.
    fn math(&self) -> bool;
    /// Get the Unicode Alphabetic(Alpha) property.
    fn alpha(&self) -> bool;
    /// Get the Unicode Lowercase(Lower) property.
    fn lower(&self) -> bool;
    /// Get the Unicode Uppercase(Upper) property.
    fn upper(&self) -> bool;
    /// Get the Unicode Cased(Cased) property.
    fn cased(&self) -> bool;
    /// Get the Unicode Default_Ignorable_Code_Point(DI) property.
    fn di(&self) -> bool;
    /// Get the Unicode Grapheme_Extend(Gr_Ext) property.
    fn gr_ext(&self) -> bool;
    /// Get the Unicode ID_Start (IDS) property.
    fn ids(&self) -> bool;
    /// Get the Unicode ID_Continue (IDC) property.
    fn idc(&self) -> bool;
    /// Get the Unicode XID_Start (XIDS) property.
    fn xids(&self) -> bool;
    /// Get the Unicode XID_Continue (XIDC) property.
    fn xidc(&self) -> bool;

    /// Get the Unicode Emoji(Emoji) property.
    fn emoji(&self) -> bool;
    /// Get the Unicode Emoji_Presentation(EPres) property.
    fn epres(&self) -> bool;
    /// Get the Unicode Emoji_Modifier(EMod) property.
    fn emod(&self) -> bool;
    /// Get the Unicode Emoji_Modifier_Base(EBase) property.
    fn ebase(&self) -> bool;
    /// Get the Unicode Emoji_Component(EComp) property.
    fn ecomp(&self) -> bool;
    /// Get the Unicode Extended_Pictographic(ExtPict) property.
    fn ext_pict(&self) -> bool;
}
221
222impl Ucd for CodePoint {
223    fn dm(&self) -> String {
224        ucd::dm::dm(self.code_point)
225    }
226
227    fn na(&self) -> String {
228        ucd::na::na(self.code_point)
229    }
230
231    fn bc(&self) -> Bc {
232        ucd::bc::bc(self.code_point)
233    }
234
235    fn ccc(&self) -> Ccc {
236        ucd::ccc::ccc(self.code_point)
237    }
238
239    fn dt(&self) -> Dt {
240        ucd::dt::dt(self.code_point)
241    }
242
243    fn gc(&self) -> Gc {
244        ucd::gc::gc(self.code_point)
245    }
246
247    fn gcb(&self) -> Gcb {
248        ucd::gcb::gcb(self.code_point)
249    }
250
251    fn wb(&self) -> Wb {
252        ucd::wb::wb(self.code_point)
253    }
254
255    fn hst(&self) -> Hst {
256        ucd::hst::hst(self.code_point)
257    }
258
259    fn insc(&self) -> Insc {
260        ucd::insc::insc(self.code_point)
261    }
262
263    fn incb(&self) -> Incb {
264        ucd::derived_props::incb(self.code_point)
265    }
266
267    fn wspace(&self) -> bool {
268        ucd::binary_props::wspace(self.code_point)
269    }
270
271    fn bidi_c(&self) -> bool {
272        ucd::binary_props::bidi_c(self.code_point)
273    }
274
275    fn join_c(&self) -> bool {
276        ucd::binary_props::join_c(self.code_point)
277    }
278
279    fn dash(&self) -> bool {
280        ucd::binary_props::dash(self.code_point)
281    }
282
283    fn hyphen(&self) -> bool {
284        ucd::binary_props::hyphen(self.code_point)
285    }
286
287    fn qmark(&self) -> bool {
288        ucd::binary_props::qmark(self.code_point)
289    }
290
291    fn term(&self) -> bool {
292        ucd::binary_props::term(self.code_point)
293    }
294
295    fn omath(&self) -> bool {
296        ucd::binary_props::omath(self.code_point)
297    }
298
299    fn hex(&self) -> bool {
300        ucd::binary_props::hex(self.code_point)
301    }
302
303    fn ahex(&self) -> bool {
304        ucd::binary_props::ahex(self.code_point)
305    }
306
307    fn oalpha(&self) -> bool {
308        ucd::binary_props::oalpha(self.code_point)
309    }
310
311    fn ideo(&self) -> bool {
312        ucd::binary_props::ideo(self.code_point)
313    }
314
315    fn dia(&self) -> bool {
316        ucd::binary_props::dia(self.code_point)
317    }
318
319    fn ext(&self) -> bool {
320        ucd::binary_props::ext(self.code_point)
321    }
322
323    fn olower(&self) -> bool {
324        ucd::binary_props::olower(self.code_point)
325    }
326
327    fn oupper(&self) -> bool {
328        ucd::binary_props::oupper(self.code_point)
329    }
330
331    fn nchar(&self) -> bool {
332        ucd::binary_props::nchar(self.code_point)
333    }
334
335    fn ogr_ext(&self) -> bool {
336        ucd::binary_props::ogr_ext(self.code_point)
337    }
338
339    fn idsb(&self) -> bool {
340        ucd::binary_props::idsb(self.code_point)
341    }
342
343    fn idst(&self) -> bool {
344        ucd::binary_props::idst(self.code_point)
345    }
346
347    fn radical(&self) -> bool {
348        ucd::binary_props::radical(self.code_point)
349    }
350
351    fn uideo(&self) -> bool {
352        ucd::binary_props::uideo(self.code_point)
353    }
354
355    fn odi(&self) -> bool {
356        ucd::binary_props::odi(self.code_point)
357    }
358
359    fn dep(&self) -> bool {
360        ucd::binary_props::dep(self.code_point)
361    }
362
363    fn sd(&self) -> bool {
364        ucd::binary_props::sd(self.code_point)
365    }
366
367    fn loe(&self) -> bool {
368        ucd::binary_props::loe(self.code_point)
369    }
370
371    fn oids(&self) -> bool {
372        ucd::binary_props::oids(self.code_point)
373    }
374
375    fn oidc(&self) -> bool {
376        ucd::binary_props::oidc(self.code_point)
377    }
378
379    fn sterm(&self) -> bool {
380        ucd::binary_props::sterm(self.code_point)
381    }
382
383    fn vs(&self) -> bool {
384        ucd::binary_props::vs(self.code_point)
385    }
386
387    fn pat_ws(&self) -> bool {
388        ucd::binary_props::pat_ws(self.code_point)
389    }
390
391    fn pat_syn(&self) -> bool {
392        ucd::binary_props::pat_syn(self.code_point)
393    }
394
395    fn pcm(&self) -> bool {
396        ucd::binary_props::pcm(self.code_point)
397    }
398
399    fn ri(&self) -> bool {
400        ucd::binary_props::ri(self.code_point)
401    }
402
403    fn ce(&self) -> bool {
404        ucd::ce::ce(self.code_point)
405    }
406
407    fn ci(&self) -> bool {
408        ucd::derived_props::ci(self.code_point)
409    }
410
411    fn comp_ex(&self) -> bool {
412        ucd::normalization_props::comp_ex(self.code_point)
413    }
414
415    fn blk(&self) -> Blk {
416        ucd::blk::blk(self.code_point)
417    }
418
419    fn sc(&self) -> Sc {
420        ucd::sc::sc(self.code_point)
421    }
422
423    fn age(&self) -> Age {
424        ucd::age::age(self.code_point)
425    }
426
427    fn math(&self) -> bool {
428        ucd::derived_props::math(self.code_point)
429    }
430
431    fn alpha(&self) -> bool {
432        ucd::derived_props::alpha(self.code_point)
433    }
434
435    fn lower(&self) -> bool {
436        ucd::derived_props::lower(self.code_point)
437    }
438
439    fn upper(&self) -> bool {
440        ucd::derived_props::upper(self.code_point)
441    }
442
443    fn cased(&self) -> bool {
444        ucd::derived_props::cased(self.code_point)
445    }
446
447    fn di(&self) -> bool {
448        ucd::derived_props::di(self.code_point)
449    }
450
451    fn gr_ext(&self) -> bool {
452        ucd::derived_props::gr_ext(self.code_point)
453    }
454
455    fn ids(&self) -> bool {
456        ucd::derived_props::ids(self.code_point)
457    }
458
459    fn idc(&self) -> bool {
460        ucd::derived_props::idc(self.code_point)
461    }
462
463    fn xids(&self) -> bool {
464        ucd::derived_props::xids(self.code_point)
465    }
466
467    fn xidc(&self) -> bool {
468        ucd::derived_props::xidc(self.code_point)
469    }
470
471    fn emoji(&self) -> bool {
472        ucd::emoji_props::emoji(self.code_point)
473    }
474
475    fn epres(&self) -> bool {
476        ucd::emoji_props::epres(self.code_point)
477    }
478
479    fn emod(&self) -> bool {
480        ucd::emoji_props::emod(self.code_point)
481    }
482
483    fn ebase(&self) -> bool {
484        ucd::emoji_props::ebase(self.code_point)
485    }
486
487    fn ecomp(&self) -> bool {
488        ucd::emoji_props::ecomp(self.code_point)
489    }
490
491    fn ext_pict(&self) -> bool {
492        ucd::emoji_props::ext_pict(self.code_point)
493    }
494}
495
496impl Ucd for char {
497    fn dm(&self) -> String {
498        ucd::dm::dm(*self as u32)
499    }
500
501    fn na(&self) -> String {
502        ucd::na::na(*self as u32)
503    }
504
505    fn bc(&self) -> Bc {
506        ucd::bc::bc(*self as u32)
507    }
508
509    fn ccc(&self) -> Ccc {
510        ucd::ccc::ccc(*self as u32)
511    }
512
513    fn dt(&self) -> Dt {
514        ucd::dt::dt(*self as u32)
515    }
516
517    fn gc(&self) -> Gc {
518        ucd::gc::gc(*self as u32)
519    }
520
521    fn gcb(&self) -> Gcb {
522        ucd::gcb::gcb(*self as u32)
523    }
524
525    fn wb(&self) -> Wb {
526        ucd::wb::wb(*self as u32)
527    }
528
529    fn hst(&self) -> Hst {
530        ucd::hst::hst(*self as u32)
531    }
532
533    fn insc(&self) -> Insc {
534        ucd::insc::insc(*self as u32)
535    }
536
537    fn incb(&self) -> Incb {
538        ucd::derived_props::incb(*self as u32)
539    }
540
541    fn wspace(&self) -> bool {
542        ucd::binary_props::wspace(*self as u32)
543    }
544
545    fn bidi_c(&self) -> bool {
546        ucd::binary_props::bidi_c(*self as u32)
547    }
548
549    fn join_c(&self) -> bool {
550        ucd::binary_props::join_c(*self as u32)
551    }
552
553    fn dash(&self) -> bool {
554        ucd::binary_props::dash(*self as u32)
555    }
556
557    fn hyphen(&self) -> bool {
558        ucd::binary_props::hyphen(*self as u32)
559    }
560
561    fn qmark(&self) -> bool {
562        ucd::binary_props::qmark(*self as u32)
563    }
564
565    fn term(&self) -> bool {
566        ucd::binary_props::term(*self as u32)
567    }
568
569    fn omath(&self) -> bool {
570        ucd::binary_props::omath(*self as u32)
571    }
572
573    fn hex(&self) -> bool {
574        ucd::binary_props::hex(*self as u32)
575    }
576
577    fn ahex(&self) -> bool {
578        ucd::binary_props::ahex(*self as u32)
579    }
580
581    fn oalpha(&self) -> bool {
582        ucd::binary_props::oalpha(*self as u32)
583    }
584
585    fn ideo(&self) -> bool {
586        ucd::binary_props::ideo(*self as u32)
587    }
588
589    fn dia(&self) -> bool {
590        ucd::binary_props::dia(*self as u32)
591    }
592
593    fn ext(&self) -> bool {
594        ucd::binary_props::ext(*self as u32)
595    }
596
597    fn olower(&self) -> bool {
598        ucd::binary_props::olower(*self as u32)
599    }
600
601    fn oupper(&self) -> bool {
602        ucd::binary_props::oupper(*self as u32)
603    }
604
605    fn nchar(&self) -> bool {
606        ucd::binary_props::nchar(*self as u32)
607    }
608
609    fn ogr_ext(&self) -> bool {
610        ucd::binary_props::ogr_ext(*self as u32)
611    }
612
613    fn idsb(&self) -> bool {
614        ucd::binary_props::idsb(*self as u32)
615    }
616
617    fn idst(&self) -> bool {
618        ucd::binary_props::idst(*self as u32)
619    }
620
621    fn radical(&self) -> bool {
622        ucd::binary_props::radical(*self as u32)
623    }
624
625    fn uideo(&self) -> bool {
626        ucd::binary_props::uideo(*self as u32)
627    }
628
629    fn odi(&self) -> bool {
630        ucd::binary_props::odi(*self as u32)
631    }
632
633    fn dep(&self) -> bool {
634        ucd::binary_props::dep(*self as u32)
635    }
636
637    fn sd(&self) -> bool {
638        ucd::binary_props::sd(*self as u32)
639    }
640
641    fn loe(&self) -> bool {
642        ucd::binary_props::loe(*self as u32)
643    }
644
645    fn oids(&self) -> bool {
646        ucd::binary_props::oids(*self as u32)
647    }
648
649    fn oidc(&self) -> bool {
650        ucd::binary_props::oidc(*self as u32)
651    }
652
653    fn sterm(&self) -> bool {
654        ucd::binary_props::sterm(*self as u32)
655    }
656
657    fn vs(&self) -> bool {
658        ucd::binary_props::vs(*self as u32)
659    }
660
661    fn pat_ws(&self) -> bool {
662        ucd::binary_props::pat_ws(*self as u32)
663    }
664
665    fn pat_syn(&self) -> bool {
666        ucd::binary_props::pat_syn(*self as u32)
667    }
668
669    fn pcm(&self) -> bool {
670        ucd::binary_props::pcm(*self as u32)
671    }
672
673    fn ri(&self) -> bool {
674        ucd::binary_props::ri(*self as u32)
675    }
676
677    fn ce(&self) -> bool {
678        ucd::ce::ce(*self as u32)
679    }
680
681    fn ci(&self) -> bool {
682        ucd::derived_props::ci(*self as u32)
683    }
684
685    fn comp_ex(&self) -> bool {
686        ucd::normalization_props::comp_ex(*self as u32)
687    }
688
689    fn blk(&self) -> Blk {
690        ucd::blk::blk(*self as u32)
691    }
692
693    fn sc(&self) -> Sc {
694        ucd::sc::sc(*self as u32)
695    }
696
697    fn age(&self) -> Age {
698        ucd::age::age(*self as u32)
699    }
700
701    fn math(&self) -> bool {
702        ucd::derived_props::math(*self as u32)
703    }
704
705    fn alpha(&self) -> bool {
706        ucd::derived_props::alpha(*self as u32)
707    }
708
709    fn lower(&self) -> bool {
710        ucd::derived_props::lower(*self as u32)
711    }
712
713    fn upper(&self) -> bool {
714        ucd::derived_props::upper(*self as u32)
715    }
716
717    fn cased(&self) -> bool {
718        ucd::derived_props::cased(*self as u32)
719    }
720
721    fn di(&self) -> bool {
722        ucd::derived_props::di(*self as u32)
723    }
724
725    fn gr_ext(&self) -> bool {
726        ucd::derived_props::gr_ext(*self as u32)
727    }
728
729    fn ids(&self) -> bool {
730        ucd::derived_props::ids(*self as u32)
731    }
732
733    fn idc(&self) -> bool {
734        ucd::derived_props::idc(*self as u32)
735    }
736
737    fn xids(&self) -> bool {
738        ucd::derived_props::xids(*self as u32)
739    }
740
741    fn xidc(&self) -> bool {
742        ucd::derived_props::xidc(*self as u32)
743    }
744
745    fn emoji(&self) -> bool {
746        ucd::emoji_props::emoji(*self as u32)
747    }
748
749    fn epres(&self) -> bool {
750        ucd::emoji_props::epres(*self as u32)
751    }
752
753    fn emod(&self) -> bool {
754        ucd::emoji_props::emod(*self as u32)
755    }
756
757    fn ebase(&self) -> bool {
758        ucd::emoji_props::ebase(*self as u32)
759    }
760
761    fn ecomp(&self) -> bool {
762        ucd::emoji_props::ecomp(*self as u32)
763    }
764
765    fn ext_pict(&self) -> bool {
766        ucd::emoji_props::ext_pict(*self as u32)
767    }
768}
769
770pub struct BreakGraphemes<'a> {
771    slice: &'a str,
772}
773
774impl<'a> Iterator for BreakGraphemes<'a> {
775    type Item = &'a str;
776
777    fn next(&mut self) -> Option<&'a str> {
778        // Not iterate if empty string.
779        if self.slice.len() == 0 {
780            return None;
781        }
782
783        let mut iter = self.slice.char_indices();
784        let mut curr = iter.next();
785        let mut next = iter.next();
786        let mut in_ext_pict = false;
787        let mut _ri_count = 0; // Ignore compiler not used warning.
788        let mut in_indic = false;
789        let mut indic_linker = false;
790        loop {
791            let curr_ch = curr.unwrap();
792            let next_ch = next.unwrap_or((self.slice.len(), '\u{0000}'));
793            // Prepare for GB11
794            if curr_ch.1.ext_pict() {
795                in_ext_pict = true;
796            }
797            // Prepare for GB12, GB13
798            if curr_ch.1.gcb() == Gcb::RI {
799                _ri_count += 1;
800            }
801            // Prepare for GB9c
802            if curr_ch.1.incb() == Incb::Consonant {
803                in_indic = true;
804            }
805            if in_indic == true && curr_ch.1.incb() == Incb::Linker {
806                indic_linker = true;
807            }
808            // Do not break between a CR and LF. Otherwise, break before and after controls.
809            // GB3:                  CR × LF
810            if seg::is_gb3(curr_ch.1, next_ch.1) {
811                curr = next;
812                next = iter.next();
813                continue;
814            }
815            // GB4: (Control | CR | LF) ÷
816            if seg::is_gb4(curr_ch.1) {
817                curr = next;
818                // next = iter.next();
819                break;
820            }
821            // GB5:                     ÷ (Control | CR | LF)
822            if seg::is_gb5(next_ch.1) {
823                curr = next;
824                // next = iter.next();
825                break;
826            }
827            // Do not break Hangul syllable sequences.
828            // GB6 	        L × (L | V | LV | LVT)
829            if seg::is_gb6(curr_ch.1, next_ch.1) {
830                curr = next;
831                next = iter.next();
832                continue;
833            }
834            // GB7:  (LV | V) × (V | T)
835            if seg::is_gb7(curr_ch.1, next_ch.1) {
836                curr = next;
837                next = iter.next();
838                continue;
839            }
840            // GB8: (LVT | T) × T
841            if seg::is_gb8(curr_ch.1, next_ch.1) {
842                curr = next;
843                next = iter.next();
844                continue;
845            }
846            // Do not break before extending characters or ZWJ.
847            // GB9:       × (Extend | ZWJ)
848            if seg::is_gb9(next_ch.1) {
849                curr = next;
850                next = iter.next();
851                continue;
852            }
853            // The GB9a and GB9b rules only apply to extended grapheme clusters:
854            // Do not break before SpacingMarks, or after Prepend characters.
855            // GB9a:         × SpacingMark
856            if seg::is_gb9a(next_ch.1) {
857                curr = next;
858                next = iter.next();
859                continue;
860            }
861            // GB9b: Prepend ×
862            if seg::is_gb9b(curr_ch.1) {
863                curr = next;
864                next = iter.next();
865                continue;
866            }
867            // GB9c: Do not break within certain combinations with
868            // Indic_Conjunct_Break (InCB)=Linker.
869            if in_indic {
870                if next_ch.1.incb() == Incb::None {
871                    curr = next;
872                    break;
873                }
874                if curr_ch.1.incb() == Incb::Consonant && next_ch.1.incb() == Incb::Consonant {
875                    curr = next;
876                    break;
877                }
878                if indic_linker {
879                    if curr_ch.1.incb() == Incb::Extend || curr_ch.1.incb() == Incb::Linker
880                        || curr_ch.1.incb() == Incb::Consonant {
881                        curr = next;
882                        next = iter.next();
883                        continue;
884                    }
885                } else {
886                    if curr_ch.1.incb() == Incb::Linker {
887                        indic_linker = true;
888                        curr = next;
889                        next = iter.next();
890                        continue;
891                    } else if curr_ch.1.incb() == Incb::Extend
892                        && next_ch.1.incb() == Incb::Consonant {
893                        curr = next;
894                        break;
895                    } else if curr_ch.1.incb() == Incb::Extend {
896                        curr = next;
897                        next = iter.next();
898                        continue;
899                    } else {
900                        curr = next;
901                        break;
902                    }
903                }
904            }
905            // Do not break within emoji modifier sequences or emoji zwj sequences.
906            // GB11: \p{ExtPict} Extend* ZWJ × \p{ExtPict}
907            if curr_ch.1.ext_pict()
908                && (next_ch.1.ext_pict())
909            {
910                curr = next;
911                // next = iter.next();
912                break;
913            }
914            if curr_ch.1.ext_pict()
915                && (!next_ch.1.ext_pict())
916            {
917                curr = next;
918                break;
919            }
920            if curr_ch.1.ext_pict() {
921                in_ext_pict = true;
922                curr = next;
923                next = iter.next();
924                continue;
925            }
926            if in_ext_pict
927                && (curr_ch.1.gcb() == Gcb::ZWJ
928                    && next_ch.1.ext_pict())
929            {
930                // ExtPict, ZWJ, ExtPict
931                curr = next;
932                next = iter.next();
933                in_ext_pict = false;
934                continue;
935            } else if in_ext_pict
936                && (curr_ch.1.gcb() == Gcb::EX
937                    && next_ch.1.gcb() == Gcb::EX)
938            {
939                // ExtPict, EX
940                curr = next;
941                next = iter.next();
942                continue;
943            } else if in_ext_pict
944                && (curr_ch.1.gcb() == Gcb::EX
945                    && next_ch.1.gcb() == Gcb::ZWJ)
946            {
947                // EX, ZWJ
948                curr = next;
949                next = iter.next();
950                if next.unwrap().1.ext_pict() {
951                    curr = next;
952                    continue;
953                }
954                break;
955            }
956            // Do not break within emoji flag sequences.
957            // That is, do not break between regional indicator (RI) symbols
958            // if there is an odd number of RI characters before the break point.
959            // GB12:   sot (RI RI)* RI × RI
960            // GB13: [^RI] (RI RI)* RI × RI
961            if (curr_ch.1.gcb() == Gcb::RI && next_ch.1.gcb() == Gcb::RI)
962                    && _ri_count % 2 != 0 {
963                curr = next;
964                next = iter.next();
965                continue;
966            } else {
967                _ri_count = 0;
968            }
969
970            // GB999: Any ÷ Any
971            curr = next;
972            break;
973        }
974
975        let tmp = self.slice;
976        // println!("curr: {:?}", curr);
977        // println!("next: {:?}", next);
978        // println!("slice: {}", self.slice);
979        if curr.is_none() {
980            self.slice = &tmp[self.slice.len()..];
981            return Some(&tmp[..]);
982        } else {
983            self.slice = &tmp[curr.unwrap().0..];
984        }
985
986        Some(&tmp[..curr.unwrap().0])
987    }
988}
989
990pub trait Segmentation {
991    fn break_graphemes(&self) -> BreakGraphemes;
992}
993
994impl Segmentation for str {
995    fn break_graphemes(&self) -> BreakGraphemes {
996        BreakGraphemes { slice: self }
997    }
998}
999
/// Unicode normalization operations.
///
/// Each method is named after a Unicode normalization form and returns the
/// converted text as a new `String`.
pub trait Normalization {
    /// Returns the text in NFD (Normalization Form D).
    fn to_nfd(&self) -> String;
    /// Returns the text in NFKD (Normalization Form KD).
    fn to_nfkd(&self) -> String;
    /// Returns the text in NFC (Normalization Form C).
    fn to_nfc(&self) -> String;
    /// Returns the text in NFKC (Normalization Form KC).
    fn to_nfkc(&self) -> String;
}
1006
1007impl Normalization for str {
1008    fn to_nfd(&self) -> String {
1009        let mut result = String::new();
1010        let v = normalization::nfd(self);
1011        for ch in v.iter() {
1012            result.push(*ch);
1013        }
1014
1015        result
1016    }
1017
1018    fn to_nfkd(&self) -> String {
1019        let mut result = String::new();
1020        let v = normalization::nfkd(self);
1021        for ch in v.iter() {
1022            result.push(*ch);
1023        }
1024
1025        result
1026    }
1027
1028    fn to_nfc(&self) -> String {
1029        let mut result = String::new();
1030        let v = normalization::nfc(self);
1031        for ch in v.iter() {
1032            result.push(*ch);
1033        }
1034
1035        result
1036    }
1037
1038    fn to_nfkc(&self) -> String {
1039        let mut result = String::new();
1040        let v = normalization::nfkc(self);
1041        for ch in v.iter() {
1042            result.push(*ch);
1043        }
1044
1045        result
1046    }
1047}