Skip to main content

unicode_segmentation/
grapheme.rs

1// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use crate::tables::grapheme::GraphemeCat;
12use core::cmp;
13
/// External iterator for grapheme clusters and byte offsets.
///
/// This struct is created by the [`grapheme_indices`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`grapheme_indices`]: trait.UnicodeSegmentation.html#tymethod.grapheme_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Debug, Clone)]
pub struct GraphemeIndices<'a> {
    /// Address of the first byte of the original string, stored as an integer.
    /// It is subtracted from the address of each yielded slice to obtain that
    /// slice's byte offset within the original string.
    start_offset: usize,
    /// The underlying grapheme iterator that produces the slices.
    iter: Graphemes<'a>,
}
26
27impl<'a> GraphemeIndices<'a> {
28    #[inline]
29    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
30    ///
31    /// ```rust
32    /// # use unicode_segmentation::UnicodeSegmentation;
33    /// let mut iter = "abc".grapheme_indices(true);
34    /// assert_eq!(iter.as_str(), "abc");
35    /// iter.next();
36    /// assert_eq!(iter.as_str(), "bc");
37    /// iter.next();
38    /// iter.next();
39    /// assert_eq!(iter.as_str(), "");
40    /// ```
41    pub fn as_str(&self) -> &'a str {
42        self.iter.as_str()
43    }
44}
45
46impl<'a> Iterator for GraphemeIndices<'a> {
47    type Item = (usize, &'a str);
48
49    #[inline]
50    fn next(&mut self) -> Option<(usize, &'a str)> {
51        self.iter
52            .next()
53            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
54    }
55
56    #[inline]
57    fn size_hint(&self) -> (usize, Option<usize>) {
58        self.iter.size_hint()
59    }
60}
61
62impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
63    #[inline]
64    fn next_back(&mut self) -> Option<(usize, &'a str)> {
65        self.iter
66            .next_back()
67            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
68    }
69}
70
/// External iterator for a string's
/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
///
/// This struct is created by the [`graphemes`] method on the [`UnicodeSegmentation`] trait. See its
/// documentation for more.
///
/// [`graphemes`]: trait.UnicodeSegmentation.html#tymethod.graphemes
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone, Debug)]
pub struct Graphemes<'a> {
    /// The complete string being segmented; yielded clusters are slices of it.
    string: &'a str,
    /// Cursor marking the front of the not-yet-yielded region; advanced by `next`.
    cursor: GraphemeCursor,
    /// Cursor marking the back of the not-yet-yielded region; moved by `next_back`.
    cursor_back: GraphemeCursor,
}
85
86impl<'a> Graphemes<'a> {
87    #[inline]
88    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
89    ///
90    /// ```rust
91    /// # use unicode_segmentation::UnicodeSegmentation;
92    /// let mut iter = "abc".graphemes(true);
93    /// assert_eq!(iter.as_str(), "abc");
94    /// iter.next();
95    /// assert_eq!(iter.as_str(), "bc");
96    /// iter.next();
97    /// iter.next();
98    /// assert_eq!(iter.as_str(), "");
99    /// ```
100    pub fn as_str(&self) -> &'a str {
101        &self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
102    }
103}
104
105impl<'a> Iterator for Graphemes<'a> {
106    type Item = &'a str;
107
108    #[inline]
109    fn size_hint(&self) -> (usize, Option<usize>) {
110        let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
111        (cmp::min(slen, 1), Some(slen))
112    }
113
114    #[inline]
115    fn next(&mut self) -> Option<&'a str> {
116        let start = self.cursor.cur_cursor();
117        if start == self.cursor_back.cur_cursor() {
118            return None;
119        }
120        let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap();
121        Some(&self.string[start..next])
122    }
123}
124
125impl<'a> DoubleEndedIterator for Graphemes<'a> {
126    #[inline]
127    fn next_back(&mut self) -> Option<&'a str> {
128        let end = self.cursor_back.cur_cursor();
129        if end == self.cursor.cur_cursor() {
130            return None;
131        }
132        let prev = self
133            .cursor_back
134            .prev_boundary(self.string, 0)
135            .unwrap()
136            .unwrap();
137        Some(&self.string[prev..end])
138    }
139}
140
141#[inline]
142pub fn new_graphemes(s: &str, is_extended: bool) -> Graphemes<'_> {
143    let len = s.len();
144    Graphemes {
145        string: s,
146        cursor: GraphemeCursor::new(0, len, is_extended),
147        cursor_back: GraphemeCursor::new(len, len, is_extended),
148    }
149}
150
151#[inline]
152pub fn new_grapheme_indices(s: &str, is_extended: bool) -> GraphemeIndices<'_> {
153    GraphemeIndices {
154        start_offset: s.as_ptr() as usize,
155        iter: new_graphemes(s, is_extended),
156    }
157}
158
// TODO: maybe unify with PairResult?
/// An enum describing information about a potential boundary.
#[derive(PartialEq, Eq, Clone, Debug)]
enum GraphemeState {
    /// No information is known.
    Unknown,
    /// It is known to not be a boundary.
    NotBreak,
    /// It is known to be a boundary.
    Break,
    /// The codepoint after it has Indic_Conjunct_Break=Consonant, so (in
    /// extended mode) it is *not* a boundary iff it is preceded by another
    /// InCB=Consonant followed by a sequence consisting of one or more
    /// InCB=Linker and zero or more InCB=Extend (in any order). (GB9c)
    InCbConsonant,
    /// The codepoint after is a Regional Indicator Symbol, so a boundary iff
    /// it is preceded by an even number of RIS codepoints. (GB12, GB13)
    Regional,
    /// The codepoint after is Extended_Pictographic,
    /// so whether it's a boundary depends on pre-context according to GB11.
    Emoji {
        /// Whether the ZWJ char has been seen already and only the
        /// "\p{Extended_Pictographic} Extend*" part of GB11 has to be checked.
        seen_zwj: bool,
    },
}
185
/// Cursor-based segmenter for grapheme clusters.
///
/// This allows working with ropes and other datastructures where the string is not contiguous or
/// fully known at initialization time.
#[derive(Clone, Debug)]
pub struct GraphemeCursor {
    /// Current cursor position, as a byte offset into the whole string.
    offset: usize,
    /// Total length of the string, in bytes.
    len: usize,
    /// A config flag indicating whether this cursor computes legacy or extended
    /// grapheme cluster boundaries (enables GB9a, GB9b and GB9c if set).
    is_extended: bool,
    /// Information about the potential boundary at `offset`
    state: GraphemeState,
    /// Category of codepoint immediately preceding cursor, if known.
    cat_before: Option<GraphemeCat>,
    /// Category of codepoint immediately after cursor, if known.
    cat_after: Option<GraphemeCat>,
    /// If set, at least one more codepoint immediately preceding this offset
    /// is needed to resolve whether there's a boundary at `offset`.
    pre_context_offset: Option<usize>,
    /// The number of `InCB=Linker` codepoints preceding `offset`
    /// (potentially intermingled with `InCB=Extend`), if known.
    incb_linker_count: Option<usize>,
    /// The number of RIS codepoints preceding `offset`. If `pre_context_offset`
    /// is set, then counts the number of RIS between that and `offset`, otherwise
    /// is an accurate count relative to the string.
    ris_count: Option<usize>,
    /// Set if a call to `prev_boundary` or `next_boundary` was suspended due
    /// to needing more input.
    resuming: bool,
    /// Cached grapheme category and associated scalar value range, laid out as
    /// `(range_start, range_end, category)`. A scalar value `c` with
    /// `range_start <= c <= range_end` is answered from the cache.
    grapheme_cat_cache: (u32, u32, GraphemeCat),
}
221
/// An error return indicating that not enough content was available in the
/// provided chunk to satisfy the query, and that more content must be provided.
#[derive(PartialEq, Eq, Debug)]
pub enum GraphemeIncomplete {
    /// More pre-context is needed. The caller should call `provide_context`
    /// with a chunk ending at the offset given, then retry the query. This
    /// will only be returned if the `chunk_start` parameter is nonzero.
    ///
    /// The payload is the byte offset at which the supplied context chunk
    /// must end.
    PreContext(usize),

    /// When requesting `prev_boundary`, the cursor is moving past the beginning
    /// of the current chunk, so the chunk before that is requested. This will
    /// only be returned if the `chunk_start` parameter is nonzero.
    PrevChunk,

    /// When requesting `next_boundary`, the cursor is moving past the end of the
    /// current chunk, so the chunk after that is requested. This will only be
    /// returned if the chunk ends before the `len` parameter provided on
    /// creation of the cursor.
    NextChunk, // requesting chunk following the one given

    /// An error returned when the chunk given does not contain the cursor position.
    InvalidOffset,
}
245
/// An enum describing the result from lookup of a pair of categories.
#[derive(PartialEq, Eq)]
enum PairResult {
    /// definitely not a break
    NotBreak,
    /// definitely a break
    Break,
    /// a break iff not in extended mode
    Extended,
    /// a break unless in extended mode and preceded by
    /// a sequence of 0 or more InCB=Extend and one or more
    /// InCB=Linker (in any order),
    /// preceded by another InCB=Consonant
    InCbConsonant,
    /// a break iff preceded by an even number of RIS
    Regional,
    /// a break unless the ZWJ is preceded by an emoji base
    /// (Extended_Pictographic) and (Extend)*
    Emoji,
}
265
/// Classifies the potential boundary between two adjacent codepoints from
/// their grapheme categories alone.
///
/// `before` is the category of the codepoint preceding the position and
/// `after` the category of the one following it. The result is either a
/// definite answer (`Break` / `NotBreak`) or a variant telling the caller
/// which additional check (extended mode, or look-behind context) decides.
///
/// The arms are tried top to bottom, so their order encodes rule precedence:
/// an earlier arm wins over any later arm that would also match.
#[inline]
fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
    use self::PairResult::*;
    use crate::tables::grapheme::GraphemeCat::*;
    match (before, after) {
        (GC_CR, GC_LF) => NotBreak,                                 // GB3
        (GC_Control | GC_CR | GC_LF, _) => Break,                   // GB4
        (_, GC_Control | GC_CR | GC_LF) => Break,                   // GB5
        (GC_L, GC_L | GC_V | GC_LV | GC_LVT) => NotBreak,           // GB6
        (GC_LV | GC_V, GC_V | GC_T) => NotBreak,                    // GB7
        (GC_LVT | GC_T, GC_T) => NotBreak,                          // GB8
        (_, GC_Extend | GC_ZWJ) => NotBreak,                        // GB9
        (_, GC_SpacingMark) => Extended,                            // GB9a
        (GC_Prepend, _) => Extended,                                // GB9b
        (_, GC_InCB_Consonant) => InCbConsonant,                    // GB9c
        (GC_ZWJ, GC_Extended_Pictographic) => Emoji,                // GB11
        (GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
        (_, _) => Break,                                            // GB999
    }
}
286
287impl GraphemeCursor {
288    /// Create a new cursor. The string and initial offset are given at creation
289    /// time, but the contents of the string are not. The `is_extended` parameter
290    /// controls whether extended grapheme clusters are selected.
291    ///
292    /// The `offset` parameter must be on a codepoint boundary.
293    ///
294    /// ```rust
295    /// # use unicode_segmentation::GraphemeCursor;
296    /// let s = "हिन्दी";
297    /// let mut legacy = GraphemeCursor::new(0, s.len(), false);
298    /// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
299    /// let mut extended = GraphemeCursor::new(0, s.len(), true);
300    /// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
301    /// ```
302    pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
303        let state = if offset == 0 || offset == len {
304            GraphemeState::Break
305        } else {
306            GraphemeState::Unknown
307        };
308        GraphemeCursor {
309            offset,
310            len,
311            state,
312            is_extended,
313            cat_before: None,
314            cat_after: None,
315            pre_context_offset: None,
316            incb_linker_count: None,
317            ris_count: None,
318            resuming: false,
319            grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control),
320        }
321    }
322
323    fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
324        use crate::tables::grapheme as gr;
325        use crate::tables::grapheme::GraphemeCat::*;
326
327        if ch <= '\u{7e}' {
328            // Special-case optimization for ascii, except U+007F.  This
329            // improves performance even for many primarily non-ascii texts,
330            // due to use of punctuation and white space characters from the
331            // ascii range.
332            if ch >= '\u{20}' {
333                GC_Any
334            } else if ch == '\n' {
335                GC_LF
336            } else if ch == '\r' {
337                GC_CR
338            } else {
339                GC_Control
340            }
341        } else {
342            // If this char isn't within the cached range, update the cache to the
343            // range that includes it.
344            if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
345                self.grapheme_cat_cache = gr::grapheme_category(ch);
346            }
347            self.grapheme_cat_cache.2
348        }
349    }
350
351    // Not sure I'm gonna keep this, the advantage over new() seems thin.
352
353    /// Set the cursor to a new location in the same string.
354    ///
355    /// ```rust
356    /// # use unicode_segmentation::GraphemeCursor;
357    /// let s = "abcd";
358    /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
359    /// assert_eq!(cursor.cur_cursor(), 0);
360    /// cursor.set_cursor(2);
361    /// assert_eq!(cursor.cur_cursor(), 2);
362    /// ```
363    pub fn set_cursor(&mut self, offset: usize) {
364        if offset != self.offset {
365            self.offset = offset;
366            self.state = if offset == 0 || offset == self.len {
367                GraphemeState::Break
368            } else {
369                GraphemeState::Unknown
370            };
371            // reset state derived from text around cursor
372            self.cat_before = None;
373            self.cat_after = None;
374            self.incb_linker_count = None;
375            self.ris_count = None;
376        }
377    }
378
    #[inline]
    /// The current offset of the cursor, in bytes from the start of the
    /// string. Equal to the last value provided to `new()` or `set_cursor()`,
    /// or returned from `next_boundary()` or `prev_boundary()`.
    ///
    /// ```rust
    /// # use unicode_segmentation::GraphemeCursor;
    /// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes.
    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
    /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
    /// assert_eq!(cursor.cur_cursor(), 4);
    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
    /// assert_eq!(cursor.cur_cursor(), 8);
    /// ```
    pub fn cur_cursor(&self) -> usize {
        self.offset
    }
396
397    /// Provide additional pre-context when it is needed to decide a boundary.
398    /// The end of the chunk must coincide with the value given in the
399    /// `GraphemeIncomplete::PreContext` request.
400    ///
401    /// ```rust
402    /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
403    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
404    /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
405    /// // Not enough pre-context to decide if there's a boundary between the two flags.
406    /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
407    /// // Provide one more Regional Indicator Symbol of pre-context
408    /// cursor.provide_context(&flags[4..8], 4);
409    /// // Still not enough context to decide.
410    /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
411    /// // Provide additional requested context.
412    /// cursor.provide_context(&flags[0..4], 0);
413    /// // That's enough to decide (it always is when context goes to the start of the string)
414    /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
415    /// ```
416    pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
417        use crate::tables::grapheme as gr;
418        assert!(chunk_start.saturating_add(chunk.len()) == self.pre_context_offset.unwrap());
419        self.pre_context_offset = None;
420        if self.is_extended && chunk_start + chunk.len() == self.offset {
421            let ch = chunk.chars().next_back().unwrap();
422            if self.grapheme_category(ch) == gr::GC_Prepend {
423                self.decide(false); // GB9b
424                return;
425            }
426        }
427        match self.state {
428            GraphemeState::InCbConsonant => self.handle_incb_consonant(chunk, chunk_start),
429            GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
430            GraphemeState::Emoji { seen_zwj } => self.handle_emoji(chunk, chunk_start, seen_zwj),
431            _ => {
432                if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
433                    let ch = chunk.chars().next_back().unwrap();
434                    self.cat_before = Some(self.grapheme_category(ch));
435                }
436            }
437        }
438    }
439
440    #[inline]
441    fn decide(&mut self, is_break: bool) {
442        self.state = if is_break {
443            GraphemeState::Break
444        } else {
445            GraphemeState::NotBreak
446        };
447    }
448
449    #[inline]
450    fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
451        self.decide(is_break);
452        Ok(is_break)
453    }
454
455    #[inline]
456    fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
457        if self.state == GraphemeState::Break {
458            Ok(true)
459        } else if self.state == GraphemeState::NotBreak {
460            Ok(false)
461        } else if let Some(pre_context_offset) = self.pre_context_offset {
462            Err(GraphemeIncomplete::PreContext(pre_context_offset))
463        } else {
464            unreachable!("inconsistent state");
465        }
466    }
467
468    /// For handling rule GB9c:
469    ///
470    /// There's an `InCB=Consonant` after this, and we need to look back
471    /// to verify whether there should be a break.
472    ///
473    /// Seek backward to find an `InCB=Linker` preceded by an `InCB=Consonsnt`
474    /// (potentially separated by some number of `InCB=Linker` or `InCB=Extend`).
475    /// If we find the consonant in question, then there's no break; if we find a consonant
476    /// with no linker, or a non-linker non-extend non-consonant, or the start of text, there's a break;
477    /// otherwise we need more context
478    #[inline]
479    fn handle_incb_consonant(&mut self, chunk: &str, chunk_start: usize) {
480        use crate::tables::{self, grapheme as gr};
481
482        // GB9c only applies to extended grapheme clusters
483        if !self.is_extended {
484            self.decide(true);
485            return;
486        }
487
488        let mut incb_linker_count = self.incb_linker_count.unwrap_or(0);
489
490        for ch in chunk.chars().rev() {
491            if tables::is_incb_linker(ch) {
492                // We found an InCB linker
493                incb_linker_count += 1;
494                self.incb_linker_count = Some(incb_linker_count);
495            } else if tables::derived_property::InCB_Extend(ch) {
496                // We ignore InCB extends, continue
497            } else {
498                // Prev character is neither linker nor extend, break suppressed iff it's InCB=Consonant
499                let result = !(self.incb_linker_count.unwrap_or(0) > 0
500                    && self.grapheme_category(ch) == gr::GC_InCB_Consonant);
501                self.decide(result);
502                return;
503            }
504        }
505
506        if chunk_start == 0 {
507            // Start of text and we still haven't found a consonant, so break
508            self.decide(true);
509        } else {
510            // We need more context
511            self.pre_context_offset = Some(chunk_start);
512            self.state = GraphemeState::InCbConsonant;
513        }
514    }
515
516    #[inline]
517    fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
518        use crate::tables::grapheme as gr;
519        let mut ris_count = self.ris_count.unwrap_or(0);
520        for ch in chunk.chars().rev() {
521            if self.grapheme_category(ch) != gr::GC_Regional_Indicator {
522                self.ris_count = Some(ris_count);
523                self.decide(ris_count % 2 == 0);
524                return;
525            }
526            ris_count += 1;
527        }
528        self.ris_count = Some(ris_count);
529        if chunk_start == 0 {
530            self.decide(ris_count % 2 == 0);
531        } else {
532            self.pre_context_offset = Some(chunk_start);
533            self.state = GraphemeState::Regional;
534        }
535    }
536
537    #[inline]
538    fn handle_emoji(&mut self, chunk: &str, chunk_start: usize, mut seen_zwj: bool) {
539        // \p{Extended_Pictographic} Extend* ZWJ 	× 	\p{Extended_Pictographic}
540        use crate::tables::grapheme as gr;
541        let mut iter = chunk.chars().rev();
542        if !seen_zwj {
543            if let Some(ch) = iter.next() {
544                if self.grapheme_category(ch) != gr::GC_ZWJ {
545                    self.decide(true);
546                    return;
547                } else {
548                    seen_zwj = true;
549                }
550            }
551        }
552        for ch in iter {
553            match self.grapheme_category(ch) {
554                gr::GC_Extend => (),
555                gr::GC_Extended_Pictographic => {
556                    self.decide(false);
557                    return;
558                }
559                _ => {
560                    self.decide(true);
561                    return;
562                }
563            }
564        }
565        if chunk_start == 0 {
566            self.decide(true);
567        } else {
568            self.pre_context_offset = Some(chunk_start);
569            self.state = GraphemeState::Emoji { seen_zwj };
570        }
571    }
572
    #[inline]
    /// Determine whether the current cursor location is a grapheme cluster boundary.
    /// Only a part of the string need be supplied. If `chunk_start` is nonzero or
    /// the length of `chunk` is not equal to `len` on creation, then this method
    /// may return `GraphemeIncomplete::PreContext`. The caller should then
    /// call `provide_context` with the requested chunk, then retry calling this
    /// method.
    ///
    /// For partial chunks, if the cursor is not at the beginning or end of the
    /// string, the chunk should contain at least the codepoint following the cursor.
    /// If the string is nonempty, the chunk must be nonempty.
    ///
    /// All calls should have consistent chunk contents (ie, if a chunk provides
    /// content for a given slice, all further chunks covering that slice must have
    /// the same content for it).
    ///
    /// `chunk_start` is the byte offset of the first byte of `chunk` within
    /// the whole string.
    ///
    /// # Errors
    ///
    /// * `GraphemeIncomplete::InvalidOffset` if the chunk does not contain the
    ///   cursor position.
    /// * `GraphemeIncomplete::PreContext` if text preceding the chunk is
    ///   needed to decide.
    ///
    /// ```rust
    /// # use unicode_segmentation::GraphemeCursor;
    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
    /// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
    /// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
    /// cursor.set_cursor(12);
    /// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
    /// ```
    pub fn is_boundary(
        &mut self,
        chunk: &str,
        chunk_start: usize,
    ) -> Result<bool, GraphemeIncomplete> {
        use crate::tables::grapheme as gr;
        // An answer already recorded for this position is returned as is.
        if self.state == GraphemeState::Break {
            return Ok(true);
        }
        if self.state == GraphemeState::NotBreak {
            return Ok(false);
        }
        // Reject a chunk that does not contain the cursor, unless the category
        // of the following codepoint is already cached and the cursor is not
        // beyond the chunk's end.
        if (self.offset < chunk_start || self.offset >= chunk_start.saturating_add(chunk.len()))
            && (self.offset > chunk_start.saturating_add(chunk.len()) || self.cat_after.is_none())
        {
            return Err(GraphemeIncomplete::InvalidOffset);
        }
        // A context request is still unanswered: repeat it.
        if let Some(pre_context_offset) = self.pre_context_offset {
            return Err(GraphemeIncomplete::PreContext(pre_context_offset));
        }
        let offset_in_chunk = self.offset.saturating_sub(chunk_start);
        if self.cat_after.is_none() {
            let ch = chunk[offset_in_chunk..].chars().next().unwrap();
            self.cat_after = Some(self.grapheme_category(ch));
        }
        // The cursor sits at the very start of the chunk, so nothing before it
        // is visible here.
        if self.offset == chunk_start {
            let mut need_pre_context = true;
            match self.cat_after.unwrap() {
                // These categories need a look-behind scan; remember which one
                // so that `provide_context` dispatches to the right handler.
                gr::GC_InCB_Consonant => self.state = GraphemeState::InCbConsonant,
                gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
                gr::GC_Extended_Pictographic => {
                    self.state = GraphemeState::Emoji { seen_zwj: false }
                }
                // Otherwise only the preceding category is needed, and it may
                // already be cached.
                _ => need_pre_context = self.cat_before.is_none(),
            }
            if need_pre_context {
                self.pre_context_offset = Some(chunk_start);
                return Err(GraphemeIncomplete::PreContext(chunk_start));
            }
        }
        if self.cat_before.is_none() {
            let ch = chunk[..offset_in_chunk].chars().next_back().unwrap();
            self.cat_before = Some(self.grapheme_category(ch));
        }
        // Decide from the category pair; some results need a look-behind scan
        // over the part of the chunk that precedes the cursor.
        match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
            PairResult::NotBreak => self.decision(false),
            PairResult::Break => self.decision(true),
            PairResult::Extended => {
                let is_extended = self.is_extended;
                self.decision(!is_extended)
            }
            PairResult::InCbConsonant => {
                self.handle_incb_consonant(&chunk[..offset_in_chunk], chunk_start);
                self.is_boundary_result()
            }
            PairResult::Regional => {
                // A count maintained while advancing makes the scan unnecessary.
                if let Some(ris_count) = self.ris_count {
                    return self.decision((ris_count % 2) == 0);
                }
                self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
                self.is_boundary_result()
            }
            PairResult::Emoji => {
                self.handle_emoji(&chunk[..offset_in_chunk], chunk_start, false);
                self.is_boundary_result()
            }
        }
    }
665
    #[inline]
    /// Find the next boundary after the current cursor position. Only a part of
    /// the string need be supplied. If the chunk is incomplete, then this
    /// method might return `GraphemeIncomplete::PreContext` or
    /// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
    /// call `provide_context` with the requested chunk, then retry. In the
    /// latter case, the caller should provide the chunk following the one
    /// given, then retry.
    ///
    /// On success the cursor is left at the returned boundary. `Ok(None)` is
    /// returned when the cursor is already at the end of the string.
    ///
    /// See `is_boundary` for expectations on the provided chunk.
    ///
    /// ```rust
    /// # use unicode_segmentation::GraphemeCursor;
    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
    /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
    /// ```
    ///
    /// And an example that uses partial strings:
    ///
    /// ```rust
    /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
    /// let s = "abcd";
    /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
    /// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
    /// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
    /// ```
    pub fn next_boundary(
        &mut self,
        chunk: &str,
        chunk_start: usize,
    ) -> Result<Option<usize>, GraphemeIncomplete> {
        if self.offset == self.len {
            return Ok(None);
        }
        // NOTE(review): if `offset` is below `chunk_start`, `saturating_sub`
        // yields 0 and iteration starts at the chunk's first codepoint instead
        // of reporting `InvalidOffset` — confirm callers never pass such a chunk.
        let mut iter = chunk[self.offset.saturating_sub(chunk_start)..].chars();
        let mut ch = match iter.next() {
            Some(ch) => ch,
            // The cursor is at the end of this chunk: nothing to step over.
            None => return Err(GraphemeIncomplete::NextChunk),
        };
        loop {
            if self.resuming {
                // A previous call already advanced the cursor and was then
                // suspended; only the category after the cursor may be missing.
                if self.cat_after.is_none() {
                    self.cat_after = Some(self.grapheme_category(ch));
                }
            } else {
                // Step over `ch` and shift the cached context along with the cursor.
                self.offset = self.offset.saturating_add(ch.len_utf8());
                self.state = GraphemeState::Unknown;
                self.cat_before = self.cat_after.take();
                if self.cat_before.is_none() {
                    self.cat_before = Some(self.grapheme_category(ch));
                }
                // Keep the running linker count in step with the codepoint
                // just passed: linkers add one, extends leave it alone, and
                // anything else resets it.
                if crate::tables::is_incb_linker(ch) {
                    self.incb_linker_count = Some(self.incb_linker_count.map_or(1, |c| c + 1));
                } else if !crate::tables::derived_property::InCB_Extend(ch) {
                    self.incb_linker_count = Some(0);
                }
                // Likewise for the run of regional indicators; an unknown
                // count stays unknown while the run continues.
                if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
                    self.ris_count = self.ris_count.map(|c| c + 1);
                } else {
                    self.ris_count = Some(0);
                }
                if let Some(next_ch) = iter.next() {
                    ch = next_ch;
                    self.cat_after = Some(self.grapheme_category(ch));
                } else if self.offset == self.len {
                    // End of the string is always a boundary.
                    self.decide(true);
                } else {
                    // The chunk ended before the string did: suspend here and
                    // ask for the following chunk.
                    self.resuming = true;
                    return Err(GraphemeIncomplete::NextChunk);
                }
            }
            // Stay in the resuming state across the check so that an early
            // return through `?` leaves the cursor ready to be resumed.
            self.resuming = true;
            if self.is_boundary(chunk, chunk_start)? {
                self.resuming = false;
                return Ok(Some(self.offset));
            }
            self.resuming = false;
        }
    }
752
753    /// Find the previous boundary after the current cursor position. Only a part
754    /// of the string need be supplied. If the chunk is incomplete, then this
755    /// method might return `GraphemeIncomplete::PreContext` or
756    /// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
757    /// call `provide_context` with the requested chunk, then retry. In the
758    /// latter case, the caller should provide the chunk preceding the one
759    /// given, then retry.
760    ///
761    /// See `is_boundary` for expectations on the provided chunk.
762    ///
763    /// ```rust
764    /// # use unicode_segmentation::GraphemeCursor;
765    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
766    /// let mut cursor = GraphemeCursor::new(12, flags.len(), false);
767    /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
768    /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
769    /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
770    /// ```
771    ///
772    /// And an example that uses partial strings (note the exact return is not
773    /// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
774    ///
775    /// ```rust
776    /// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
777    /// let s = "abcd";
778    /// let mut cursor = GraphemeCursor::new(4, s.len(), false);
779    /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
780    /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
781    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
782    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
783    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
784    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
785    /// ```
786    pub fn prev_boundary(
787        &mut self,
788        chunk: &str,
789        chunk_start: usize,
790    ) -> Result<Option<usize>, GraphemeIncomplete> {
791        if self.offset == 0 {
792            return Ok(None);
793        }
794        if self.offset == chunk_start {
795            return Err(GraphemeIncomplete::PrevChunk);
796        }
797        let mut iter = chunk[..self.offset.saturating_sub(chunk_start)]
798            .chars()
799            .rev();
800        let mut ch = iter.next().unwrap();
801        loop {
802            if self.offset == chunk_start {
803                self.resuming = true;
804                return Err(GraphemeIncomplete::PrevChunk);
805            }
806            if self.resuming {
807                self.cat_before = Some(self.grapheme_category(ch));
808            } else {
809                self.offset -= ch.len_utf8();
810                self.cat_after = self.cat_before.take();
811                self.state = GraphemeState::Unknown;
812                if let Some(incb_linker_count) = self.incb_linker_count {
813                    self.incb_linker_count =
814                        if incb_linker_count > 0 && crate::tables::is_incb_linker(ch) {
815                            Some(incb_linker_count - 1)
816                        } else if crate::tables::derived_property::InCB_Extend(ch) {
817                            Some(incb_linker_count)
818                        } else {
819                            None
820                        };
821                }
822                if let Some(ris_count) = self.ris_count {
823                    self.ris_count = if ris_count > 0 {
824                        Some(ris_count - 1)
825                    } else {
826                        None
827                    };
828                }
829                if let Some(prev_ch) = iter.next() {
830                    ch = prev_ch;
831                    self.cat_before = Some(self.grapheme_category(ch));
832                } else if self.offset == 0 {
833                    self.decide(true);
834                } else {
835                    self.resuming = true;
836                    self.cat_after = Some(self.grapheme_category(ch));
837                    return Err(GraphemeIncomplete::PrevChunk);
838                }
839            }
840            self.resuming = true;
841            if self.is_boundary(chunk, chunk_start)? {
842                self.resuming = false;
843                return Ok(Some(self.offset));
844            }
845            self.resuming = false;
846        }
847    }
848}
849
/// A cursor inside a run of regional indicators that starts in an earlier
/// chunk must ask for that earlier text before it can answer `is_boundary`.
#[test]
fn test_grapheme_cursor_ris_precontext() {
    // Six regional indicators of four bytes each.
    let flags = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
    let (head, tail) = flags.split_at(4);
    let mut cursor = GraphemeCursor::new(8, flags.len(), true);

    let first_attempt = cursor.is_boundary(tail, 4);
    assert_eq!(first_attempt, Err(GraphemeIncomplete::PreContext(4)));

    cursor.provide_context(head, 0);
    let second_attempt = cursor.is_boundary(tail, 4);
    assert_eq!(second_attempt, Ok(true));
}
861
/// With "\r\n" split between its two characters, a cursor at the start of the
/// second chunk needs the first chunk as context; once supplied, the position
/// is reported as not being a boundary.
#[test]
fn test_grapheme_cursor_chunk_start_require_precontext() {
    let crlf = "\r\n";
    let (cr, lf) = crlf.split_at(1);
    let mut cursor = GraphemeCursor::new(1, crlf.len(), true);

    let without_context = cursor.is_boundary(lf, 1);
    assert_eq!(without_context, Err(GraphemeIncomplete::PreContext(1)));

    cursor.provide_context(cr, 0);
    let with_context = cursor.is_boundary(lf, 1);
    assert_eq!(with_context, Ok(false));
}
873
/// Searching backwards from the middle of the second chunk first asks for the
/// previous chunk, then reports the boundary at the junction of the chunks.
#[test]
fn test_grapheme_cursor_prev_boundary() {
    let text = "abcd";
    let mut cursor = GraphemeCursor::new(3, text.len(), true);

    let from_tail = cursor.prev_boundary(&text[2..], 2);
    assert_eq!(from_tail, Err(GraphemeIncomplete::PrevChunk));

    let from_head = cursor.prev_boundary(&text[..2], 0);
    assert_eq!(from_head, Ok(Some(2)));
}
884
/// Searching backwards when the cursor sits exactly at the start of the given
/// chunk asks for the previous chunk, then finds the boundary inside it.
#[test]
fn test_grapheme_cursor_prev_boundary_chunk_start() {
    let text = "abcd";
    let mut cursor = GraphemeCursor::new(2, text.len(), true);

    let from_tail = cursor.prev_boundary(&text[2..], 2);
    assert_eq!(from_tail, Err(GraphemeIncomplete::PrevChunk));

    let from_head = cursor.prev_boundary(&text[..2], 0);
    assert_eq!(from_head, Ok(Some(1)));
}
895
/// An emoji ZWJ sequence whose joiner opens the second chunk must be reported
/// as a single cluster ending at byte 11, whether or not the cursor asks for
/// the first chunk as context along the way.
#[test]
fn test_grapheme_cursor_boundary_with_zwj_on_chunk_start() {
    use GraphemeIncomplete::{NextChunk, PreContext};

    let chunk0 = "👩"; // 4 bytes
    let chunk1 = "\u{200d}🔬"; // 3 bytes + 4 bytes

    let chunk1_start = chunk0.len();
    let full_len = chunk1_start + chunk1.len();

    let mut cur = GraphemeCursor::new(0, full_len, true);
    assert_eq!(cur.next_boundary(chunk0, 0), Err(NextChunk));

    match cur.next_boundary(chunk1, chunk1_start) {
        Err(PreContext(_)) => {
            // Context was requested: hand over the first chunk and retry.
            cur.provide_context(chunk0, 0);
            assert_eq!(cur.next_boundary(chunk1, chunk1_start), Ok(Some(11)));
        }
        Ok(boundary) => assert_eq!(boundary, Some(11)),
        Err(_) => unreachable!(),
    }
}
916
/// Two emoji in separate chunks with no joiner between them form two
/// clusters, with boundaries at bytes 4 and 8.
#[test]
fn test_grapheme_cursor_emoji_no_zwj() {
    use GraphemeIncomplete::{NextChunk, PreContext};

    let chunk0 = "🍒"; // 4 bytes
    let chunk1 = "🥑"; // 4 bytes
    let chunk1_start = chunk0.len();
    let full_len = chunk1_start + chunk1.len();

    let mut cursor = GraphemeCursor::new(0, full_len, true);

    // The first chunk runs out, then the second one needs the first as context.
    assert_eq!(cursor.next_boundary(chunk0, 0), Err(NextChunk));
    let needs_context = cursor.next_boundary(chunk1, chunk1_start);
    assert_eq!(needs_context, Err(PreContext(chunk1_start)));
    cursor.provide_context(chunk0, 0);

    // With context in place, the remaining boundaries are found in order.
    assert_eq!(cursor.next_boundary(chunk1, chunk1_start), Ok(Some(4)));
    assert_eq!(cursor.next_boundary(chunk1, chunk1_start), Ok(Some(8)));
    assert_eq!(cursor.next_boundary(chunk1, chunk1_start), Ok(None));
}
935
/// An emoji ZWJ sequence split just before the joiner is a single cluster
/// that ends at byte 11.
#[test]
fn test_grapheme_cursor_emoji_chunk_boundary_before_zwj() {
    use GraphemeIncomplete::{NextChunk, PreContext};

    let chunk0 = "🍒"; // 4 bytes
    let chunk1 = "\u{200d}🥑"; // 3 + 4 bytes
    let chunk1_start = chunk0.len();
    let full_len = chunk1_start + chunk1.len(); // 11

    let mut cursor = GraphemeCursor::new(0, full_len, true);

    // The first chunk runs out, then the second one needs the first as context.
    assert_eq!(cursor.next_boundary(chunk0, 0), Err(NextChunk));
    let needs_context = cursor.next_boundary(chunk1, chunk1_start);
    assert_eq!(needs_context, Err(PreContext(chunk1_start)));
    cursor.provide_context(chunk0, 0);

    assert_eq!(cursor.next_boundary(chunk1, chunk1_start), Ok(Some(11)));
    assert_eq!(cursor.next_boundary(chunk1, chunk1_start), Ok(None));
}
953
/// An emoji ZWJ sequence split just after the joiner is a single cluster
/// that ends at byte 11.
#[test]
fn test_grapheme_cursor_emoji_chunk_boundary_after_zwj() {
    use GraphemeIncomplete::{NextChunk, PreContext};

    let chunk0 = "🍒\u{200d}"; // 4 + 3 bytes
    let chunk1 = "🥑"; // 4 bytes
    let chunk1_start = chunk0.len();
    let full_len = chunk1_start + chunk1.len(); // 11

    let mut cursor = GraphemeCursor::new(0, full_len, true);

    // The first chunk runs out, then the second one needs the first as context.
    assert_eq!(cursor.next_boundary(chunk0, 0), Err(NextChunk));
    let needs_context = cursor.next_boundary(chunk1, chunk1_start);
    assert_eq!(needs_context, Err(PreContext(chunk1_start)));
    cursor.provide_context(chunk0, 0);

    assert_eq!(cursor.next_boundary(chunk1, chunk1_start), Ok(Some(11)));
    assert_eq!(cursor.next_boundary(chunk1, chunk1_start), Ok(None));
}
971
/// An emoji ZWJ sequence spread over three chunks (emoji, joiner, emoji) is a
/// single cluster ending at byte 11; the cursor asks for the two earlier
/// chunks as context one after the other, nearest first.
#[test]
fn test_grapheme_cursor_emoji_zwj_across_chunks() {
    use GraphemeIncomplete::{NextChunk, PreContext};

    let chunk0 = "🍒"; // 4 bytes
    let chunk1 = "\u{200d}"; // 3 bytes
    let chunk2 = "🥑"; // 4 bytes
    let chunk1_start = chunk0.len();
    let chunk2_start = chunk1_start + chunk1.len();
    let full_len = chunk2_start + chunk2.len(); // 11

    let mut cursor = GraphemeCursor::new(0, full_len, true);

    // Neither of the first two chunks is enough to find a boundary.
    assert_eq!(cursor.next_boundary(chunk0, 0), Err(NextChunk));
    assert_eq!(cursor.next_boundary(chunk1, chunk1_start), Err(NextChunk));

    // The last chunk first needs the joiner chunk as context...
    let wants_joiner = cursor.next_boundary(chunk2, chunk2_start);
    assert_eq!(wants_joiner, Err(PreContext(chunk2_start)));
    cursor.provide_context(chunk1, chunk1_start);

    // ...and then the leading emoji chunk as well.
    let wants_emoji = cursor.next_boundary(chunk2, chunk2_start);
    assert_eq!(wants_emoji, Err(PreContext(chunk1_start)));
    cursor.provide_context(chunk0, 0);

    assert_eq!(cursor.next_boundary(chunk2, chunk2_start), Ok(Some(11)));
    assert_eq!(cursor.next_boundary(chunk2, chunk2_start), Ok(None));
}