unicode_segmentation/grapheme.rs
1// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use crate::tables::grapheme::GraphemeCat;
12use core::cmp;
13
/// External iterator for grapheme clusters and byte offsets.
///
/// Each item is an `(offset, grapheme)` pair, where `offset` is the byte
/// position of the grapheme relative to the start of the string the iterator
/// was created from.
///
/// This struct is created by the [`grapheme_indices`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`grapheme_indices`]: trait.UnicodeSegmentation.html#tymethod.grapheme_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Debug, Clone)]
pub struct GraphemeIndices<'a> {
    /// Address of the first byte of the original string; it is subtracted from
    /// the address of each yielded slice to obtain that slice's byte offset.
    start_offset: usize,
    /// The underlying grapheme iterator that produces the string slices.
    iter: Graphemes<'a>,
}
26
27impl<'a> GraphemeIndices<'a> {
28 #[inline]
29 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
30 ///
31 /// ```rust
32 /// # use unicode_segmentation::UnicodeSegmentation;
33 /// let mut iter = "abc".grapheme_indices(true);
34 /// assert_eq!(iter.as_str(), "abc");
35 /// iter.next();
36 /// assert_eq!(iter.as_str(), "bc");
37 /// iter.next();
38 /// iter.next();
39 /// assert_eq!(iter.as_str(), "");
40 /// ```
41 pub fn as_str(&self) -> &'a str {
42 self.iter.as_str()
43 }
44}
45
46impl<'a> Iterator for GraphemeIndices<'a> {
47 type Item = (usize, &'a str);
48
49 #[inline]
50 fn next(&mut self) -> Option<(usize, &'a str)> {
51 self.iter
52 .next()
53 .map(|s| (s.as_ptr() as usize - self.start_offset, s))
54 }
55
56 #[inline]
57 fn size_hint(&self) -> (usize, Option<usize>) {
58 self.iter.size_hint()
59 }
60}
61
62impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
63 #[inline]
64 fn next_back(&mut self) -> Option<(usize, &'a str)> {
65 self.iter
66 .next_back()
67 .map(|s| (s.as_ptr() as usize - self.start_offset, s))
68 }
69}
70
/// External iterator for a string's
/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
///
/// This struct is created by the [`graphemes`] method on the [`UnicodeSegmentation`] trait. See its
/// documentation for more.
///
/// [`graphemes`]: trait.UnicodeSegmentation.html#tymethod.graphemes
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone, Debug)]
pub struct Graphemes<'a> {
    /// The complete string being segmented; yielded items are slices of it.
    string: &'a str,
    /// Cursor marking the front of the not-yet-yielded range; advanced by `next`.
    cursor: GraphemeCursor,
    /// Cursor marking the back of the not-yet-yielded range; moved towards the
    /// front by `next_back`.
    cursor_back: GraphemeCursor,
}
85
86impl<'a> Graphemes<'a> {
87 #[inline]
88 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
89 ///
90 /// ```rust
91 /// # use unicode_segmentation::UnicodeSegmentation;
92 /// let mut iter = "abc".graphemes(true);
93 /// assert_eq!(iter.as_str(), "abc");
94 /// iter.next();
95 /// assert_eq!(iter.as_str(), "bc");
96 /// iter.next();
97 /// iter.next();
98 /// assert_eq!(iter.as_str(), "");
99 /// ```
100 pub fn as_str(&self) -> &'a str {
101 &self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
102 }
103}
104
105impl<'a> Iterator for Graphemes<'a> {
106 type Item = &'a str;
107
108 #[inline]
109 fn size_hint(&self) -> (usize, Option<usize>) {
110 let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
111 (cmp::min(slen, 1), Some(slen))
112 }
113
114 #[inline]
115 fn next(&mut self) -> Option<&'a str> {
116 let start = self.cursor.cur_cursor();
117 if start == self.cursor_back.cur_cursor() {
118 return None;
119 }
120 let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap();
121 Some(&self.string[start..next])
122 }
123}
124
125impl<'a> DoubleEndedIterator for Graphemes<'a> {
126 #[inline]
127 fn next_back(&mut self) -> Option<&'a str> {
128 let end = self.cursor_back.cur_cursor();
129 if end == self.cursor.cur_cursor() {
130 return None;
131 }
132 let prev = self
133 .cursor_back
134 .prev_boundary(self.string, 0)
135 .unwrap()
136 .unwrap();
137 Some(&self.string[prev..end])
138 }
139}
140
141#[inline]
142pub fn new_graphemes(s: &str, is_extended: bool) -> Graphemes<'_> {
143 let len = s.len();
144 Graphemes {
145 string: s,
146 cursor: GraphemeCursor::new(0, len, is_extended),
147 cursor_back: GraphemeCursor::new(len, len, is_extended),
148 }
149}
150
151#[inline]
152pub fn new_grapheme_indices(s: &str, is_extended: bool) -> GraphemeIndices<'_> {
153 GraphemeIndices {
154 start_offset: s.as_ptr() as usize,
155 iter: new_graphemes(s, is_extended),
156 }
157}
158
// TODO: maybe unify with `PairResult`?
/// An enum describing information about a potential boundary.
#[derive(PartialEq, Eq, Clone, Debug)]
enum GraphemeState {
    /// No information is known.
    Unknown,
    /// It is known to not be a boundary.
    NotBreak,
    /// It is known to be a boundary.
    Break,
    /// The codepoint after it has Indic_Conjunct_Break=Consonant, so (in
    /// extended mode) it is *not* a boundary iff it is preceded by another
    /// InCB=Consonant followed by a sequence consisting of one or more
    /// InCB=Linker and zero or more InCB=Extend (in any order). (GB9c)
    InCbConsonant,
    /// The codepoint after is a Regional Indicator Symbol, so a boundary iff
    /// it is preceded by an even number of RIS codepoints. (GB12, GB13)
    Regional,
    /// The codepoint after is Extended_Pictographic,
    /// so whether it's a boundary depends on pre-context according to GB11.
    Emoji {
        /// Whether the ZWJ char has been seen already, so that only the
        /// "\p{Extended_Pictographic} Extend*" part of GB11 remains to be checked.
        seen_zwj: bool,
    },
}
185
/// Cursor-based segmenter for grapheme clusters.
///
/// This allows working with ropes and other datastructures where the string is not contiguous or
/// fully known at initialization time.
#[derive(Clone, Debug)]
pub struct GraphemeCursor {
    /// Current cursor position, as a byte offset into the whole string.
    offset: usize,
    /// Total length of the string, in bytes.
    len: usize,
    /// A config flag indicating whether this cursor computes legacy or extended
    /// grapheme cluster boundaries (enables GB9a, GB9b and GB9c if set).
    is_extended: bool,
    /// Information about the potential boundary at `offset`
    state: GraphemeState,
    /// Category of codepoint immediately preceding cursor, if known.
    cat_before: Option<GraphemeCat>,
    /// Category of codepoint immediately after cursor, if known.
    cat_after: Option<GraphemeCat>,
    /// If set, at least one more codepoint immediately preceding this offset
    /// is needed to resolve whether there's a boundary at `offset`.
    pre_context_offset: Option<usize>,
    /// The number of `InCB=Linker` codepoints preceding `offset`
    /// (potentially intermingled with `InCB=Extend`), if known.
    incb_linker_count: Option<usize>,
    /// The number of RIS codepoints preceding `offset`. If `pre_context_offset`
    /// is set, then counts the number of RIS between that and `offset`, otherwise
    /// is an accurate count relative to the string.
    ris_count: Option<usize>,
    /// Set if a call to `prev_boundary` or `next_boundary` was suspended due
    /// to needing more input.
    resuming: bool,
    /// Cached grapheme category and associated scalar value range, stored as
    /// `(first, last, category)` with both bounds inclusive.
    grapheme_cat_cache: (u32, u32, GraphemeCat),
}
221
/// An error return indicating that not enough content was available in the
/// provided chunk to satisfy the query, and that more content must be provided.
#[derive(PartialEq, Eq, Debug)]
pub enum GraphemeIncomplete {
    /// More pre-context is needed. The caller should call `provide_context`
    /// with a chunk ending at the offset given, then retry the query. This
    /// will only be returned if the `chunk_start` parameter is nonzero.
    PreContext(usize),

    /// When requesting `prev_boundary`, the cursor is moving past the beginning
    /// of the current chunk, so the chunk before that is requested. This will
    /// only be returned if the `chunk_start` parameter is nonzero.
    PrevChunk,

    /// When requesting `next_boundary`, the cursor is moving past the end of the
    /// current chunk, so the chunk after that is requested. This will only be
    /// returned if the chunk ends before the `len` parameter provided on
    /// creation of the cursor.
    NextChunk, // requesting chunk following the one given

    /// An error returned when the chunk given does not contain the cursor position.
    /// The cursor is left where it was; retry with a chunk that covers it.
    InvalidOffset,
}
245
/// An enum describing the result from lookup of a pair of categories.
#[derive(PartialEq, Eq)]
enum PairResult {
    /// definitely not a break
    NotBreak,
    /// definitely a break
    Break,
    /// a break iff not in extended mode
    Extended,
    /// a break unless in extended mode and preceded by
    /// a sequence of 0 or more InCB=Extend and one or more
    /// InCB = Linker (in any order),
    /// preceded by another InCB=Consonant
    InCbConsonant,
    /// a break iff preceded by an even number of RIS
    Regional,
    /// a break unless the ZWJ before it is preceded by an emoji base
    /// (Extended_Pictographic) and (Extend)*
    Emoji,
}
265
266#[inline]
267fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
268 use self::PairResult::*;
269 use crate::tables::grapheme::GraphemeCat::*;
270 match (before, after) {
271 (GC_CR, GC_LF) => NotBreak, // GB3
272 (GC_Control | GC_CR | GC_LF, _) => Break, // GB4
273 (_, GC_Control | GC_CR | GC_LF) => Break, // GB5
274 (GC_L, GC_L | GC_V | GC_LV | GC_LVT) => NotBreak, // GB6
275 (GC_LV | GC_V, GC_V | GC_T) => NotBreak, // GB7
276 (GC_LVT | GC_T, GC_T) => NotBreak, // GB8
277 (_, GC_Extend | GC_ZWJ) => NotBreak, // GB9
278 (_, GC_SpacingMark) => Extended, // GB9a
279 (GC_Prepend, _) => Extended, // GB9b
280 (_, GC_InCB_Consonant) => InCbConsonant, // GB9c
281 (GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
282 (GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
283 (_, _) => Break, // GB999
284 }
285}
286
287impl GraphemeCursor {
288 /// Create a new cursor. The string and initial offset are given at creation
289 /// time, but the contents of the string are not. The `is_extended` parameter
290 /// controls whether extended grapheme clusters are selected.
291 ///
292 /// The `offset` parameter must be on a codepoint boundary.
293 ///
294 /// ```rust
295 /// # use unicode_segmentation::GraphemeCursor;
296 /// let s = "हिन्दी";
297 /// let mut legacy = GraphemeCursor::new(0, s.len(), false);
298 /// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
299 /// let mut extended = GraphemeCursor::new(0, s.len(), true);
300 /// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
301 /// ```
302 pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
303 let state = if offset == 0 || offset == len {
304 GraphemeState::Break
305 } else {
306 GraphemeState::Unknown
307 };
308 GraphemeCursor {
309 offset,
310 len,
311 state,
312 is_extended,
313 cat_before: None,
314 cat_after: None,
315 pre_context_offset: None,
316 incb_linker_count: None,
317 ris_count: None,
318 resuming: false,
319 grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control),
320 }
321 }
322
323 fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
324 use crate::tables::grapheme as gr;
325 use crate::tables::grapheme::GraphemeCat::*;
326
327 if ch <= '\u{7e}' {
328 // Special-case optimization for ascii, except U+007F. This
329 // improves performance even for many primarily non-ascii texts,
330 // due to use of punctuation and white space characters from the
331 // ascii range.
332 if ch >= '\u{20}' {
333 GC_Any
334 } else if ch == '\n' {
335 GC_LF
336 } else if ch == '\r' {
337 GC_CR
338 } else {
339 GC_Control
340 }
341 } else {
342 // If this char isn't within the cached range, update the cache to the
343 // range that includes it.
344 if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
345 self.grapheme_cat_cache = gr::grapheme_category(ch);
346 }
347 self.grapheme_cat_cache.2
348 }
349 }
350
351 // Not sure I'm gonna keep this, the advantage over new() seems thin.
352
353 /// Set the cursor to a new location in the same string.
354 ///
355 /// ```rust
356 /// # use unicode_segmentation::GraphemeCursor;
357 /// let s = "abcd";
358 /// let mut cursor = GraphemeCursor::new(0, s.len(), false);
359 /// assert_eq!(cursor.cur_cursor(), 0);
360 /// cursor.set_cursor(2);
361 /// assert_eq!(cursor.cur_cursor(), 2);
362 /// ```
363 pub fn set_cursor(&mut self, offset: usize) {
364 if offset != self.offset {
365 self.offset = offset;
366 self.state = if offset == 0 || offset == self.len {
367 GraphemeState::Break
368 } else {
369 GraphemeState::Unknown
370 };
371 // reset state derived from text around cursor
372 self.cat_before = None;
373 self.cat_after = None;
374 self.incb_linker_count = None;
375 self.ris_count = None;
376 }
377 }
378
379 #[inline]
380 /// The current offset of the cursor. Equal to the last value provided to
381 /// `new()` or `set_cursor()`, or returned from `next_boundary()` or
382 /// `prev_boundary()`.
383 ///
384 /// ```rust
385 /// # use unicode_segmentation::GraphemeCursor;
386 /// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes.
387 /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
388 /// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
389 /// assert_eq!(cursor.cur_cursor(), 4);
390 /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
391 /// assert_eq!(cursor.cur_cursor(), 8);
392 /// ```
393 pub fn cur_cursor(&self) -> usize {
394 self.offset
395 }
396
/// Provide additional pre-context when it is needed to decide a boundary.
/// The end of the chunk must coincide with the value given in the
/// `GraphemeIncomplete::PreContext` request.
///
/// # Panics
///
/// Panics if no pre-context request is pending, or if the chunk does not end
/// at the requested offset. Also panics if `chunk` is empty while its end
/// coincides with the cursor position.
///
/// ```rust
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
/// // Not enough pre-context to decide if there's a boundary between the two flags.
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
/// // Provide one more Regional Indicator Symbol of pre-context
/// cursor.provide_context(&flags[4..8], 4);
/// // Still not enough context to decide.
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
/// // Provide additional requested context.
/// cursor.provide_context(&flags[0..4], 0);
/// // That's enough to decide (it always is when context goes to the start of the string)
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
/// ```
pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
    use crate::tables::grapheme as gr;
    // The chunk must end exactly where the pending request asked for context.
    assert!(chunk_start.saturating_add(chunk.len()) == self.pre_context_offset.unwrap());
    // The request is now satisfied; the handlers below re-arm it if they run
    // out of text again.
    self.pre_context_offset = None;
    // In extended mode a Prepend directly before the cursor settles the
    // question immediately, regardless of the pending state.
    if self.is_extended && chunk_start + chunk.len() == self.offset {
        let ch = chunk.chars().next_back().unwrap();
        if self.grapheme_category(ch) == gr::GC_Prepend {
            self.decide(false); // GB9b
            return;
        }
    }
    // Continue the backward scan that was waiting for this chunk.
    match self.state {
        GraphemeState::InCbConsonant => self.handle_incb_consonant(chunk, chunk_start),
        GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
        GraphemeState::Emoji { seen_zwj } => self.handle_emoji(chunk, chunk_start, seen_zwj),
        _ => {
            // No multi-codepoint rule is pending: only the category of the
            // codepoint immediately before the cursor was missing.
            if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
                let ch = chunk.chars().next_back().unwrap();
                self.cat_before = Some(self.grapheme_category(ch));
            }
        }
    }
}
439
440 #[inline]
441 fn decide(&mut self, is_break: bool) {
442 self.state = if is_break {
443 GraphemeState::Break
444 } else {
445 GraphemeState::NotBreak
446 };
447 }
448
449 #[inline]
450 fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
451 self.decide(is_break);
452 Ok(is_break)
453 }
454
455 #[inline]
456 fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
457 if self.state == GraphemeState::Break {
458 Ok(true)
459 } else if self.state == GraphemeState::NotBreak {
460 Ok(false)
461 } else if let Some(pre_context_offset) = self.pre_context_offset {
462 Err(GraphemeIncomplete::PreContext(pre_context_offset))
463 } else {
464 unreachable!("inconsistent state");
465 }
466 }
467
/// For handling rule GB9c:
///
/// There's an `InCB=Consonant` after this, and we need to look back
/// to verify whether there should be a break.
///
/// Seek backward to find an `InCB=Linker` preceded by an `InCB=Consonant`
/// (potentially separated by some number of `InCB=Linker` or `InCB=Extend`).
/// If we find the consonant in question, then there's no break; if we find a consonant
/// with no linker, or a non-linker non-extend non-consonant, or the start of text, there's a break;
/// otherwise we need more context.
///
/// `chunk` is the text ending where the backward scan should continue and
/// `chunk_start` is its offset in the whole string. On return the state is
/// either decided, or `pre_context_offset` is set to request the text before
/// `chunk_start`.
#[inline]
fn handle_incb_consonant(&mut self, chunk: &str, chunk_start: usize) {
    use crate::tables::{self, grapheme as gr};

    // GB9c only applies to extended grapheme clusters
    if !self.is_extended {
        self.decide(true);
        return;
    }

    // Continue from the linkers counted so far, if any.
    let mut incb_linker_count = self.incb_linker_count.unwrap_or(0);

    for ch in chunk.chars().rev() {
        if tables::is_incb_linker(ch) {
            // We found an InCB linker
            incb_linker_count += 1;
            self.incb_linker_count = Some(incb_linker_count);
        } else if tables::derived_property::InCB_Extend(ch) {
            // We ignore InCB extends, continue
        } else {
            // Prev character is neither linker nor extend, break suppressed iff it's InCB=Consonant
            let result = !(self.incb_linker_count.unwrap_or(0) > 0
                && self.grapheme_category(ch) == gr::GC_InCB_Consonant);
            self.decide(result);
            return;
        }
    }

    if chunk_start == 0 {
        // Start of text and we still haven't found a consonant, so break
        self.decide(true);
    } else {
        // We need more context
        self.pre_context_offset = Some(chunk_start);
        self.state = GraphemeState::InCbConsonant;
    }
}
515
516 #[inline]
517 fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
518 use crate::tables::grapheme as gr;
519 let mut ris_count = self.ris_count.unwrap_or(0);
520 for ch in chunk.chars().rev() {
521 if self.grapheme_category(ch) != gr::GC_Regional_Indicator {
522 self.ris_count = Some(ris_count);
523 self.decide(ris_count % 2 == 0);
524 return;
525 }
526 ris_count += 1;
527 }
528 self.ris_count = Some(ris_count);
529 if chunk_start == 0 {
530 self.decide(ris_count % 2 == 0);
531 } else {
532 self.pre_context_offset = Some(chunk_start);
533 self.state = GraphemeState::Regional;
534 }
535 }
536
537 #[inline]
538 fn handle_emoji(&mut self, chunk: &str, chunk_start: usize, mut seen_zwj: bool) {
539 // \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
540 use crate::tables::grapheme as gr;
541 let mut iter = chunk.chars().rev();
542 if !seen_zwj {
543 if let Some(ch) = iter.next() {
544 if self.grapheme_category(ch) != gr::GC_ZWJ {
545 self.decide(true);
546 return;
547 } else {
548 seen_zwj = true;
549 }
550 }
551 }
552 for ch in iter {
553 match self.grapheme_category(ch) {
554 gr::GC_Extend => (),
555 gr::GC_Extended_Pictographic => {
556 self.decide(false);
557 return;
558 }
559 _ => {
560 self.decide(true);
561 return;
562 }
563 }
564 }
565 if chunk_start == 0 {
566 self.decide(true);
567 } else {
568 self.pre_context_offset = Some(chunk_start);
569 self.state = GraphemeState::Emoji { seen_zwj };
570 }
571 }
572
#[inline]
/// Determine whether the current cursor location is a grapheme cluster boundary.
/// Only a part of the string need be supplied. If `chunk_start` is nonzero or
/// the length of `chunk` is not equal to `len` on creation, then this method
/// may return `GraphemeIncomplete::PreContext`. The caller should then
/// call `provide_context` with the requested chunk, then retry calling this
/// method.
///
/// For partial chunks, if the cursor is not at the beginning or end of the
/// string, the chunk should contain at least the codepoint following the cursor.
/// If the string is nonempty, the chunk must be nonempty.
///
/// All calls should have consistent chunk contents (ie, if a chunk provides
/// content for a given slice, all further chunks covering that slice must have
/// the same content for it).
///
/// # Errors
///
/// Returns `GraphemeIncomplete::InvalidOffset` if the chunk does not cover the
/// cursor position, and `GraphemeIncomplete::PreContext` if text before the
/// chunk is needed to decide.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
/// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
/// cursor.set_cursor(12);
/// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
/// ```
pub fn is_boundary(
    &mut self,
    chunk: &str,
    chunk_start: usize,
) -> Result<bool, GraphemeIncomplete> {
    use crate::tables::grapheme as gr;
    // An already settled answer is returned without looking at the chunk.
    if self.state == GraphemeState::Break {
        return Ok(true);
    }
    if self.state == GraphemeState::NotBreak {
        return Ok(false);
    }
    // A cursor outside `[chunk_start, chunk end)` is rejected, unless it does
    // not lie beyond the chunk end and the category after it is already cached.
    if (self.offset < chunk_start || self.offset >= chunk_start.saturating_add(chunk.len()))
        && (self.offset > chunk_start.saturating_add(chunk.len()) || self.cat_after.is_none())
    {
        return Err(GraphemeIncomplete::InvalidOffset);
    }
    // A previous request for pre-context is still outstanding; repeat it.
    if let Some(pre_context_offset) = self.pre_context_offset {
        return Err(GraphemeIncomplete::PreContext(pre_context_offset));
    }
    let offset_in_chunk = self.offset.saturating_sub(chunk_start);
    if self.cat_after.is_none() {
        let ch = chunk[offset_in_chunk..].chars().next().unwrap();
        self.cat_after = Some(self.grapheme_category(ch));
    }
    if self.offset == chunk_start {
        // The cursor sits at the very start of the chunk, so nothing before it
        // is visible. Categories whose rules look further back always request
        // pre-context; for the rest a cached `cat_before` is sufficient.
        let mut need_pre_context = true;
        match self.cat_after.unwrap() {
            gr::GC_InCB_Consonant => self.state = GraphemeState::InCbConsonant,
            gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
            gr::GC_Extended_Pictographic => {
                self.state = GraphemeState::Emoji { seen_zwj: false }
            }
            _ => need_pre_context = self.cat_before.is_none(),
        }
        if need_pre_context {
            self.pre_context_offset = Some(chunk_start);
            return Err(GraphemeIncomplete::PreContext(chunk_start));
        }
    }
    if self.cat_before.is_none() {
        let ch = chunk[..offset_in_chunk].chars().next_back().unwrap();
        self.cat_before = Some(self.grapheme_category(ch));
    }
    // Apply the pairwise rules; the context-dependent outcomes scan backward
    // through the part of the chunk that precedes the cursor.
    match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
        PairResult::NotBreak => self.decision(false),
        PairResult::Break => self.decision(true),
        PairResult::Extended => {
            let is_extended = self.is_extended;
            self.decision(!is_extended)
        }
        PairResult::InCbConsonant => {
            self.handle_incb_consonant(&chunk[..offset_in_chunk], chunk_start);
            self.is_boundary_result()
        }
        PairResult::Regional => {
            // A count maintained while stepping makes the backward scan unnecessary.
            if let Some(ris_count) = self.ris_count {
                return self.decision((ris_count % 2) == 0);
            }
            self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
            self.is_boundary_result()
        }
        PairResult::Emoji => {
            self.handle_emoji(&chunk[..offset_in_chunk], chunk_start, false);
            self.is_boundary_result()
        }
    }
}
665
#[inline]
/// Find the next boundary after the current cursor position. Only a part of
/// the string need be supplied. If the chunk is incomplete, then this
/// method might return `GraphemeIncomplete::PreContext` or
/// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
/// call `provide_context` with the requested chunk, then retry. In the
/// latter case, the caller should provide the chunk following the one
/// given, then retry.
///
/// Returns `Ok(None)` if the cursor is already at the end of the string.
/// On success the cursor is left at the returned offset.
///
/// See `is_boundary` for expectations on the provided chunk.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
/// ```
///
/// And an example that uses partial strings:
///
/// ```rust
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
/// let s = "abcd";
/// let mut cursor = GraphemeCursor::new(0, s.len(), false);
/// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
/// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
/// ```
pub fn next_boundary(
    &mut self,
    chunk: &str,
    chunk_start: usize,
) -> Result<Option<usize>, GraphemeIncomplete> {
    if self.offset == self.len {
        return Ok(None);
    }
    let mut iter = chunk[self.offset.saturating_sub(chunk_start)..].chars();
    let mut ch = match iter.next() {
        Some(ch) => ch,
        None => return Err(GraphemeIncomplete::NextChunk),
    };
    loop {
        if self.resuming {
            // A previous call already stepped onto this offset and was
            // suspended; only the category after the cursor may be missing.
            if self.cat_after.is_none() {
                self.cat_after = Some(self.grapheme_category(ch));
            }
        } else {
            // Step over `ch`: what used to be after the cursor is now before it.
            self.offset = self.offset.saturating_add(ch.len_utf8());
            self.state = GraphemeState::Unknown;
            self.cat_before = self.cat_after.take();
            if self.cat_before.is_none() {
                self.cat_before = Some(self.grapheme_category(ch));
            }
            // Keep the running linker and RIS counts in step with the cursor.
            if crate::tables::is_incb_linker(ch) {
                self.incb_linker_count = Some(self.incb_linker_count.map_or(1, |c| c + 1));
            } else if !crate::tables::derived_property::InCB_Extend(ch) {
                self.incb_linker_count = Some(0);
            }
            if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
                self.ris_count = self.ris_count.map(|c| c + 1);
            } else {
                self.ris_count = Some(0);
            }
            if let Some(next_ch) = iter.next() {
                ch = next_ch;
                self.cat_after = Some(self.grapheme_category(ch));
            } else if self.offset == self.len {
                // Reached the end of the string, which is always a boundary.
                self.decide(true);
            } else {
                // The chunk ran out before the string did.
                self.resuming = true;
                return Err(GraphemeIncomplete::NextChunk);
            }
        }
        // Stay in resuming mode while querying, so that an error propagated by
        // `?` leaves the cursor ready to continue from this offset on retry.
        self.resuming = true;
        if self.is_boundary(chunk, chunk_start)? {
            self.resuming = false;
            return Ok(Some(self.offset));
        }
        self.resuming = false;
    }
}
752
/// Find the previous boundary before the current cursor position. Only a part
/// of the string need be supplied. If the chunk is incomplete, then this
/// method might return `GraphemeIncomplete::PreContext` or
/// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
/// call `provide_context` with the requested chunk, then retry. In the
/// latter case, the caller should provide the chunk preceding the one
/// given, then retry.
///
/// Returns `Ok(None)` if the cursor is already at the start of the string.
/// On success the cursor is left at the returned offset.
///
/// See `is_boundary` for expectations on the provided chunk.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(12, flags.len(), false);
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
/// ```
///
/// And an example that uses partial strings (note the exact return is not
/// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
///
/// ```rust
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
/// let s = "abcd";
/// let mut cursor = GraphemeCursor::new(4, s.len(), false);
/// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
/// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
/// ```
pub fn prev_boundary(
    &mut self,
    chunk: &str,
    chunk_start: usize,
) -> Result<Option<usize>, GraphemeIncomplete> {
    if self.offset == 0 {
        return Ok(None);
    }
    // Nothing of this chunk lies before the cursor.
    if self.offset == chunk_start {
        return Err(GraphemeIncomplete::PrevChunk);
    }
    let mut iter = chunk[..self.offset.saturating_sub(chunk_start)]
        .chars()
        .rev();
    let mut ch = iter.next().unwrap();
    loop {
        if self.offset == chunk_start {
            self.resuming = true;
            return Err(GraphemeIncomplete::PrevChunk);
        }
        if self.resuming {
            // A previous call already stepped onto this offset and was
            // suspended; refresh the category before the cursor from the
            // chunk supplied now.
            self.cat_before = Some(self.grapheme_category(ch));
        } else {
            // Step back over `ch`: what used to be before the cursor is now after it.
            self.offset -= ch.len_utf8();
            self.cat_after = self.cat_before.take();
            self.state = GraphemeState::Unknown;
            // Keep the running linker and RIS counts in step with the cursor,
            // dropping them once they can no longer be derived.
            if let Some(incb_linker_count) = self.incb_linker_count {
                self.incb_linker_count =
                    if incb_linker_count > 0 && crate::tables::is_incb_linker(ch) {
                        Some(incb_linker_count - 1)
                    } else if crate::tables::derived_property::InCB_Extend(ch) {
                        Some(incb_linker_count)
                    } else {
                        None
                    };
            }
            if let Some(ris_count) = self.ris_count {
                self.ris_count = if ris_count > 0 {
                    Some(ris_count - 1)
                } else {
                    None
                };
            }
            if let Some(prev_ch) = iter.next() {
                ch = prev_ch;
                self.cat_before = Some(self.grapheme_category(ch));
            } else if self.offset == 0 {
                // Reached the start of the string, which is always a boundary.
                self.decide(true);
            } else {
                // The chunk ran out before the start of the string was reached.
                self.resuming = true;
                self.cat_after = Some(self.grapheme_category(ch));
                return Err(GraphemeIncomplete::PrevChunk);
            }
        }
        // Stay in resuming mode while querying, so that an error propagated by
        // `?` leaves the cursor ready to continue from this offset on retry.
        self.resuming = true;
        if self.is_boundary(chunk, chunk_start)? {
            self.resuming = false;
            return Ok(Some(self.offset));
        }
        self.resuming = false;
    }
}
848}
849
// Six regional indicators (4 bytes each) with the cursor after the second one.
// The chunk starts one indicator before the cursor, so the length of the
// indicator run is unknown until the preceding chunk is supplied as context.
#[test]
fn test_grapheme_cursor_ris_precontext() {
    let s = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
    let mut c = GraphemeCursor::new(8, s.len(), true);
    assert_eq!(
        c.is_boundary(&s[4..], 4),
        Err(GraphemeIncomplete::PreContext(4))
    );
    c.provide_context(&s[..4], 0);
    assert_eq!(c.is_boundary(&s[4..], 4), Ok(true));
}
861
// The cursor sits between CR and LF at the very start of the chunk, so the
// codepoint before it has to be requested; once provided, CR LF is not split.
#[test]
fn test_grapheme_cursor_chunk_start_require_precontext() {
    let s = "\r\n";
    let mut c = GraphemeCursor::new(1, s.len(), true);
    assert_eq!(
        c.is_boundary(&s[1..], 1),
        Err(GraphemeIncomplete::PreContext(1))
    );
    c.provide_context(&s[..1], 0);
    assert_eq!(c.is_boundary(&s[1..], 1), Ok(false));
}
873
// Stepping back from offset 3 inside a chunk that starts at 2 lands on the
// chunk start, so the previous chunk is requested; the retry with that chunk
// then reports the boundary at 2.
#[test]
fn test_grapheme_cursor_prev_boundary() {
    let s = "abcd";
    let mut c = GraphemeCursor::new(3, s.len(), true);
    assert_eq!(
        c.prev_boundary(&s[2..], 2),
        Err(GraphemeIncomplete::PrevChunk)
    );
    assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(2)));
}
884
// The cursor starts exactly at the chunk start, so nothing precedes it in the
// chunk and the previous chunk is requested before any step is taken; the
// retry with that chunk then reports the boundary at 1.
#[test]
fn test_grapheme_cursor_prev_boundary_chunk_start() {
    let s = "abcd";
    let mut c = GraphemeCursor::new(2, s.len(), true);
    assert_eq!(
        c.prev_boundary(&s[2..], 2),
        Err(GraphemeIncomplete::PrevChunk)
    );
    assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(1)));
}
895
// An emoji ZWJ sequence split so that the ZWJ is the first codepoint of the
// second chunk. Whether or not pre-context is requested along the way, the
// whole 11-byte sequence must come out as a single cluster.
#[test]
fn test_grapheme_cursor_boundary_with_zwj_on_chunk_start() {
    use GraphemeIncomplete::*;

    let chunk0 = "👩"; // 4 bytes
    let chunk1 = "\u{200d}🔬"; // 3 bytes + 4 bytes

    let full_len = chunk0.len() + chunk1.len();

    let mut cur = GraphemeCursor::new(0, full_len, true);
    assert_eq!(cur.next_boundary(chunk0, 0), Err(NextChunk));
    match cur.next_boundary(chunk1, chunk0.len()) {
        Ok(res) => assert_eq!(res, Some(11)),
        Err(PreContext(_)) => {
            cur.provide_context(chunk0, 0);
            assert_eq!(cur.next_boundary(chunk1, chunk0.len()), Ok(Some(11)));
        }
        _ => unreachable!(),
    }
}
916
// Two emoji in separate chunks with no ZWJ between them: after the requested
// pre-context is supplied, each emoji is reported as its own cluster.
#[test]
fn test_grapheme_cursor_emoji_no_zwj() {
    use GraphemeIncomplete::*;
    let chunk0 = "🍒"; // 4 bytes
    let chunk1 = "🥑"; // 4 bytes
    let full_len = chunk0.len() + chunk1.len();

    let mut c = GraphemeCursor::new(0, full_len, true);
    assert_eq!(c.next_boundary(chunk0, 0), Err(NextChunk));
    assert_eq!(
        c.next_boundary(chunk1, chunk0.len()),
        Err(PreContext(chunk0.len()))
    );
    c.provide_context(chunk0, 0);
    assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(Some(4)));
    assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(Some(8)));
    assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(None));
}
935
// An emoji ZWJ sequence whose chunk boundary falls just before the ZWJ: after
// the requested pre-context is supplied, the whole sequence is one cluster.
#[test]
fn test_grapheme_cursor_emoji_chunk_boundary_before_zwj() {
    use GraphemeIncomplete::*;
    let chunk0 = "🍒"; // 4 bytes
    let chunk1 = "\u{200d}🥑"; // 3 + 4 bytes
    let full_len = chunk0.len() + chunk1.len(); // 11

    let mut c = GraphemeCursor::new(0, full_len, true);
    assert_eq!(c.next_boundary(chunk0, 0), Err(NextChunk));
    assert_eq!(
        c.next_boundary(chunk1, chunk0.len()),
        Err(PreContext(chunk0.len()))
    );
    c.provide_context(chunk0, 0);
    assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(Some(11)));
    assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(None));
}
953
// An emoji ZWJ sequence whose chunk boundary falls just after the ZWJ: after
// the requested pre-context is supplied, the whole sequence is one cluster.
#[test]
fn test_grapheme_cursor_emoji_chunk_boundary_after_zwj() {
    use GraphemeIncomplete::*;
    let chunk0 = "🍒\u{200d}"; // 4 + 3 bytes
    let chunk1 = "🥑"; // 4 bytes
    let full_len = chunk0.len() + chunk1.len(); // 11

    let mut c = GraphemeCursor::new(0, full_len, true);
    assert_eq!(c.next_boundary(chunk0, 0), Err(NextChunk));
    assert_eq!(
        c.next_boundary(chunk1, chunk0.len()),
        Err(PreContext(chunk0.len()))
    );
    c.provide_context(chunk0, 0);
    assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(Some(11)));
    assert_eq!(c.next_boundary(chunk1, chunk0.len()), Ok(None));
}
971
// An emoji ZWJ sequence with each codepoint in its own chunk: pre-context is
// requested twice (first the ZWJ chunk, then the emoji before it) before the
// whole sequence is reported as one cluster.
#[test]
fn test_grapheme_cursor_emoji_zwj_across_chunks() {
    use GraphemeIncomplete::*;
    let chunk0 = "🍒"; // 4 bytes
    let chunk1 = "\u{200d}"; // 3 bytes
    let chunk2 = "🥑"; // 4 bytes
    let full_len = chunk0.len() + chunk1.len() + chunk2.len(); // 11
    let chunk2_start = chunk0.len() + chunk1.len();

    let mut c = GraphemeCursor::new(0, full_len, true);
    assert_eq!(c.next_boundary(chunk0, 0), Err(NextChunk));
    assert_eq!(c.next_boundary(chunk1, chunk0.len()), Err(NextChunk));
    assert_eq!(
        c.next_boundary(chunk2, chunk2_start),
        Err(PreContext(chunk2_start))
    );
    c.provide_context(chunk1, chunk0.len());
    assert_eq!(
        c.next_boundary(chunk2, chunk2_start),
        Err(PreContext(chunk0.len()))
    );
    c.provide_context(chunk0, 0);
    assert_eq!(c.next_boundary(chunk2, chunk2_start), Ok(Some(11)));
    assert_eq!(c.next_boundary(chunk2, chunk2_start), Ok(None));
}