unicode_segmentation/
word.rs

1// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use core::cmp;
12use core::iter::Filter;
13
14use crate::tables::word::WordCat;
15
/// An iterator over the substrings of a string which, after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See
/// its documentation for more.
///
/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Debug)]
pub struct UnicodeWords<'a> {
    /// Backing iterator. The `Iterator` and `DoubleEndedIterator` impls forward every
    /// call to whichever `WordsIter` variant (`Ascii` or `Unicode`) is stored here.
    inner: WordsIter<'a>,
}
32
33impl<'a> Iterator for UnicodeWords<'a> {
34    type Item = &'a str;
35    #[inline]
36    fn next(&mut self) -> Option<Self::Item> {
37        match &mut self.inner {
38            WordsIter::Ascii(i) => i.next(),
39            WordsIter::Unicode(i) => i.next(),
40        }
41    }
42    #[inline]
43    fn size_hint(&self) -> (usize, Option<usize>) {
44        match &self.inner {
45            WordsIter::Ascii(i) => i.size_hint(),
46            WordsIter::Unicode(i) => i.size_hint(),
47        }
48    }
49}
50impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
51    #[inline]
52    fn next_back(&mut self) -> Option<Self::Item> {
53        match &mut self.inner {
54            WordsIter::Ascii(i) => i.next_back(),
55            WordsIter::Unicode(i) => i.next_back(),
56        }
57    }
58}
59
/// An iterator over the substrings of a string which, after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
/// This iterator also provides the byte offsets for each substring.
///
/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
/// its documentation for more.
///
/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Debug)]
pub struct UnicodeWordIndices<'a> {
    /// Backing iterator. The `Iterator` and `DoubleEndedIterator` impls forward every
    /// call to whichever `IndicesIter` variant (`Ascii` or `Unicode`) is stored here.
    inner: IndicesIter<'a>,
}
77
78impl<'a> Iterator for UnicodeWordIndices<'a> {
79    type Item = (usize, &'a str);
80    #[inline]
81    fn next(&mut self) -> Option<Self::Item> {
82        match &mut self.inner {
83            IndicesIter::Ascii(i) => i.next(),
84            IndicesIter::Unicode(i) => i.next(),
85        }
86    }
87    #[inline]
88    fn size_hint(&self) -> (usize, Option<usize>) {
89        match &self.inner {
90            IndicesIter::Ascii(i) => i.size_hint(),
91            IndicesIter::Unicode(i) => i.size_hint(),
92        }
93    }
94}
95impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
96    #[inline]
97    fn next_back(&mut self) -> Option<Self::Item> {
98        match &mut self.inner {
99            IndicesIter::Ascii(i) => i.next_back(),
100            IndicesIter::Unicode(i) => i.next_back(),
101        }
102    }
103}
104
/// External iterator for a string's
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Debug, Clone)]
pub struct UWordBounds<'a> {
    /// The part of the input not yet yielded; `next` trims it from the front and
    /// `next_back` trims it from the back.
    string: &'a str,
    /// Word category cached by forward iteration so the next forward lookup can reuse
    /// it instead of consulting the category table again.
    cat: Option<WordCat>,
    /// Word category cached by backward iteration, reused by the next backward lookup.
    catb: Option<WordCat>,
}
119
/// External iterator for word boundaries and byte offsets.
///
/// This struct is created by the [`split_word_bound_indices`] method on the
/// [`UnicodeSegmentation`] trait. See its documentation for more.
///
/// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Debug, Clone)]
pub struct UWordBoundIndices<'a> {
    /// Value subtracted from each segment's start address to produce its byte offset.
    // NOTE(review): presumably the address of the original string's first byte — the
    // constructor is not in this part of the file; confirm there.
    start_offset: usize,
    /// The underlying boundary iterator that produces the segments.
    iter: UWordBounds<'a>,
}
132
133impl<'a> UWordBoundIndices<'a> {
134    #[inline]
135    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
136    ///
137    /// ```rust
138    /// # use unicode_segmentation::UnicodeSegmentation;
139    /// let mut iter = "Hello world".split_word_bound_indices();
140    /// assert_eq!(iter.as_str(), "Hello world");
141    /// iter.next();
142    /// assert_eq!(iter.as_str(), " world");
143    /// iter.next();
144    /// assert_eq!(iter.as_str(), "world");
145    /// ```
146    pub fn as_str(&self) -> &'a str {
147        self.iter.as_str()
148    }
149}
150
151impl<'a> Iterator for UWordBoundIndices<'a> {
152    type Item = (usize, &'a str);
153
154    #[inline]
155    fn next(&mut self) -> Option<(usize, &'a str)> {
156        self.iter
157            .next()
158            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
159    }
160
161    #[inline]
162    fn size_hint(&self) -> (usize, Option<usize>) {
163        self.iter.size_hint()
164    }
165}
166
167impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
168    #[inline]
169    fn next_back(&mut self) -> Option<(usize, &'a str)> {
170        self.iter
171            .next_back()
172            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
173    }
174}
175
// State machine for the word boundary rules. Both the forward (`next`) and the
// backward (`next_back`) scans over `UWordBounds` begin in `Start` and move between
// these states as they classify each character.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum UWordBoundsState {
    // Initial state of every scan; nothing has been classified yet.
    Start,
    // Entered on an ALetter character.
    Letter,
    // Entered on a Hebrew_Letter character.
    HLetter,
    // Entered on a Numeric character.
    Numeric,
    // Entered on a Katakana character.
    Katakana,
    // Entered on an ExtendNumLet character.
    ExtendNumLet,
    // Inside a run of Regional_Indicator characters; the payload tracks pairing.
    Regional(RegionalState),
    // Handling Format/Extend/ZWJ characters or waiting for a required follow-up
    // character; the payload says what is acceptable next.
    FormatExtend(FormatExtendType),
    // Forward scan: the segment started with a ZWJ. Backward scan: an
    // Extended_Pictographic character was seen and a ZWJ to its left would join (WB3c).
    Zwj,
    // Forward scan: an Extended_Pictographic character was joined after a ZWJ (WB3c).
    Emoji,
    // Entered on a WSegSpace character (WB3d).
    WSegSpace,
}
191
// Subtypes for the FormatExtend state in UWordBoundsState. Each one records what the
// scan is willing to accept next.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum FormatExtendType {
    // In the code visible here this is only constructed by the backward scan, where it
    // is handled exactly like `Start`.
    AcceptAny,
    // A run of Format/Extend/ZWJ characters is being consumed.
    AcceptNone,
    // A MidLetter / MidNumLet / Single_Quote was seen next to a letter; the segment
    // only continues across it if a letter follows, otherwise the scan backs up.
    RequireLetter,
    // A Double_Quote was seen next to a Hebrew_Letter; a Hebrew_Letter must follow,
    // otherwise the scan backs up.
    RequireHLetter,
    // A Single_Quote was seen that may attach to a Hebrew_Letter (WB7a).
    AcceptQLetter,
    // A MidNum / MidNumLet / Single_Quote was seen next to a Numeric; a Numeric must
    // follow, otherwise the scan backs up.
    RequireNumeric,
}
202
// Pairing progress within a run of Regional_Indicator characters (rule WB13c).
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum RegionalState {
    // One Regional_Indicator seen; one more may join it.
    Half,
    // A complete pair has been seen; a further Regional_Indicator starts a new segment.
    Full,
    // Backward scan only: pairing is not known until the preceding
    // Regional_Indicator characters have been counted.
    Unknown,
}
209
210fn is_emoji(ch: char) -> bool {
211    use crate::tables::emoji;
212    emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
213}
214
impl<'a> Iterator for UWordBounds<'a> {
    type Item = &'a str;

    /// Bounds on the number of remaining segments: none when the remaining string is
    /// empty, otherwise at least one and at most one per remaining byte.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let slen = self.string.len();
        (cmp::min(slen, 1), Some(slen))
    }

    /// Splits the next segment off the front of the remaining string and returns it,
    /// or returns `None` once the remaining string is empty.
    ///
    /// The scan walks the characters from the front, driving the `UWordBoundsState`
    /// machine until a boundary is found, then slices the string at that point.
    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        use self::FormatExtendType::*;
        use self::UWordBoundsState::*;
        use crate::tables::word as wd;
        if self.string.is_empty() {
            return None;
        }

        // Whether the character at `idx` belongs to the segment being returned.
        let mut take_curr = true;
        // Whether `cat` should be cached in `self.cat` for the next call.
        let mut take_cat = true;
        // Byte index of the character currently being examined.
        let mut idx = 0;
        // Position and category to fall back to when a Require* state is not satisfied.
        let mut saveidx = 0;
        let mut state = Start;
        let mut cat = wd::WC_Any;
        let mut savecat = wd::WC_Any;

        // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
        let mut skipped_format_extend = false;
        for (curr, ch) in self.string.char_indices() {
            idx = curr;
            // Whether or not the previous category was ZWJ
            // ZWJs get collapsed, so this handles precedence of WB3c over WB4
            let prev_zwj = cat == wd::WC_ZWJ;
            // if there's a category cached, grab it
            cat = match self.cat {
                None => wd::word_category(ch).2,
                _ => self.cat.take().unwrap(),
            };
            take_cat = true;

            // handle rule WB4
            // just skip all format, extend, and zwj chars
            // note that Start is a special case: if there's a bunch of Format | Extend
            // characters at the beginning of a block of text, dump them out as one unit.
            //
            // (This is not obvious from the wording of UAX#29, but if you look at the
            // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
            // then the "correct" interpretation of WB4 becomes apparent.)
            if state != Start {
                match cat {
                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
                        skipped_format_extend = true;
                        continue;
                    }
                    _ => {}
                }
            }

            // rule WB3c
            // WB4 makes all ZWJs collapse into the previous state
            // but you can still be in a Zwj state if you started with Zwj
            //
            // This means that an EP + Zwj will collapse into EP, which is wrong,
            // since EP+EP is not a boundary but EP+ZWJ+EP is
            //
            // Thus, we separately keep track of whether or not the last character
            // was a ZWJ. This is an additional bit of state tracked outside of the
            // state enum; the state enum represents the last non-zwj state encountered.
            // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
            // however we are in the previous state for the purposes of all other rules.
            if prev_zwj && is_emoji(ch) {
                state = Emoji;
                continue;
            }
            // Don't use `continue` in this match without updating `cat`
            state = match state {
                Start if cat == wd::WC_CR => {
                    // Step `idx` onto a directly following LF so that CR LF stays together.
                    idx += match self.get_next_cat(idx) {
                        Some(wd::WC_LF) => 1, // rule WB3
                        _ => 0,
                    };
                    break; // rule WB3a
                }
                Start => match cat {
                    wd::WC_ALetter => Letter,            // rule WB5, WB6, WB9, WB13a
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
                    wd::WC_Numeric => Numeric,           // rule WB8, WB10, WB12, WB13a
                    wd::WC_Katakana => Katakana,         // rule WB13, WB13a
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
                    wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
                    wd::WC_LF | wd::WC_Newline => break, // rule WB3a
                    wd::WC_ZWJ => Zwj,                   // rule WB3c
                    wd::WC_WSegSpace => WSegSpace,       // rule WB3d
                    _ => {
                        if let Some(ncat) = self.get_next_cat(idx) {
                            // rule WB4
                            if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ
                            {
                                // Cache the next category so the following iteration
                                // does not look it up a second time.
                                state = FormatExtend(AcceptNone);
                                self.cat = Some(ncat);
                                continue;
                            }
                        }
                        break; // rule WB999
                    }
                },
                WSegSpace => match cat {
                    wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Zwj => {
                    // We already handle WB3c above.
                    take_curr = false;
                    break;
                }
                Letter | HLetter => match cat {
                    wd::WC_ALetter => Letter,            // rule WB5
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5
                    wd::WC_Numeric => Numeric,           // rule WB9
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_Double_Quote if state == HLetter => {
                        savecat = cat;
                        saveidx = idx;
                        FormatExtend(RequireHLetter) // rule WB7b
                    }
                    wd::WC_Single_Quote if state == HLetter => {
                        FormatExtend(AcceptQLetter) // rule WB7a
                    }
                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        savecat = cat;
                        saveidx = idx;
                        FormatExtend(RequireLetter) // rule WB6
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Numeric => match cat {
                    wd::WC_Numeric => Numeric,           // rule WB8
                    wd::WC_ALetter => Letter,            // rule WB10
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB10
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        savecat = cat;
                        saveidx = idx;
                        FormatExtend(RequireNumeric) // rule WB12
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Katakana => match cat {
                    wd::WC_Katakana => Katakana,         // rule WB13
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                ExtendNumLet => match cat {
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_ALetter => Letter,            // rule WB13b
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB13b
                    wd::WC_Numeric => Numeric,           // rule WB13b
                    wd::WC_Katakana => Katakana,         // rule WB13b
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional(RegionalState::Full) => {
                    // if it reaches here we've gone too far,
                    // a full flag can only compose with ZWJ/Extend/Format
                    // following it (those are skipped by the WB4 check above).
                    take_curr = false;
                    break;
                }
                Regional(RegionalState::Half) => match cat {
                    wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional(_) => {
                    unreachable!("RegionalState::Unknown should not occur on forward iteration")
                }
                Emoji => {
                    // We already handle WB3c above. If you've reached this point, the emoji sequence is over.
                    take_curr = false;
                    break;
                }
                FormatExtend(t) => match t {
                    // handle FormatExtends depending on what type
                    RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
                    RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
                    RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
                    AcceptNone | AcceptQLetter => {
                        take_curr = false; // emit all the Format|Extend characters
                        take_cat = false;
                        break;
                    }
                    _ => break, // rewind (in if statement below)
                },
            }
        }

        if let FormatExtend(t) = state {
            // we were looking for something and didn't find it; we have to back up
            if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
                idx = saveidx;
                cat = savecat;
                take_curr = false;
            }
        }

        // Decide where to cut and what to cache for the next call: when the current
        // character is taken, move `idx` past it and cache nothing; otherwise the cut is
        // at `idx` and the category is cached unless `take_cat` was cleared.
        self.cat = if take_curr {
            idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
            None
        } else if take_cat {
            Some(cat)
        } else {
            None
        };

        // Hand out everything before `idx` and keep the rest for later calls.
        let retstr = &self.string[..idx];
        self.string = &self.string[idx..];
        Some(retstr)
    }
}
451
impl<'a> DoubleEndedIterator for UWordBounds<'a> {
    /// Splits the next segment off the back of the remaining string and returns it,
    /// or returns `None` once the remaining string is empty.
    ///
    /// The scan walks the characters from the back, driving the `UWordBoundsState`
    /// machine until a boundary is found, then slices the string at that point.
    #[inline]
    fn next_back(&mut self) -> Option<&'a str> {
        use self::FormatExtendType::*;
        use self::UWordBoundsState::*;
        use crate::tables::word as wd;
        if self.string.is_empty() {
            return None;
        }

        // Whether the character at `idx` belongs to the segment being returned.
        let mut take_curr = true;
        // Whether `cat` should be cached in `self.catb` for the next call.
        let mut take_cat = true;
        // Start at the byte index of the last character.
        let mut idx = self.string.len();
        idx -= self.string.chars().next_back().unwrap().len_utf8();
        // Index of the character examined just before the current one (to its right).
        let mut previdx = idx;
        // Position to fall back to when the scan has to backtrack.
        let mut saveidx = idx;
        let mut state = Start;
        // State to restore after a run of Format/Extend characters has been skipped.
        let mut savestate = Start;
        let mut cat = wd::WC_Any;

        // WB3c is context-sensitive (ZWJ + Extended_Pictographic),
        // while WB4 collapses Extend/Format and would otherwise hide that context.
        // We therefore keep this context outside the main state machine:
        // whether the nearest non-(Extend|Format) char to the right is emoji.
        let mut right_significant_is_emoji: bool = false;

        // Set once a Format/Extend run has been skipped; it stops WSegSpace from
        // joining across such a run (WB3d).
        let mut skipped_format_extend = false;

        for (curr, ch) in self.string.char_indices().rev() {
            previdx = idx;
            idx = curr;

            // if there's a category cached, grab it
            cat = match self.catb {
                None => wd::word_category(ch).2,
                _ => self.catb.take().unwrap(),
            };
            take_cat = true;

            // backward iterator over word boundaries. Mostly the same as the forward
            // iterator, with two weirdnesses:
            // (1) If we encounter a single quote in the Start state, we have to check for a
            //     Hebrew Letter immediately before it.
            // (2) Format and Extend char handling takes some gymnastics.

            // Reverse-direction WB3c check: when we encounter ZWJ and the nearest
            // significant right-side char is emoji, do not break here.
            if cat == wd::WC_ZWJ && state != Zwj && right_significant_is_emoji {
                continue;
            }

            // Keep the right-side WB3c context up to date as we move left.
            // Ignore Extend/Format here to mirror WB4 collapsing behavior.
            if cat != wd::WC_Extend && cat != wd::WC_Format {
                right_significant_is_emoji = is_emoji(ch);
            }

            if cat == wd::WC_Extend || cat == wd::WC_Format || (cat == wd::WC_ZWJ && state != Zwj) {
                // WB3c has more priority so we should not
                // fold in that case
                if !matches!(state, FormatExtend(_) | Start) {
                    // Remember where we were so the state can be restored once the
                    // Format/Extend run ends.
                    saveidx = previdx;
                    savestate = state;
                    state = FormatExtend(AcceptNone);
                }

                if state != Start {
                    continue;
                }
            } else if state == FormatExtend(AcceptNone) {
                // finished a scan of some Format|Extend chars, restore previous state
                state = savestate;
                previdx = saveidx;
                take_cat = false;
                skipped_format_extend = true;
            }

            // Don't use `continue` in this match without updating `catb`
            state = match state {
                Start | FormatExtend(AcceptAny) => match cat {
                    wd::WC_ALetter => Letter,            // rule WB5, WB7, WB10, WB13b
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5, WB7, WB7c, WB10, WB13b
                    wd::WC_Numeric => Numeric,           // rule WB8, WB9, WB11, WB13b
                    wd::WC_Katakana => Katakana,         // rule WB13, WB13b
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
                    // rule WB4:
                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
                    wd::WC_Single_Quote => {
                        saveidx = idx;
                        FormatExtend(AcceptQLetter) // rule WB7a
                    }
                    wd::WC_WSegSpace => WSegSpace,
                    wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
                        if state == Start {
                            if cat == wd::WC_LF {
                                // Step `idx` back onto a directly preceding CR so that
                                // CR LF stays together.
                                idx -= match self.get_prev_cat(idx) {
                                    Some(wd::WC_CR) => 1, // rule WB3
                                    _ => 0,
                                };
                            }
                        } else {
                            take_curr = false;
                        }
                        break; // rule WB3a
                    }
                    _ if is_emoji(ch) => Zwj,
                    _ => break, // rule WB999
                },
                Zwj => match cat {
                    // rule WB3c
                    wd::WC_ZWJ => FormatExtend(AcceptAny),
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                WSegSpace => match cat {
                    // rule WB3d
                    wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Letter | HLetter => match cat {
                    wd::WC_ALetter => Letter,            // rule WB5
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5
                    wd::WC_Numeric => Numeric,           // rule WB10
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
                    wd::WC_Double_Quote if state == HLetter => {
                        saveidx = previdx;
                        FormatExtend(RequireHLetter) // rule WB7c
                    }
                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        saveidx = previdx;
                        FormatExtend(RequireLetter) // rule WB7
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Numeric => match cat {
                    wd::WC_Numeric => Numeric,           // rule WB8
                    wd::WC_ALetter => Letter,            // rule WB9
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB9
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        saveidx = previdx;
                        FormatExtend(RequireNumeric) // rule WB11
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Katakana => match cat {
                    wd::WC_Katakana => Katakana,         // rule WB13
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                ExtendNumLet => match cat {
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_ALetter => Letter,            // rule WB13a
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB13a
                    wd::WC_Numeric => Numeric,           // rule WB13a
                    wd::WC_Katakana => Katakana,         // rule WB13a
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional(mut regional_state) => match cat {
                    // rule WB13c
                    wd::WC_Regional_Indicator => {
                        if regional_state == RegionalState::Unknown {
                            // Count the Regional_Indicator characters directly before
                            // `previdx`, ignoring ZWJ/Extend/Format; the parity of that
                            // count decides how the indicators pair up.
                            let count = self.string[..previdx]
                                .chars()
                                .rev()
                                .map(|c| wd::word_category(c).2)
                                .filter(|&c| {
                                    !(c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format)
                                })
                                .take_while(|&c| c == wd::WC_Regional_Indicator)
                                .count();
                            regional_state = if count % 2 == 0 {
                                RegionalState::Full
                            } else {
                                RegionalState::Half
                            };
                        }
                        if regional_state == RegionalState::Full {
                            take_curr = false;
                            break;
                        } else {
                            Regional(RegionalState::Full)
                        }
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Emoji => {
                    if is_emoji(ch) {
                        // rule WB3c
                        Zwj
                    } else {
                        take_curr = false;
                        break;
                    }
                }
                FormatExtend(t) => match t {
                    RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
                    RequireLetter if cat == wd::WC_ALetter => Letter,   // rule WB6
                    RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6
                    AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a
                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
                    _ => break,                                         // backtrack will happen
                },
            }
        }

        if let FormatExtend(t) = state {
            // if we required something but didn't find it, backtrack
            if t == RequireLetter
                || t == RequireHLetter
                || t == RequireNumeric
                || t == AcceptNone
                || t == AcceptQLetter
            {
                previdx = saveidx;
                take_cat = false;
                take_curr = false;
            }
        }

        // Decide where to cut and what to cache for the next call: when the current
        // character is taken the cut is at `idx` and nothing is cached; otherwise the
        // cut moves to `previdx` and the category is cached unless `take_cat` was cleared.
        self.catb = if take_curr {
            None
        } else {
            idx = previdx;
            if take_cat {
                Some(cat)
            } else {
                None
            }
        };

        // Hand out everything from `idx` onwards and keep the front for later calls.
        let retstr = &self.string[idx..];
        self.string = &self.string[..idx];
        Some(retstr)
    }
}
709
710impl<'a> UWordBounds<'a> {
711    #[inline]
712    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
713    ///
714    /// ```rust
715    /// # use unicode_segmentation::UnicodeSegmentation;
716    /// let mut iter = "Hello world".split_word_bounds();
717    /// assert_eq!(iter.as_str(), "Hello world");
718    /// iter.next();
719    /// assert_eq!(iter.as_str(), " world");
720    /// iter.next();
721    /// assert_eq!(iter.as_str(), "world");
722    /// ```
723    pub fn as_str(&self) -> &'a str {
724        self.string
725    }
726
727    #[inline]
728    fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
729        use crate::tables::word as wd;
730        let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
731        if nidx < self.string.len() {
732            let nch = self.string[nidx..].chars().next().unwrap();
733            Some(wd::word_category(nch).2)
734        } else {
735            None
736        }
737    }
738
739    #[inline]
740    fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
741        use crate::tables::word as wd;
742        if idx > 0 {
743            let nch = self.string[..idx].chars().next_back().unwrap();
744            Some(wd::word_category(nch).2)
745        } else {
746            None
747        }
748    }
749}
750
/// ASCII-fast-path word-boundary iterator for strings that contain only ASCII characters.
///
/// Since we handle only ASCII characters, we can use a much simpler set of
/// word break values than the full Unicode algorithm.
/// <https://www.unicode.org/reports/tr29/#Table_Word_Break_Property_Values>
///
/// | Word_Break value | ASCII code points that belong to it                             |
/// | -----------------| --------------------------------------------------------------- |
/// | CR               | U+000D (CR)                                                     |
/// | LF               | U+000A (LF)                                                     |
/// | Newline          | U+000B (VT), U+000C (FF)                                        |
/// | Single_Quote     | U+0027 (')                                                      |
/// | Double_Quote     | U+0022 (")                                                      |
/// | MidNumLet        | U+002E (.) FULL STOP                                            |
/// | MidLetter        | U+003A (:) COLON                                                |
/// | MidNum           | U+002C (,), U+003B (;)                                          |
/// | Numeric          | U+0030 – U+0039 (0 … 9)                                         |
/// | ALetter          | U+0041 – U+005A (A … Z), U+0061 – U+007A (a … z)                |
/// | ExtendNumLet     | U+005F (_) underscore                                           |
/// | WSegSpace        | U+0020 (SPACE)                                                  |
///
/// The macro MidNumLetQ boils down to: U+002E (.) FULL STOP and U+0027 (')
/// AHLetter is the same as ALetter, so we don't need to distinguish it.
///
/// Any other single ASCII byte is its own boundary (the default WB999).
#[derive(Debug)]
struct AsciiWordBoundIter<'a> {
    /// Set to the whole input by `new`.
    // NOTE(review): presumably the not-yet-yielded remainder of the input — the
    // iterator impl is not in this part of the file; confirm there.
    rest: &'a str,
    /// Set to 0 by `new`.
    // NOTE(review): presumably the byte offset of `rest` within the original input —
    // confirm against the iterator impl.
    offset: usize,
}
781
782impl<'a> AsciiWordBoundIter<'a> {
783    pub fn new(s: &'a str) -> Self {
784        AsciiWordBoundIter { rest: s, offset: 0 }
785    }
786
787    #[inline]
788    fn is_core(b: u8) -> bool {
789        b.is_ascii_alphanumeric() || b == b'_'
790    }
791
792    #[inline]
793    fn is_infix(b: u8, prev: u8, next: u8) -> bool {
794        match b {
795            // Numeric separators such as "1,000" or "3.14" (WB11/WB12)
796            //
797            // "Numeric (MidNum | MidNumLetQ) Numeric"
798            b'.' | b',' | b';' | b'\'' if prev.is_ascii_digit() && next.is_ascii_digit() => true,
799
800            // Dot or colon inside an alphabetic word ("e.g.", "http://") (WB6/WB7)
801            //
802            // "(MidLetter | MidNumLetQ) AHLetter (MidLetter | MidNumLetQ)"
803            // MidLetter  = b':'
804            // MidNumLetQ = b'.' | b'\''
805            b'\'' | b'.' | b':' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => true,
806            _ => false,
807        }
808    }
809}
810
811impl<'a> Iterator for AsciiWordBoundIter<'a> {
812    type Item = (usize, &'a str);
813
814    #[inline]
815    fn next(&mut self) -> Option<Self::Item> {
816        if self.rest.is_empty() {
817            return None;
818        }
819
820        let bytes = self.rest.as_bytes();
821        let len = bytes.len();
822
823        // 1) Keep horizontal whitespace together.
824        // Spec: WB3d joins adjacent *WSegSpace* into a single segment.
825        if bytes[0] == b' ' {
826            let mut i = 1;
827            while i < len && bytes[i] == b' ' {
828                i += 1;
829            }
830            let word = &self.rest[..i];
831            let pos = self.offset;
832            self.rest = &self.rest[i..];
833            self.offset += i;
834            return Some((pos, word));
835        }
836
837        // 2) Core-run (letters/digits/underscore + infix)
838        // Spec: ALetter × ALetter, Numeric × Numeric etc. (WB5–WB13b)
839        if Self::is_core(bytes[0]) {
840            let mut i = 1;
841            while i < len {
842                let b = bytes[i];
843                if Self::is_core(b)
844                    || (i + 1 < len && Self::is_infix(b, bytes[i - 1], bytes[i + 1]))
845                {
846                    i += 1;
847                } else {
848                    break;
849                }
850            }
851            let word = &self.rest[..i];
852            let pos = self.offset;
853            self.rest = &self.rest[i..];
854            self.offset += i;
855            return Some((pos, word));
856        }
857
858        // 3) Do not break within CRLF.
859        // Spec: WB3 treats CR+LF as a single non‑breaking pair.
860        if bytes[0] == b'\r' && len >= 2 && bytes[1] == b'\n' {
861            let word = &self.rest[..2];
862            let pos = self.offset;
863            self.rest = &self.rest[2..];
864            self.offset += 2;
865            Some((pos, word))
866        } else {
867            // 4) Otherwise, break everywhere
868            // Spec: the catch‑all rule WB999.
869            let word = &self.rest[..1];
870            let pos = self.offset;
871            self.rest = &self.rest[1..];
872            self.offset += 1;
873            Some((pos, word))
874        }
875    }
876}
877
878impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> {
879    fn next_back(&mut self) -> Option<(usize, &'a str)> {
880        let rest = self.rest;
881        if rest.is_empty() {
882            return None;
883        }
884        let bytes = rest.as_bytes();
885        let len = bytes.len();
886
887        // 1) Group runs of spaces
888        // Spec: WB3d joins adjacent *WSegSpace* into a single segment.
889        if bytes[len - 1] == b' ' {
890            // find start of this last run of spaces
891            let mut start = len - 1;
892            while start > 0 && bytes[start - 1] == b' ' {
893                start -= 1;
894            }
895            let word = &rest[start..];
896            let pos = self.offset + start;
897            self.rest = &rest[..start];
898            return Some((pos, word));
899        }
900
901        // 2) Trailing Core-run (letters/digits/underscore + infix)
902        // Spec: ALetter × ALetter, Numeric × Numeric etc. (WB5–WB13b)
903        if Self::is_core(bytes[len - 1]) {
904            // scan backwards as long as we see `is_core` or an `is_infix`
905            let mut start = len - 1;
906            while start > 0 {
907                let b = bytes[start - 1];
908                let prev = if start >= 2 { bytes[start - 2] } else { b };
909                let next = bytes[start]; // the byte we just included
910                if Self::is_core(b) || Self::is_infix(b, prev, next) {
911                    start -= 1;
912                } else {
913                    break;
914                }
915            }
916            let word = &rest[start..];
917            let pos = self.offset + start;
918            self.rest = &rest[..start];
919            return Some((pos, word));
920        }
921
922        // 3) Non-core: CR+LF as one token, otherwise single char
923        // Spec: WB3 treats CR+LF as a single non‑breaking pair.
924        if len >= 2 && bytes[len - 2] == b'\r' && bytes[len - 1] == b'\n' {
925            let start = len - 2;
926            let word = &rest[start..];
927            let pos = self.offset + start;
928            self.rest = &rest[..start];
929            return Some((pos, word));
930        }
931
932        // 4) Fallback – every other byte is its own segment
933        // Spec: the catch‑all rule WB999.
934        let start = len - 1;
935        let word = &rest[start..];
936        let pos = self.offset + start;
937        self.rest = &rest[..start];
938        Some((pos, word))
939    }
940}
941
942#[inline]
943fn ascii_word_ok(t: &(usize, &str)) -> bool {
944    has_ascii_alphanumeric(&t.1)
945}
946#[inline]
947fn unicode_word_ok(t: &(usize, &str)) -> bool {
948    has_alphanumeric(&t.1)
949}
950
// The predicates and the mapping function are spelled as `fn` pointers so that
// the adapter types below can be named in enum variants.

/// ASCII path, words only: boundary segments with their offset dropped (`Map`),
/// then filtered by a `&&str` predicate.
type AsciiWordsIter<'a> = Filter<
    core::iter::Map<AsciiWordBoundIter<'a>, fn((usize, &'a str)) -> &'a str>,
    fn(&&'a str) -> bool,
>;
/// General path, words only: `UWordBounds` segments filtered by a `&&str` predicate.
type UnicodeWordsIter<'a> = Filter<UWordBounds<'a>, fn(&&'a str) -> bool>;
/// ASCII path, `(offset, word)` pairs: boundary segments filtered by a pair predicate.
type AsciiIndicesIter<'a> = Filter<AsciiWordBoundIter<'a>, fn(&(usize, &'a str)) -> bool>;
/// General path, `(offset, word)` pairs: `UWordBoundIndices` items filtered by a pair predicate.
type UnicodeIndicesIter<'a> = Filter<UWordBoundIndices<'a>, fn(&(usize, &'a str)) -> bool>;
958
/// Backing iterator of `UnicodeWords`: the fast path for all-ASCII input or the
/// general Unicode path. `UnicodeWords` dispatches on the variant in each method.
#[derive(Debug)]
enum WordsIter<'a> {
    /// Input was all ASCII; uses `AsciiWordBoundIter` underneath.
    Ascii(AsciiWordsIter<'a>),
    /// Input contains non-ASCII characters; uses `UWordBounds` underneath.
    Unicode(UnicodeWordsIter<'a>),
}
964
/// Backing iterator of `UnicodeWordIndices`: the fast path for all-ASCII input
/// or the general Unicode path, both yielding `(offset, word)` pairs.
#[derive(Debug)]
enum IndicesIter<'a> {
    /// Input was all ASCII; uses `AsciiWordBoundIter` underneath.
    Ascii(AsciiIndicesIter<'a>),
    /// Input contains non-ASCII characters; uses `UWordBoundIndices` underneath.
    Unicode(UnicodeIndicesIter<'a>),
}
970
971#[inline]
972pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> {
973    let inner = if s.is_ascii() {
974        WordsIter::Ascii(new_unicode_words_ascii(s))
975    } else {
976        WordsIter::Unicode(new_unicode_words_general(s))
977    };
978    UnicodeWords { inner }
979}
980
981#[inline]
982pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> {
983    let inner = if s.is_ascii() {
984        IndicesIter::Ascii(new_ascii_word_bound_indices(s).filter(ascii_word_ok))
985    } else {
986        IndicesIter::Unicode(new_word_bound_indices(s).filter(unicode_word_ok))
987    };
988    UnicodeWordIndices { inner }
989}
990
991#[inline]
992pub fn new_word_bounds(s: &str) -> UWordBounds<'_> {
993    UWordBounds {
994        string: s,
995        cat: None,
996        catb: None,
997    }
998}
999
1000#[inline]
1001pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> {
1002    UWordBoundIndices {
1003        start_offset: s.as_ptr() as usize,
1004        iter: new_word_bounds(s),
1005    }
1006}
1007
1008#[inline]
1009fn new_ascii_word_bound_indices(s: &str) -> AsciiWordBoundIter<'_> {
1010    AsciiWordBoundIter::new(s)
1011}
1012
1013#[inline]
1014fn has_alphanumeric(s: &&str) -> bool {
1015    use crate::tables::util::is_alphanumeric;
1016
1017    s.chars().any(is_alphanumeric)
1018}
1019
/// Returns `true` if `s` contains at least one ASCII letter or digit.
///
/// Scanning bytes is equivalent to scanning chars here: every byte of a
/// multi-byte UTF-8 sequence is >= 0x80 and therefore never ASCII alphanumeric.
#[inline]
fn has_ascii_alphanumeric(s: &&str) -> bool {
    s.as_bytes().iter().any(u8::is_ascii_alphanumeric)
}
1024
/// Drops the offset from an `(offset, segment)` pair and returns the segment.
#[inline(always)]
fn strip_pos(pair: (usize, &str)) -> &str {
    pair.1
}
1029
1030#[inline]
1031fn new_unicode_words_ascii<'a>(s: &'a str) -> AsciiWordsIter<'a> {
1032    new_ascii_word_bound_indices(s)
1033        .map(strip_pos as fn(_) -> _)
1034        .filter(has_ascii_alphanumeric)
1035}
1036
1037#[inline]
1038fn new_unicode_words_general<'a>(s: &'a str) -> UnicodeWordsIter<'a> {
1039    new_word_bounds(s).filter(has_alphanumeric)
1040}
1041
#[cfg(test)]
mod tests {
    use crate::word::{
        new_ascii_word_bound_indices, new_unicode_words_ascii, new_word_bound_indices,
    };
    use std::string::String;
    use std::vec;
    use std::vec::Vec;

    use proptest::prelude::*;

    /// U+070F (Syriac abbreviation mark) must be categorised as ALetter.
    #[test]
    fn test_syriac_abbr_mark() {
        use crate::tables::word as wd;
        let (_, _, cat) = wd::word_category('\u{70f}');
        assert_eq!(cat, wd::WC_ALetter);
    }

    /// U+06DD (Arabic end of ayah) must be categorised as Numeric.
    #[test]
    fn test_end_of_ayah_cat() {
        use crate::tables::word as wd;
        let (_, _, cat) = wd::word_category('\u{6dd}');
        assert_eq!(cat, wd::WC_Numeric);
    }

    /// The ASCII boundary iterator yields every segment, punctuation and
    /// whitespace included, together with its byte offset.
    #[test]
    fn test_ascii_word_bound_indices_various_cases() {
        let s = "Hello, world!";
        let words: Vec<(usize, &str)> = new_ascii_word_bound_indices(s).collect();
        let expected = vec![
            (0, "Hello"), // simple letters
            (5, ","),     // comma is not between two digits, so it stands alone
            (6, " "),     // space after comma
            (7, "world"), // letters up to, but not including, '!'
            (12, "!"),    // punctuation at the end
        ];
        assert_eq!(words, expected);
    }

    /// The ASCII word iterator (offsets stripped) keeps only segments that
    /// contain a letter or digit.
    #[test]
    fn test_ascii_word_indices_various_cases() {
        let s = "Hello, world! can't e.g. var1 123,456 foo_bar example.com 127.0.0.1:9090";
        let words: Vec<&str> = new_unicode_words_ascii(s).collect();
        let expected = vec![
            ("Hello"), // simple letters
            ("world"), // the ", " before and the '!' after are filtered out
            ("can't"), // apostrophe joins letters
            ("e.g"),   // the trailing '.' is not followed by a letter, so it is split off
            ("var1"),  // letters and digits mix freely
            ("123,456"), // digits+comma+digits
            ("foo_bar"), // underscore extends the word
            ("example.com"), // dot between letters
            ("127.0.0.1"), // dots between digits; ':' between digits does not join
            ("9090"), // port number
        ];
        assert_eq!(words, expected);
    }

    /// Strategy that yields every code-point from NUL (0) to DEL (127).
    fn ascii_char() -> impl Strategy<Value = char> {
        (0u8..=127).prop_map(|b| b as char)
    }

    proptest! {
        #![proptest_config(ProptestConfig::with_cases(10000))]
        /// Forward iteration: the fast path must equal the general path for any ASCII input.
        #[test]
        fn proptest_ascii_matches_unicode_word_indices(
            // Vec<char> -> String, length 0..=99
            s in proptest::collection::vec(ascii_char(), 0..100)
                   .prop_map(|v| v.into_iter().collect::<String>())
        ) {
            let fast: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).collect();
            let uni:  Vec<(usize, &str)> = new_word_bound_indices(&s).collect();

            prop_assert_eq!(fast, uni);
        }

        /// Reverse iteration: the fast path must equal the general path for any ASCII input.
        #[test]
        fn proptest_ascii_matches_unicode_word_indices_rev(
            // Vec<char> -> String, length 0..=99
            s in proptest::collection::vec(ascii_char(), 0..100)
                   .prop_map(|v| v.into_iter().collect::<String>())
        ) {
            let fast_rev: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).rev().collect();
            let uni_rev : Vec<(usize, &str)> = new_word_bound_indices(&s).rev().collect();
            prop_assert_eq!(fast_rev, uni_rev);
        }
    }
}
unicode_segmentation/word.rs

unicode_segmentation/
word.rs