unicode_segmentation/word.rs
1// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use core::cmp;
12use core::iter::Filter;
13
14use crate::tables::word::WordCat;
15
/// An iterator over the substrings of a string which, after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// The iterator is double-ended: words can be taken from the front with `next` and
/// from the back with `next_back`.
///
/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See
/// its documentation for more.
///
/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Debug)]
pub struct UnicodeWords<'a> {
    // Backend doing the actual work: either the ASCII fast path or the general
    // Unicode segmenter (see `WordsIter`).
    inner: WordsIter<'a>,
}
32
33impl<'a> Iterator for UnicodeWords<'a> {
34 type Item = &'a str;
35 #[inline]
36 fn next(&mut self) -> Option<Self::Item> {
37 match &mut self.inner {
38 WordsIter::Ascii(i) => i.next(),
39 WordsIter::Unicode(i) => i.next(),
40 }
41 }
42 #[inline]
43 fn size_hint(&self) -> (usize, Option<usize>) {
44 match &self.inner {
45 WordsIter::Ascii(i) => i.size_hint(),
46 WordsIter::Unicode(i) => i.size_hint(),
47 }
48 }
49}
50impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
51 #[inline]
52 fn next_back(&mut self) -> Option<Self::Item> {
53 match &mut self.inner {
54 WordsIter::Ascii(i) => i.next_back(),
55 WordsIter::Unicode(i) => i.next_back(),
56 }
57 }
58}
59
/// An iterator over the substrings of a string which, after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
/// This iterator also provides the byte offsets for each substring: every item is an
/// `(offset, word)` pair.
///
/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
/// its documentation for more.
///
/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Debug)]
pub struct UnicodeWordIndices<'a> {
    // Backend doing the actual work: either the ASCII fast path or the general
    // Unicode segmenter (see `IndicesIter`).
    inner: IndicesIter<'a>,
}
77
78impl<'a> Iterator for UnicodeWordIndices<'a> {
79 type Item = (usize, &'a str);
80 #[inline]
81 fn next(&mut self) -> Option<Self::Item> {
82 match &mut self.inner {
83 IndicesIter::Ascii(i) => i.next(),
84 IndicesIter::Unicode(i) => i.next(),
85 }
86 }
87 #[inline]
88 fn size_hint(&self) -> (usize, Option<usize>) {
89 match &self.inner {
90 IndicesIter::Ascii(i) => i.size_hint(),
91 IndicesIter::Unicode(i) => i.size_hint(),
92 }
93 }
94}
95impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
96 #[inline]
97 fn next_back(&mut self) -> Option<Self::Item> {
98 match &mut self.inner {
99 IndicesIter::Ascii(i) => i.next_back(),
100 IndicesIter::Unicode(i) => i.next_back(),
101 }
102 }
103}
104
/// External iterator for a string's
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// Each item is one segment between two consecutive word boundaries. Segments can be
/// taken from either end of the string.
///
/// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Debug, Clone)]
pub struct UWordBounds<'a> {
    // The part of the input that has not been yielded yet; it shrinks from the
    // front in `next` and from the back in `next_back`.
    string: &'a str,
    // Word category already looked up by `next` for the first char of `string`,
    // kept so the following call does not have to look it up again.
    cat: Option<WordCat>,
    // Same cache for `next_back`: category of the last char of `string`.
    catb: Option<WordCat>,
}
119
/// External iterator for word boundaries and byte offsets.
///
/// Each item is an `(offset, segment)` pair, where `offset` is the byte position of
/// `segment` in the string the iterator was created from.
///
/// This struct is created by the [`split_word_bound_indices`] method on the
/// [`UnicodeSegmentation`] trait. See its documentation for more.
///
/// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Debug, Clone)]
pub struct UWordBoundIndices<'a> {
    // Address of the first byte of the original string (`s.as_ptr() as usize`);
    // subtracted from a segment's address to obtain its byte offset.
    start_offset: usize,
    // The underlying segment iterator.
    iter: UWordBounds<'a>,
}
132
133impl<'a> UWordBoundIndices<'a> {
134 #[inline]
135 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
136 ///
137 /// ```rust
138 /// # use unicode_segmentation::UnicodeSegmentation;
139 /// let mut iter = "Hello world".split_word_bound_indices();
140 /// assert_eq!(iter.as_str(), "Hello world");
141 /// iter.next();
142 /// assert_eq!(iter.as_str(), " world");
143 /// iter.next();
144 /// assert_eq!(iter.as_str(), "world");
145 /// ```
146 pub fn as_str(&self) -> &'a str {
147 self.iter.as_str()
148 }
149}
150
151impl<'a> Iterator for UWordBoundIndices<'a> {
152 type Item = (usize, &'a str);
153
154 #[inline]
155 fn next(&mut self) -> Option<(usize, &'a str)> {
156 self.iter
157 .next()
158 .map(|s| (s.as_ptr() as usize - self.start_offset, s))
159 }
160
161 #[inline]
162 fn size_hint(&self) -> (usize, Option<usize>) {
163 self.iter.size_hint()
164 }
165}
166
167impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
168 #[inline]
169 fn next_back(&mut self) -> Option<(usize, &'a str)> {
170 self.iter
171 .next_back()
172 .map(|s| (s.as_ptr() as usize - self.start_offset, s))
173 }
174}
175
// State machine for the word boundary rules. One value describes what the
// segment scanned so far "ends in", i.e. the last character that was not
// skipped as Extend/Format/ZWJ.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum UWordBoundsState {
    // Nothing of the current segment has been classified yet.
    Start,
    // Last significant char was ALetter.
    Letter,
    // Last significant char was Hebrew_Letter.
    HLetter,
    // Last significant char was Numeric.
    Numeric,
    // Last significant char was Katakana.
    Katakana,
    // Last significant char was ExtendNumLet.
    ExtendNumLet,
    // Inside a run of Regional_Indicator chars; the payload tracks pairing.
    Regional(RegionalState),
    // Waiting to see what follows a mid-word char or a run of Extend/Format chars;
    // the payload says what is acceptable.
    FormatExtend(FormatExtendType),
    // Forward: the segment started with a ZWJ. Backward: an Extended_Pictographic
    // char was just seen, so a ZWJ to its left still belongs to it (WB3c).
    Zwj,
    // Forward: a ZWJ followed by an Extended_Pictographic char was just consumed (WB3c).
    Emoji,
    // Inside a run of WSegSpace chars (WB3d).
    WSegSpace,
}
191
// Subtypes for the FormatExtend state in UWordBoundsState: what the scanner is
// allowed or required to see next.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum FormatExtendType {
    // Backward scan only: behaves like `Start`, any category may follow.
    AcceptAny,
    // Only collecting Extend/Format/ZWJ chars; any other char ends the segment.
    AcceptNone,
    // A mid-letter char was seen; an ALetter or Hebrew_Letter must follow,
    // otherwise the scan rewinds to the saved position.
    RequireLetter,
    // A double quote was seen next to a Hebrew_Letter; another Hebrew_Letter must
    // follow, otherwise the scan rewinds.
    RequireHLetter,
    // A single quote was seen next to a (possible) Hebrew_Letter (WB7a).
    AcceptQLetter,
    // A mid-number char was seen; a Numeric must follow, otherwise the scan rewinds.
    RequireNumeric,
}
202
// Pairing state for Regional_Indicator chars (rule WB13c).
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum RegionalState {
    // One indicator seen; one more may join it.
    Half,
    // A complete pair has been seen; another indicator starts a new segment.
    Full,
    // Backward iteration only: pairing is not known yet and is resolved by
    // counting the indicators that precede the current position.
    Unknown,
}
209
210fn is_emoji(ch: char) -> bool {
211 use crate::tables::emoji;
212 emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
213}
214
// Forward iteration: scans `self.string` from the front with the word-boundary
// state machine and yields the first segment, shrinking `self.string` accordingly.
impl<'a> Iterator for UWordBounds<'a> {
    type Item = &'a str;

    // At least one segment remains if the string is non-empty; there can never be
    // more segments than bytes.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let slen = self.string.len();
        (cmp::min(slen, 1), Some(slen))
    }

    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        use self::FormatExtendType::*;
        use self::UWordBoundsState::*;
        use crate::tables::word as wd;
        if self.string.is_empty() {
            return None;
        }

        // take_curr: whether the char at `idx` belongs to the returned segment.
        // take_cat:  whether `cat` should be cached for the next call when the
        //            current char is left for the next segment.
        // saveidx / savecat: position and category of a mid-word char to rewind
        //            to if the char required after it never shows up.
        let mut take_curr = true;
        let mut take_cat = true;
        let mut idx = 0;
        let mut saveidx = 0;
        let mut state = Start;
        let mut cat = wd::WC_Any;
        let mut savecat = wd::WC_Any;

        // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
        let mut skipped_format_extend = false;
        for (curr, ch) in self.string.char_indices() {
            idx = curr;
            // Whether or not the previous category was ZWJ
            // ZWJs get collapsed, so this handles precedence of WB3c over WB4
            let prev_zwj = cat == wd::WC_ZWJ;
            // if there's a category cached, grab it
            cat = match self.cat {
                None => wd::word_category(ch).2,
                _ => self.cat.take().unwrap(),
            };
            take_cat = true;

            // handle rule WB4
            // just skip all format, extend, and zwj chars
            // note that Start is a special case: if there's a bunch of Format | Extend
            // characters at the beginning of a block of text, dump them out as one unit.
            //
            // (This is not obvious from the wording of UAX#29, but if you look at the
            // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
            // then the "correct" interpretation of WB4 becomes apparent.)
            if state != Start {
                match cat {
                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
                        skipped_format_extend = true;
                        continue;
                    }
                    _ => {}
                }
            }

            // rule WB3c
            // WB4 makes all ZWJs collapse into the previous state
            // but you can still be in a Zwj state if you started with Zwj
            //
            // This means that an EP + Zwj will collapse into EP, which is wrong,
            // since EP+EP is not a boundary but EP+ZWJ+EP is
            //
            // Thus, we separately keep track of whether or not the last character
            // was a ZWJ. This is an additional bit of state tracked outside of the
            // state enum; the state enum represents the last non-zwj state encountered.
            // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
            // however we are in the previous state for the purposes of all other rules.
            if prev_zwj && is_emoji(ch) {
                state = Emoji;
                continue;
            }
            // Don't use `continue` in this match without updating `cat`
            state = match state {
                Start if cat == wd::WC_CR => {
                    idx += match self.get_next_cat(idx) {
                        Some(wd::WC_LF) => 1, // rule WB3
                        _ => 0,
                    };
                    break; // rule WB3a
                }
                Start => match cat {
                    wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a
                    wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
                    wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a
                    wd::WC_Katakana => Katakana, // rule WB13, WB13a
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
                    wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
                    wd::WC_LF | wd::WC_Newline => break, // rule WB3a
                    wd::WC_ZWJ => Zwj,                   // rule WB3c
                    wd::WC_WSegSpace => WSegSpace,       // rule WB3d
                    _ => {
                        if let Some(ncat) = self.get_next_cat(idx) {
                            // rule WB4
                            if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ
                            {
                                state = FormatExtend(AcceptNone);
                                self.cat = Some(ncat);
                                continue;
                            }
                        }
                        break; // rule WB999
                    }
                },
                WSegSpace => match cat {
                    wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Zwj => {
                    // We already handle WB3c above.
                    take_curr = false;
                    break;
                }
                Letter | HLetter => match cat {
                    wd::WC_ALetter => Letter,            // rule WB5
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5
                    wd::WC_Numeric => Numeric,           // rule WB9
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_Double_Quote if state == HLetter => {
                        savecat = cat;
                        saveidx = idx;
                        FormatExtend(RequireHLetter) // rule WB7b
                    }
                    wd::WC_Single_Quote if state == HLetter => {
                        FormatExtend(AcceptQLetter) // rule WB7a
                    }
                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        savecat = cat;
                        saveidx = idx;
                        FormatExtend(RequireLetter) // rule WB6
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Numeric => match cat {
                    wd::WC_Numeric => Numeric,           // rule WB8
                    wd::WC_ALetter => Letter,            // rule WB10
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB10
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        savecat = cat;
                        saveidx = idx;
                        FormatExtend(RequireNumeric) // rule WB12
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Katakana => match cat {
                    wd::WC_Katakana => Katakana,         // rule WB13
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                ExtendNumLet => match cat {
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_ALetter => Letter,            // rule WB13b
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB13b
                    wd::WC_Numeric => Numeric,           // rule WB13b
                    wd::WC_Katakana => Katakana,         // rule WB13b
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional(RegionalState::Full) => {
                    // if it reaches here we've gone too far,
                    // a full flag can only compose with the ZWJ/Extend/Format
                    // chars that follow it (those were skipped above).
                    take_curr = false;
                    break;
                }
                Regional(RegionalState::Half) => match cat {
                    wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional(_) => {
                    unreachable!("RegionalState::Unknown should not occur on forward iteration")
                }
                Emoji => {
                    // We already handle WB3c above. If you've reached this point, the emoji sequence is over.
                    take_curr = false;
                    break;
                }
                FormatExtend(t) => match t {
                    // handle FormatExtends depending on what type
                    RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
                    RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
                    RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
                    AcceptNone | AcceptQLetter => {
                        take_curr = false; // emit all the Format|Extend characters
                        take_cat = false;
                        break;
                    }
                    _ => break, // rewind (in if statement below)
                },
            }
        }

        if let FormatExtend(t) = state {
            // we were looking for something and didn't find it; we have to back up
            if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
                idx = saveidx;
                cat = savecat;
                take_curr = false;
            }
        }

        // When the current char is part of the segment, move `idx` past it and
        // drop the cache. Otherwise the char starts the next segment, and its
        // category is cached (if still meaningful) for the next call.
        self.cat = if take_curr {
            idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
            None
        } else if take_cat {
            Some(cat)
        } else {
            None
        };

        // Split at `idx`: the head is returned, the tail remains to be iterated.
        let retstr = &self.string[..idx];
        self.string = &self.string[idx..];
        Some(retstr)
    }
}
451
// Backward iteration: scans `self.string` from the end with the word-boundary
// state machine and yields the last segment, shrinking `self.string` accordingly.
impl<'a> DoubleEndedIterator for UWordBounds<'a> {
    #[inline]
    fn next_back(&mut self) -> Option<&'a str> {
        use self::FormatExtendType::*;
        use self::UWordBoundsState::*;
        use crate::tables::word as wd;
        if self.string.is_empty() {
            return None;
        }

        // take_curr: whether the char at `idx` belongs to the returned segment.
        // take_cat:  whether `cat` should be cached for the next call when the
        //            current char is left for the next segment.
        // idx:       byte index of the char being examined; starts at the last char.
        // previdx:   byte index of the char examined just before (to the right);
        //            the segment starts there when the current char is not taken.
        // saveidx / savestate: where to resume or rewind to after a run of
        //            Extend/Format chars or an unsatisfied mid-word char.
        let mut take_curr = true;
        let mut take_cat = true;
        let mut idx = self.string.len();
        idx -= self.string.chars().next_back().unwrap().len_utf8();
        let mut previdx = idx;
        let mut saveidx = idx;
        let mut state = Start;
        let mut savestate = Start;
        let mut cat = wd::WC_Any;

        // WB3c is context-sensitive (ZWJ + Extended_Pictographic),
        // while WB4 collapses Extend/Format and would otherwise hide that context.
        // We therefore keep this context outside the main state machine:
        // whether the nearest non-(Extend|Format) char to the right is emoji.
        let mut right_significant_is_emoji: bool = false;

        // Set once a run of Extend/Format chars has been folded away; WB3d must not
        // join WSegSpace chars across such a run.
        let mut skipped_format_extend = false;

        for (curr, ch) in self.string.char_indices().rev() {
            previdx = idx;
            idx = curr;

            // if there's a category cached, grab it
            cat = match self.catb {
                None => wd::word_category(ch).2,
                _ => self.catb.take().unwrap(),
            };
            take_cat = true;

            // backward iterator over word boundaries. Mostly the same as the forward
            // iterator, with two weirdnesses:
            // (1) If we encounter a single quote in the Start state, we have to check for a
            //     Hebrew Letter immediately before it.
            // (2) Format and Extend char handling takes some gymnastics.

            // Reverse-direction WB3c check: when we encounter ZWJ and the nearest
            // significant right-side char is emoji, do not break here.
            if cat == wd::WC_ZWJ && state != Zwj && right_significant_is_emoji {
                continue;
            }

            // Keep the right-side WB3c context up to date as we move left.
            // Ignore Extend/Format here to mirror WB4 collapsing behavior.
            if cat != wd::WC_Extend && cat != wd::WC_Format {
                right_significant_is_emoji = is_emoji(ch);
            }

            if cat == wd::WC_Extend || cat == wd::WC_Format || (cat == wd::WC_ZWJ && state != Zwj) {
                // WB3c has more priority so we should not
                // fold in that case
                if !matches!(state, FormatExtend(_) | Start) {
                    saveidx = previdx;
                    savestate = state;
                    state = FormatExtend(AcceptNone);
                }

                if state != Start {
                    continue;
                }
            } else if state == FormatExtend(AcceptNone) {
                // finished a scan of some Format|Extend chars, restore previous state
                state = savestate;
                previdx = saveidx;
                take_cat = false;
                skipped_format_extend = true;
            }

            // Don't use `continue` in this match without updating `catb`
            state = match state {
                Start | FormatExtend(AcceptAny) => match cat {
                    wd::WC_ALetter => Letter,            // rule WB5, WB7, WB10, WB13b
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5, WB7, WB7c, WB10, WB13b
                    wd::WC_Numeric => Numeric,           // rule WB8, WB9, WB11, WB13b
                    wd::WC_Katakana => Katakana,         // rule WB13, WB13b
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
                    // rule WB4:
                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
                    wd::WC_Single_Quote => {
                        saveidx = idx;
                        FormatExtend(AcceptQLetter) // rule WB7a
                    }
                    wd::WC_WSegSpace => WSegSpace,
                    wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
                        if state == Start {
                            if cat == wd::WC_LF {
                                idx -= match self.get_prev_cat(idx) {
                                    Some(wd::WC_CR) => 1, // rule WB3
                                    _ => 0,
                                };
                            }
                        } else {
                            take_curr = false;
                        }
                        break; // rule WB3a
                    }
                    _ if is_emoji(ch) => Zwj,
                    _ => break, // rule WB999
                },
                Zwj => match cat {
                    // rule WB3c
                    wd::WC_ZWJ => FormatExtend(AcceptAny),
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                WSegSpace => match cat {
                    // rule WB3d
                    wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Letter | HLetter => match cat {
                    wd::WC_ALetter => Letter,            // rule WB5
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5
                    wd::WC_Numeric => Numeric,           // rule WB10
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
                    wd::WC_Double_Quote if state == HLetter => {
                        saveidx = previdx;
                        FormatExtend(RequireHLetter) // rule WB7c
                    }
                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        saveidx = previdx;
                        FormatExtend(RequireLetter) // rule WB7
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Numeric => match cat {
                    wd::WC_Numeric => Numeric,           // rule WB8
                    wd::WC_ALetter => Letter,            // rule WB9
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB9
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        saveidx = previdx;
                        FormatExtend(RequireNumeric) // rule WB11
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Katakana => match cat {
                    wd::WC_Katakana => Katakana,         // rule WB13
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                ExtendNumLet => match cat {
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_ALetter => Letter,            // rule WB13a
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB13a
                    wd::WC_Numeric => Numeric,           // rule WB13a
                    wd::WC_Katakana => Katakana,         // rule WB13a
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional(mut regional_state) => match cat {
                    // rule WB13c
                    wd::WC_Regional_Indicator => {
                        if regional_state == RegionalState::Unknown {
                            // Count the Regional_Indicator chars to the left of the
                            // previously visited char (ignoring ZWJ/Extend/Format) to
                            // learn whether that char completes a pair or starts one.
                            let count = self.string[..previdx]
                                .chars()
                                .rev()
                                .map(|c| wd::word_category(c).2)
                                .filter(|&c| {
                                    !(c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format)
                                })
                                .take_while(|&c| c == wd::WC_Regional_Indicator)
                                .count();
                            regional_state = if count % 2 == 0 {
                                RegionalState::Full
                            } else {
                                RegionalState::Half
                            };
                        }
                        if regional_state == RegionalState::Full {
                            take_curr = false;
                            break;
                        } else {
                            Regional(RegionalState::Full)
                        }
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Emoji => {
                    if is_emoji(ch) {
                        // rule WB3c
                        Zwj
                    } else {
                        take_curr = false;
                        break;
                    }
                }
                FormatExtend(t) => match t {
                    RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
                    RequireLetter if cat == wd::WC_ALetter => Letter,   // rule WB6
                    RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6
                    AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a
                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
                    _ => break, // backtrack will happen
                },
            }
        }

        if let FormatExtend(t) = state {
            // if we required something but didn't find it, backtrack
            if t == RequireLetter
                || t == RequireHLetter
                || t == RequireNumeric
                || t == AcceptNone
                || t == AcceptQLetter
            {
                previdx = saveidx;
                take_cat = false;
                take_curr = false;
            }
        }

        // When the current char is not part of the segment, the segment starts at
        // the previously visited char, and the current char's category is cached
        // (if still meaningful) for the next call.
        self.catb = if take_curr {
            None
        } else {
            idx = previdx;
            if take_cat {
                Some(cat)
            } else {
                None
            }
        };

        // Split at `idx`: the tail is returned, the head remains to be iterated.
        let retstr = &self.string[idx..];
        self.string = &self.string[..idx];
        Some(retstr)
    }
}
709
710impl<'a> UWordBounds<'a> {
711 #[inline]
712 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
713 ///
714 /// ```rust
715 /// # use unicode_segmentation::UnicodeSegmentation;
716 /// let mut iter = "Hello world".split_word_bounds();
717 /// assert_eq!(iter.as_str(), "Hello world");
718 /// iter.next();
719 /// assert_eq!(iter.as_str(), " world");
720 /// iter.next();
721 /// assert_eq!(iter.as_str(), "world");
722 /// ```
723 pub fn as_str(&self) -> &'a str {
724 self.string
725 }
726
727 #[inline]
728 fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
729 use crate::tables::word as wd;
730 let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
731 if nidx < self.string.len() {
732 let nch = self.string[nidx..].chars().next().unwrap();
733 Some(wd::word_category(nch).2)
734 } else {
735 None
736 }
737 }
738
739 #[inline]
740 fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
741 use crate::tables::word as wd;
742 if idx > 0 {
743 let nch = self.string[..idx].chars().next_back().unwrap();
744 Some(wd::word_category(nch).2)
745 } else {
746 None
747 }
748 }
749}
750
/// ASCII-fast-path word-boundary iterator for strings that contain only ASCII characters.
///
/// Since we handle only ASCII characters, we can use a much simpler set of
/// word break values than the full Unicode algorithm.
/// https://www.unicode.org/reports/tr29/#Table_Word_Break_Property_Values
///
/// | Word_Break value | ASCII code points that belong to it                             |
/// | -----------------| --------------------------------------------------------------- |
/// | CR               | U+000D (CR)                                                     |
/// | LF               | U+000A (LF)                                                     |
/// | Newline          | U+000B (VT), U+000C (FF)                                        |
/// | Single_Quote     | U+0027 (')                                                      |
/// | Double_Quote     | U+0022 (")                                                      |
/// | MidNumLet        | U+002E (.) FULL STOP                                            |
/// | MidLetter        | U+003A (:) COLON                                                |
/// | MidNum           | U+002C (,), U+003B (;)                                          |
/// | Numeric          | U+0030 - U+0039 (0 ... 9)                                       |
/// | ALetter          | U+0041 - U+005A (A ... Z), U+0061 - U+007A (a ... z)            |
/// | ExtendNumLet     | U+005F (_) underscore                                           |
/// | WSegSpace        | U+0020 (SPACE)                                                  |
///
/// The macro MidNumLetQ boils down to: U+002E (.) FULL STOP and U+0027 (')
/// AHLetter is the same as ALetter, so we don't need to distinguish it.
///
/// Any other single ASCII byte is its own boundary (the default WB999).
///
/// Items are `(byte offset, segment)` pairs. The iterator slices by byte, so the
/// caller is expected to pass ASCII-only input.
#[derive(Debug)]
struct AsciiWordBoundIter<'a> {
    /// The part of the input that has not been yielded yet.
    rest: &'a str,
    /// Byte offset of the start of `rest` within the original input.
    offset: usize,
}

impl<'a> AsciiWordBoundIter<'a> {
    /// Creates an iterator over the segments of `s`, with offsets counted from the
    /// start of `s`.
    pub fn new(s: &'a str) -> Self {
        AsciiWordBoundIter { rest: s, offset: 0 }
    }

    /// Letters, digits and underscore: the bytes that always extend a word.
    #[inline]
    fn is_core(b: u8) -> bool {
        b.is_ascii_alphanumeric() || b == b'_'
    }

    /// Whether `b`, sitting directly between `prev` and `next`, joins them into one word.
    #[inline]
    fn is_infix(b: u8, prev: u8, next: u8) -> bool {
        match b {
            // Numeric separators such as "1,000" or "3.14" (WB11/WB12)
            //
            // "Numeric (MidNum | MidNumLetQ) Numeric"
            b'.' | b',' | b';' | b'\'' if prev.is_ascii_digit() && next.is_ascii_digit() => true,

            // Dot, colon or apostrophe inside an alphabetic word ("example.com", "can't")
            // (WB6/WB7)
            //
            // "AHLetter (MidLetter | MidNumLetQ) AHLetter"
            // MidLetter  = b':'
            // MidNumLetQ = b'.' | b'\''
            b'\'' | b'.' | b':' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => true,
            _ => false,
        }
    }

    /// Removes the first `n` bytes of `rest` and returns them with their offset.
    #[inline]
    fn take_front(&mut self, n: usize) -> (usize, &'a str) {
        let (word, tail) = self.rest.split_at(n);
        let pos = self.offset;
        self.rest = tail;
        self.offset += n;
        (pos, word)
    }

    /// Removes everything from byte `start` to the end of `rest` and returns it
    /// with its offset. `offset` is unchanged because the front did not move.
    #[inline]
    fn take_back(&mut self, start: usize) -> (usize, &'a str) {
        let (head, word) = self.rest.split_at(start);
        self.rest = head;
        (self.offset + start, word)
    }
}

impl<'a> Iterator for AsciiWordBoundIter<'a> {
    type Item = (usize, &'a str);

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        let bytes = self.rest.as_bytes();
        let first = *bytes.first()?;
        let len = bytes.len();

        let seg_len = if first == b' ' {
            // 1) Keep horizontal whitespace together.
            // Spec: WB3d joins adjacent *WSegSpace* into a single segment.
            bytes.iter().position(|&b| b != b' ').unwrap_or(len)
        } else if Self::is_core(first) {
            // 2) Core-run (letters/digits/underscore + infix)
            // Spec: ALetter x ALetter, Numeric x Numeric etc. (WB5-WB13b)
            let mut i = 1;
            while i < len {
                let b = bytes[i];
                // An infix byte needs a successor, so it can never end the input.
                let joins = Self::is_core(b)
                    || (i + 1 < len && Self::is_infix(b, bytes[i - 1], bytes[i + 1]));
                if !joins {
                    break;
                }
                i += 1;
            }
            i
        } else if first == b'\r' && bytes.get(1) == Some(&b'\n') {
            // 3) Do not break within CRLF.
            // Spec: WB3 treats CR+LF as a single non-breaking pair.
            2
        } else {
            // 4) Otherwise, break everywhere
            // Spec: the catch-all rule WB999.
            1
        };

        Some(self.take_front(seg_len))
    }

    /// Every segment is at least one byte long, so a non-empty remainder yields
    /// between one and `len` more items. Mirrors `UWordBounds::size_hint`.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let len = self.rest.len();
        (len.min(1), Some(len))
    }
}

impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> {
    fn next_back(&mut self) -> Option<(usize, &'a str)> {
        let bytes = self.rest.as_bytes();
        let last = *bytes.last()?;
        let len = bytes.len();

        let start = if last == b' ' {
            // 1) Group runs of spaces
            // Spec: WB3d joins adjacent *WSegSpace* into a single segment.
            bytes
                .iter()
                .rposition(|&b| b != b' ')
                .map_or(0, |last_non_space| last_non_space + 1)
        } else if Self::is_core(last) {
            // 2) Trailing Core-run (letters/digits/underscore + infix)
            // Spec: ALetter x ALetter, Numeric x Numeric etc. (WB5-WB13b)
            let mut start = len - 1;
            while start > 0 {
                let b = bytes[start - 1];
                // With nothing to the left of `b`, reuse `b` as `prev`: an infix
                // candidate is neither a letter nor a digit, so the check fails.
                let prev = if start >= 2 { bytes[start - 2] } else { b };
                let next = bytes[start]; // the byte we just included
                if Self::is_core(b) || Self::is_infix(b, prev, next) {
                    start -= 1;
                } else {
                    break;
                }
            }
            start
        } else if last == b'\n' && len >= 2 && bytes[len - 2] == b'\r' {
            // 3) Non-core: CR+LF as one token
            // Spec: WB3 treats CR+LF as a single non-breaking pair.
            len - 2
        } else {
            // 4) Fallback - every other byte is its own segment
            // Spec: the catch-all rule WB999.
            len - 1
        };

        Some(self.take_back(start))
    }
}
941
942#[inline]
943fn ascii_word_ok(t: &(usize, &str)) -> bool {
944 has_ascii_alphanumeric(&t.1)
945}
946#[inline]
947fn unicode_word_ok(t: &(usize, &str)) -> bool {
948 has_alphanumeric(&t.1)
949}
950
// Concrete iterator types behind `UnicodeWords` and `UnicodeWordIndices`.
// The predicates (and the projection in `AsciiWordsIter`) are plain `fn` pointers
// rather than closures, which keeps these types nameable.

// ASCII segments with the offset stripped, filtered down to words.
type AsciiWordsIter<'a> = Filter<
    core::iter::Map<AsciiWordBoundIter<'a>, fn((usize, &'a str)) -> &'a str>,
    fn(&&'a str) -> bool,
>;
// Unicode segments filtered down to words.
type UnicodeWordsIter<'a> = Filter<UWordBounds<'a>, fn(&&'a str) -> bool>;
// ASCII `(offset, segment)` pairs filtered down to words.
type AsciiIndicesIter<'a> = Filter<AsciiWordBoundIter<'a>, fn(&(usize, &'a str)) -> bool>;
// Unicode `(offset, segment)` pairs filtered down to words.
type UnicodeIndicesIter<'a> = Filter<UWordBoundIndices<'a>, fn(&(usize, &'a str)) -> bool>;
958
// Backend of `UnicodeWords`; `new_unicode_words` picks the variant from
// `str::is_ascii` on the input.
#[derive(Debug)]
enum WordsIter<'a> {
    // Fast path for ASCII-only input.
    Ascii(AsciiWordsIter<'a>),
    // General path for everything else.
    Unicode(UnicodeWordsIter<'a>),
}
964
// Backend of `UnicodeWordIndices`; `new_unicode_word_indices` picks the variant
// from `str::is_ascii` on the input.
#[derive(Debug)]
enum IndicesIter<'a> {
    // Fast path for ASCII-only input.
    Ascii(AsciiIndicesIter<'a>),
    // General path for everything else.
    Unicode(UnicodeIndicesIter<'a>),
}
970
971#[inline]
972pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> {
973 let inner = if s.is_ascii() {
974 WordsIter::Ascii(new_unicode_words_ascii(s))
975 } else {
976 WordsIter::Unicode(new_unicode_words_general(s))
977 };
978 UnicodeWords { inner }
979}
980
981#[inline]
982pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> {
983 let inner = if s.is_ascii() {
984 IndicesIter::Ascii(new_ascii_word_bound_indices(s).filter(ascii_word_ok))
985 } else {
986 IndicesIter::Unicode(new_word_bound_indices(s).filter(unicode_word_ok))
987 };
988 UnicodeWordIndices { inner }
989}
990
991#[inline]
992pub fn new_word_bounds(s: &str) -> UWordBounds<'_> {
993 UWordBounds {
994 string: s,
995 cat: None,
996 catb: None,
997 }
998}
999
1000#[inline]
1001pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> {
1002 UWordBoundIndices {
1003 start_offset: s.as_ptr() as usize,
1004 iter: new_word_bounds(s),
1005 }
1006}
1007
1008#[inline]
1009fn new_ascii_word_bound_indices(s: &str) -> AsciiWordBoundIter<'_> {
1010 AsciiWordBoundIter::new(s)
1011}
1012
1013#[inline]
1014fn has_alphanumeric(s: &&str) -> bool {
1015 use crate::tables::util::is_alphanumeric;
1016
1017 s.chars().any(is_alphanumeric)
1018}
1019
/// Returns `true` if `s` contains at least one ASCII letter or digit.
#[inline]
fn has_ascii_alphanumeric(s: &&str) -> bool {
    // Bytes of multi-byte UTF-8 sequences are all >= 0x80, so scanning bytes
    // finds exactly the same ASCII letters/digits as scanning chars.
    s.bytes().any(|b| b.is_ascii_alphanumeric())
}
1024
/// Drops the offset from an `(offset, segment)` pair and returns the segment.
#[inline(always)]
fn strip_pos(pair: (usize, &str)) -> &str {
    pair.1
}
1029
1030#[inline]
1031fn new_unicode_words_ascii<'a>(s: &'a str) -> AsciiWordsIter<'a> {
1032 new_ascii_word_bound_indices(s)
1033 .map(strip_pos as fn(_) -> _)
1034 .filter(has_ascii_alphanumeric)
1035}
1036
1037#[inline]
1038fn new_unicode_words_general<'a>(s: &'a str) -> UnicodeWordsIter<'a> {
1039 new_word_bounds(s).filter(has_alphanumeric)
1040}
1041
// Unit and property tests for the word iterators; the property tests check that
// the ASCII fast path and the general Unicode path agree on ASCII input.
#[cfg(test)]
mod tests {
    use crate::word::{
        new_ascii_word_bound_indices, new_unicode_words_ascii, new_word_bound_indices,
    };
    use std::string::String;
    use std::vec;
    use std::vec::Vec;

    use proptest::prelude::*;

    // U+070F must be classified as ALetter by the word table.
    #[test]
    fn test_syriac_abbr_mark() {
        use crate::tables::word as wd;
        let (_, _, cat) = wd::word_category('\u{70f}');
        assert_eq!(cat, wd::WC_ALetter);
    }

    // U+06DD must be classified as Numeric by the word table.
    #[test]
    fn test_end_of_ayah_cat() {
        use crate::tables::word as wd;
        let (_, _, cat) = wd::word_category('\u{6dd}');
        assert_eq!(cat, wd::WC_Numeric);
    }

    // The ASCII segment iterator yields every segment, punctuation and spaces
    // included, together with its byte offset.
    #[test]
    fn test_ascii_word_bound_indices_various_cases() {
        let s = "Hello, world!";
        let words: Vec<(usize, &str)> = new_ascii_word_bound_indices(s).collect();
        let expected = vec![
            (0, "Hello"), // simple letters
            (5, ","),     // punctuation is its own segment
            (6, " "),     // space after comma
            (7, "world"), // letters up to '!'
            (12, "!"),    // punctuation at the end
        ];
        assert_eq!(words, expected);
    }

    // The ASCII word iterator keeps only segments containing a letter or digit.
    #[test]
    fn test_ascii_word_indices_various_cases() {
        let s = "Hello, world! can't e.g. var1 123,456 foo_bar example.com 127.0.0.1:9090";
        let words: Vec<&str> = new_unicode_words_ascii(s).collect();
        let expected = vec![
            ("Hello"), // simple letters
            ("world"), // comma and space are filtered out
            ("can't"), // apostrophe joins letters
            ("e.g"),   // trailing '.' is not followed by a letter, so it is split off
            ("var1"),
            ("123,456"), // digits+comma+digits
            ("foo_bar"),
            ("example.com"),
            ("127.0.0.1"),
            ("9090"), // port number
        ];
        assert_eq!(words, expected);
    }

    /// Strategy that yields every code-point from NUL (0) to DEL (127).
    fn ascii_char() -> impl Strategy<Value = char> {
        (0u8..=127).prop_map(|b| b as char)
    }

    proptest! {
        #![proptest_config(ProptestConfig::with_cases(10000))]
        /// Fast path must equal general path for any ASCII input.
        #[test]
        fn proptest_ascii_matches_unicode_word_indices(
            // Vec<char> → String, length 0‒99
            s in proptest::collection::vec(ascii_char(), 0..100)
                .prop_map(|v| v.into_iter().collect::<String>())
        ) {
            let fast: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).collect();
            let uni: Vec<(usize, &str)> = new_word_bound_indices(&s).collect();

            prop_assert_eq!(fast, uni);
        }

        /// Fast path must equal general path for any ASCII input when iterated in reverse.
        #[test]
        fn proptest_ascii_matches_unicode_word_indices_rev(
            // Vec<char> → String, length 0‒99
            s in proptest::collection::vec(ascii_char(), 0..100)
                .prop_map(|v| v.into_iter().collect::<String>())
        ) {
            let fast_rev: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).rev().collect();
            let uni_rev : Vec<(usize, &str)> = new_word_bound_indices(&s).rev().collect();
            prop_assert_eq!(fast_rev, uni_rev);
        }
    }
}