text_tokenizer/
breakers.rs1use text_parsing::Snip;
2use unicode_segmentation::{USentenceBoundIndices, UnicodeSegmentation};
3
4pub trait SentenceBreaker {
5 type Iter<'t>: Iterator<Item = Snip>;
6
7 fn break_text<'t>(&self, text: &'t str) -> Self::Iter<'t>;
8}
9
10impl SentenceBreaker for () {
11 type Iter<'t> = std::iter::Empty<Snip>;
12 fn break_text<'t>(&self, _text: &'t str) -> Self::Iter<'t> {
13 std::iter::empty()
14 }
15}
16
/// Stateless sentence breaker built on `unicode_segmentation`'s
/// sentence-boundary iteration.
///
/// The type carries no data, so it is free to copy, compare by debug output,
/// and create through [`Default`].
#[derive(Debug, Clone, Copy, Default)]
pub struct UnicodeSentenceBreaker;
18
19impl SentenceBreaker for UnicodeSentenceBreaker {
20 type Iter<'t> = UnicodeIter<'t>;
21
22 fn break_text<'t>(&self, text: &'t str) -> Self::Iter<'t> {
23 UnicodeIter {
24 iter: text.split_sentence_bound_indices(),
25 }
26 }
27}
28
29pub struct UnicodeIter<'t> {
30 iter: USentenceBoundIndices<'t>,
31}
32impl<'t> Iterator for UnicodeIter<'t> {
33 type Item = Snip;
34 fn next(&mut self) -> Option<Snip> {
35 self.iter.next().map(|(offset, s)| Snip {
36 offset,
37 length: s.len(),
38 })
39 }
40}