text_tokenizer/
breakers.rs

1use text_parsing::Snip;
2use unicode_segmentation::{USentenceBoundIndices, UnicodeSegmentation};
3
4pub trait SentenceBreaker {
5    type Iter<'t>: Iterator<Item = Snip>;
6
7    fn break_text<'t>(&self, text: &'t str) -> Self::Iter<'t>;
8}
9
10impl SentenceBreaker for () {
11    type Iter<'t> = std::iter::Empty<Snip>;
12    fn break_text<'t>(&self, _text: &'t str) -> Self::Iter<'t> {
13        std::iter::empty()
14    }
15}
16
/// Sentence breaker backed by the `unicode_segmentation` crate.
///
/// This is a stateless, zero-sized marker type; all the work happens in its
/// [`SentenceBreaker`] implementation. The standard derives are provided so
/// the type can be debug-printed, copied freely, and built via `Default`.
#[derive(Debug, Clone, Copy, Default)]
pub struct UnicodeSentenceBreaker;
18
19impl SentenceBreaker for UnicodeSentenceBreaker {
20    type Iter<'t> = UnicodeIter<'t>;
21
22    fn break_text<'t>(&self, text: &'t str) -> Self::Iter<'t> {
23        UnicodeIter {
24            iter: text.split_sentence_bound_indices(),
25        }
26    }
27}
28
29pub struct UnicodeIter<'t> {
30    iter: USentenceBoundIndices<'t>,
31}
32impl<'t> Iterator for UnicodeIter<'t> {
33    type Item = Snip;
34    fn next(&mut self) -> Option<Snip> {
35        self.iter.next().map(|(offset, s)| Snip {
36            offset,
37            length: s.len(),
38        })
39    }
40}