text_tokenizer/
breakers.rs

1use text_parsing::{
2    Snip,
3};
4use unicode_segmentation::{
5    UnicodeSegmentation,
6    USentenceBoundIndices,
7};
8
9
10pub trait SentenceBreaker {
11    type Iter<'t>: Iterator<Item = Snip>;
12
13    fn break_text<'t>(&self, text: &'t str) -> Self::Iter<'t>;
14}
15
16impl SentenceBreaker for () {
17    type Iter<'t> = std::iter::Empty<Snip>;
18    fn break_text<'t>(&self, _text: &'t str) -> Self::Iter<'t> {
19        std::iter::empty()
20    }
21}
22
23pub struct UnicodeSentenceBreaker;
24
25impl SentenceBreaker for UnicodeSentenceBreaker {
26    type Iter<'t> = UnicodeIter<'t>;
27    
28    fn break_text<'t>(&self, text: &'t str) -> Self::Iter<'t> {
29        UnicodeIter {
30            iter: text.split_sentence_bound_indices(),
31        }
32    }
33}
34
35pub struct UnicodeIter<'t> {
36    iter: USentenceBoundIndices<'t>,
37}
38impl<'t> Iterator for UnicodeIter<'t> {
39    type Item = Snip;
40    fn next(&mut self) -> Option<Snip> {
41        self.iter.next().map(|(offset,s)| Snip{ offset, length: s.len() } )
42    }
43}