text_tokenizer/
breakers.rs1use text_parsing::{
2 Snip,
3};
4use unicode_segmentation::{
5 UnicodeSegmentation,
6 USentenceBoundIndices,
7};
8
9
10pub trait SentenceBreaker {
11 type Iter<'t>: Iterator<Item = Snip>;
12
13 fn break_text<'t>(&self, text: &'t str) -> Self::Iter<'t>;
14}
15
16impl SentenceBreaker for () {
17 type Iter<'t> = std::iter::Empty<Snip>;
18 fn break_text<'t>(&self, _text: &'t str) -> Self::Iter<'t> {
19 std::iter::empty()
20 }
21}
22
/// Sentence breaker backed by `unicode_segmentation`'s sentence-boundary
/// iterator (see its `SentenceBreaker` implementation in this file).
///
/// The type carries no state, so it derives the common cheap traits: it can
/// be printed for diagnostics, copied freely and built with `Default`.
#[derive(Debug, Clone, Copy, Default)]
pub struct UnicodeSentenceBreaker;
24
25impl SentenceBreaker for UnicodeSentenceBreaker {
26 type Iter<'t> = UnicodeIter<'t>;
27
28 fn break_text<'t>(&self, text: &'t str) -> Self::Iter<'t> {
29 UnicodeIter {
30 iter: text.split_sentence_bound_indices(),
31 }
32 }
33}
34
35pub struct UnicodeIter<'t> {
36 iter: USentenceBoundIndices<'t>,
37}
38impl<'t> Iterator for UnicodeIter<'t> {
39 type Item = Snip;
40 fn next(&mut self) -> Option<Snip> {
41 self.iter.next().map(|(offset,s)| Snip{ offset, length: s.len() } )
42 }
43}