text_tokenizer/
text_tokens.rs

1use crate::{
2    Bound, IntoTokenizer, SentenceBreaker, Text, TextLocality, TextStr, TextToken, Token2,
3    TokenizerOptions, TokenizerParams, Tokens,
4};
5use std::collections::BTreeSet;
6
7use text_parsing::{Breaker, Local, Localize, Snip};
8
9use std::sync::Arc;
10
/// A single bound (breaker) located inside a text buffer.
///
/// The same position is stored twice: once measured in bytes and once
/// measured in chars, so both offsets can be advanced together.
#[derive(Debug, Clone, Copy)]
pub(crate) struct InnerBound {
    /// Span of the bound in bytes; its `offset` is used to slice the text buffer.
    pub bytes: Snip,
    /// Span of the same bound measured in chars.
    pub chars: Snip,
    /// Kind of the bound (space, word, sentence, paragraph, ...).
    pub breaker: Breaker,
    /// Copied into the `original` field of the emitted bound token; `None`
    /// for sentence bounds synthesized by the tokenizer.
    /// NOTE(review): presumably the location in the original (pre-processing)
    /// source -- confirm against `Text`.
    pub original: Option<Local<()>>,
}
18
19impl<'t> IntoTokenizer for &'t Text {
20    type IntoTokens = TextTokens<'t>;
21
22    fn into_tokenizer<S: SentenceBreaker>(self, params: TokenizerParams<S>) -> Self::IntoTokens {
23        TextTokens::new(
24            InnerText {
25                buffer: &self.buffer,
26                localities: self.localities.clone(),
27            },
28            self.breakers.clone(),
29            params,
30        )
31    }
32}
33impl<'t> IntoTokenizer for TextStr<'t> {
34    type IntoTokens = TextTokens<'t>;
35
36    fn into_tokenizer<S: SentenceBreaker>(self, params: TokenizerParams<S>) -> Self::IntoTokens {
37        TextTokens::new(
38            InnerText {
39                buffer: self.buffer,
40                localities: self.localities.clone(),
41            },
42            self.breakers.clone(),
43            params,
44        )
45    }
46}
47
/// The text data a `TextTokens` iterator works on.
struct InnerText<'t> {
    /// The complete text; token and bound byte offsets index into it.
    buffer: &'t str,
    /// Shared table indexed by char position; the `original` field of an
    /// entry is used to build the original location of emitted tokens.
    localities: Arc<Vec<TextLocality>>,
}
52
/// Iterator yielding the `TextToken`s of a text.
///
/// The text is processed chunk by chunk: a chunk is the text between two
/// consecutive bounds and is tokenized by an inner `Tokens` iterator.
pub struct TextTokens<'t> {
    /// The text being tokenized and its per-char localities.
    text: InnerText<'t>,

    /// Remaining bounds that delimit the chunks.
    bounds: BoundEnum,
    /// Byte offset in the buffer at which the current chunk starts.
    current_offset: usize,
    /// Char offset at which the current chunk starts.
    current_char_offset: usize,
    /// Tokenizer of the current chunk; `None` when the next chunk still has
    /// to be fetched.
    current_tokens: Option<Tokens<'t>>,

    /// Options handed to every per-chunk `Tokens` tokenizer.
    options: BTreeSet<TokenizerOptions>,
    /// Byte offset that becomes `current_offset` once the current chunk is exhausted.
    next_offset: usize,
    /// Char offset that becomes `current_char_offset` once the current chunk is exhausted.
    next_char_offset: usize,
    /// Bound token to emit right after the current chunk is exhausted, if any.
    next_bound: Option<TextToken>,
}
66enum BoundEnum {
67    IntoIter(std::vec::IntoIter<InnerBound>),
68    Iter {
69        next: usize,
70        vec: Arc<Vec<InnerBound>>,
71    },
72}
73impl Iterator for BoundEnum {
74    type Item = InnerBound;
75    fn next(&mut self) -> Option<Self::Item> {
76        match self {
77            BoundEnum::Iter { next, vec } => vec.get(*next).map(|ib| {
78                *next += 1;
79                *ib
80            }),
81            /*match vec.get(next) {
82                Some(ib) => {
83                    *next += 1;
84                    Some(std::borrow::Cow::Borrowed(ib))
85                }
86                None => None,
87            },*/
88            BoundEnum::IntoIter(iter) => iter.next(),
89        }
90    }
91}
92impl<'t> TextTokens<'t> {
93    pub fn text(&self) -> &'t str {
94        self.text.buffer
95    }
96}
97impl<'t> TextTokens<'t> {
98    fn new<'q, S: SentenceBreaker>(
99        text: InnerText<'q>,
100        breakers: Arc<Vec<InnerBound>>,
101        params: TokenizerParams<S>,
102    ) -> TextTokens<'q> {
103        fn btoc(txt: &str) -> Vec<usize> {
104            let mut v = Vec::with_capacity(txt.len());
105            v.resize(txt.len(), 0);
106            let mut max = 0;
107            for (ci, (bi, c)) in txt.char_indices().enumerate() {
108                for i in bi..bi + c.len_utf8() {
109                    v[i] = ci;
110                }
111                max = ci;
112            }
113            v.push(max + 1);
114            v
115        }
116
117        let mut bounds = None;
118        if params.options.contains(&TokenizerOptions::WithSentences) {
119            let mut new_b = Vec::new();
120            let mut offset = 0;
121            let mut char_offset = 0;
122            let cnt = breakers.len();
123            for ib in breakers.iter() {
124                let InnerBound {
125                    bytes,
126                    chars,
127                    breaker,
128                    original: _,
129                } = ib;
130                if bytes.offset < offset {
131                    continue;
132                }
133                match breaker {
134                    Breaker::None | Breaker::Space | Breaker::Line | Breaker::Word => {}
135                    Breaker::Sentence | Breaker::Paragraph | Breaker::Section => {
136                        let txt = &text.buffer[offset..bytes.offset];
137                        //println!("{}",txt);
138                        let btoc = btoc(txt);
139                        for snip in params.sentence_breaker.break_text(txt) {
140                            //println!("{:?} -> {}",snip, offset + snip.offset);
141                            if text.buffer[offset + snip.offset..offset + snip.offset + snip.length]
142                                .trim()
143                                .len()
144                                > 0
145                            {
146                                new_b.push(InnerBound {
147                                    bytes: Snip {
148                                        offset: offset + snip.offset + snip.length,
149                                        length: 0,
150                                    },
151                                    chars: Snip {
152                                        offset: char_offset + btoc[snip.offset + snip.length],
153                                        length: 0,
154                                    },
155                                    breaker: Breaker::Sentence,
156                                    original: None,
157                                });
158                            }
159                        }
160                        new_b.pop(); // remove last sentence breaker
161                    }
162                }
163                new_b.push(ib.clone());
164                offset = bytes.offset + bytes.length;
165                char_offset = chars.offset + chars.length;
166                //println!("");
167            }
168            let txt = &text.buffer[offset..];
169            for snip in params.sentence_breaker.break_text(txt) {
170                //println!("{}",txt);
171                //println!("{:?} -> {}",snip,offset + snip.offset);
172                if text.buffer[offset + snip.offset..offset + snip.offset + snip.length]
173                    .trim()
174                    .len()
175                    > 0
176                {
177                    let btoc = btoc(txt);
178                    new_b.push(InnerBound {
179                        bytes: Snip {
180                            offset: offset + snip.offset + snip.length,
181                            length: 0,
182                        },
183                        chars: Snip {
184                            offset: char_offset + btoc[snip.offset + snip.length],
185                            length: 0,
186                        },
187                        breaker: Breaker::Sentence,
188                        original: None,
189                    });
190                }
191            }
192            new_b.pop(); // remove last sentence breaker
193            if new_b.len() > cnt {
194                bounds = Some(BoundEnum::IntoIter(new_b.into_iter()));
195            }
196        }
197        let bounds = match bounds {
198            Some(b) => b,
199            None => BoundEnum::Iter {
200                next: 0,
201                vec: breakers,
202            },
203        };
204        TextTokens {
205            text,
206            bounds,
207            current_offset: 0,
208            current_char_offset: 0,
209            current_tokens: None,
210            options: params.options,
211
212            next_offset: 0,
213            next_char_offset: 0,
214            next_bound: None,
215        }
216    }
217}
218impl<'t> Iterator for TextTokens<'t> {
219    type Item = TextToken;
220
221    fn next(&mut self) -> Option<Self::Item> {
222        loop {
223            match &mut self.current_tokens {
224                Some(tokens) => match tokens.next() {
225                    Some(local_token) => {
226                        let (local, token) = local_token.into_inner();
227                        let local = local.with_shift(self.current_char_offset, self.current_offset);
228                        let Snip {
229                            offset: first,
230                            length: len,
231                        } = local.chars();
232                        if len > 0 {
233                            let last = first + len - 1;
234                            let original = match len == 1 {
235                                false => match Local::from_segment(
236                                    self.text.localities[first].original,
237                                    self.text.localities[last].original,
238                                ) {
239                                    Ok(loc) => loc,
240                                    Err(_) => continue,
241                                },
242                                true => self.text.localities[first].original,
243                            };
244                            break Some(TextToken {
245                                locality: local,
246                                original: Some(original),
247                                token: token.into(),
248                            });
249                        }
250                    }
251                    None => {
252                        self.current_tokens = None;
253                        self.current_offset = self.next_offset;
254                        self.current_char_offset = self.next_char_offset;
255                        if let Some(tok) = self.next_bound.take() {
256                            break Some(tok);
257                        }
258                    }
259                },
260                None => {
261                    let (txt, next_offset, opt_bound) = match self.bounds.next() {
262                        Some(ib) => {
263                            let InnerBound {
264                                bytes,
265                                chars,
266                                breaker,
267                                original,
268                            } = &ib;
269                            if bytes.offset < self.current_offset {
270                                continue;
271                            }
272                            let txt = &self.text.buffer[self.current_offset..bytes.offset];
273                            let next_offset = bytes.offset + bytes.length;
274                            let next_char_offset = chars.offset + chars.length;
275                            let opt_bound = match match breaker {
276                                Breaker::None | Breaker::Space | Breaker::Line | Breaker::Word => {
277                                    None
278                                }
279                                Breaker::Sentence => Some(Bound::Sentence),
280                                Breaker::Paragraph => Some(Bound::Paragraph),
281                                Breaker::Section => Some(Bound::Section),
282                            } {
283                                Some(bound) => Some(TextToken {
284                                    locality: ().localize(*chars, *bytes),
285                                    original: *original,
286                                    token: Token2::Bound(bound),
287                                }),
288                                None => None,
289                            };
290                            (txt, (next_offset, next_char_offset), opt_bound)
291                        }
292                        None => match self.current_offset < self.text.buffer.len() {
293                            true => {
294                                let txt = &self.text.buffer[self.current_offset..];
295                                let next_offset = self.text.buffer.len();
296                                let next_char_offset = self.text.localities.len();
297                                let opt_bound = None;
298                                (txt, (next_offset, next_char_offset), opt_bound)
299                            }
300                            false => break None,
301                        },
302                    };
303                    self.next_offset = next_offset.0;
304                    self.next_char_offset = next_offset.1;
305                    self.next_bound = opt_bound;
306                    self.current_tokens = Some(Tokens::new(txt, &self.options));
307                }
308            }
309        }
310    }
311}