1use crate::{
2 Bound, IntoTokenizer, SentenceBreaker, Text, TextLocality, TextStr, TextToken, Token2,
3 TokenizerOptions, TokenizerParams, Tokens,
4};
5use std::collections::BTreeSet;
6
7use text_parsing::{Breaker, Local, Localize, Snip};
8
9use std::sync::Arc;
10
/// A precomputed break position inside the flattened text buffer.
///
/// Both spans address the same position: `bytes` in byte offsets of the
/// buffer, `chars` in character offsets (they are applied in tandem in
/// `TextTokens::new` / `next`).
#[derive(Debug, Clone, Copy)]
pub(crate) struct InnerBound {
    /// Byte span of the break in the flattened buffer.
    pub bytes: Snip,
    /// Character span corresponding to `bytes`.
    pub chars: Snip,
    /// Kind of break (sentence, paragraph, section, ...).
    pub breaker: Breaker,
    /// Location of the break in the original input, when known.
    /// NOTE(review): presumably the pre-flattening source position — confirm
    /// against the code that builds these bounds.
    pub original: Option<Local<()>>,
}
18
19impl<'t> IntoTokenizer for &'t Text {
20 type IntoTokens = TextTokens<'t>;
21
22 fn into_tokenizer<S: SentenceBreaker>(self, params: TokenizerParams<S>) -> Self::IntoTokens {
23 TextTokens::new(
24 InnerText {
25 buffer: &self.buffer,
26 localities: self.localities.clone(),
27 },
28 self.breakers.clone(),
29 params,
30 )
31 }
32}
33impl<'t> IntoTokenizer for TextStr<'t> {
34 type IntoTokens = TextTokens<'t>;
35
36 fn into_tokenizer<S: SentenceBreaker>(self, params: TokenizerParams<S>) -> Self::IntoTokens {
37 TextTokens::new(
38 InnerText {
39 buffer: self.buffer,
40 localities: self.localities.clone(),
41 },
42 self.breakers.clone(),
43 params,
44 )
45 }
46}
47
/// Borrowed view of a text: the flattened string plus per-character locality
/// data (indexed by char offset in `TextTokens::next`; shared via `Arc` so
/// constructing a tokenizer does not copy the table).
struct InnerText<'t> {
    buffer: &'t str,
    // Indexed by char offset of `buffer`; its `len()` is used as the total
    // char count in `TextTokens::next`.
    localities: Arc<Vec<TextLocality>>,
}
52
/// Iterator yielding `TextToken`s: the buffer is split into segments at the
/// positions produced by `bounds`, each segment is tokenized with `Tokens`,
/// and a boundary token is emitted between segments where applicable.
pub struct TextTokens<'t> {
    text: InnerText<'t>,

    // Source of break positions (owned, sentence-augmented list or a cursor
    // over the shared precomputed one).
    bounds: BoundEnum,
    // Byte offset in `text.buffer` where the current segment starts.
    current_offset: usize,
    // Char offset matching `current_offset`.
    current_char_offset: usize,
    // Tokenizer of the current segment; `None` means the next segment must be
    // pulled from `bounds`.
    current_tokens: Option<Tokens<'t>>,

    options: BTreeSet<TokenizerOptions>,
    // Offsets to adopt once `current_tokens` is exhausted.
    next_offset: usize,
    next_char_offset: usize,
    // Boundary token to emit right after the current segment's tokens.
    next_bound: Option<TextToken>,
}
// Break-position source: an owned list (built when sentence splitting added
// new bounds) or a non-consuming cursor over the shared precomputed vector.
enum BoundEnum {
    IntoIter(std::vec::IntoIter<InnerBound>),
    Iter {
        // Index of the next element to yield from `vec`.
        next: usize,
        vec: Arc<Vec<InnerBound>>,
    },
}
73impl Iterator for BoundEnum {
74 type Item = InnerBound;
75 fn next(&mut self) -> Option<Self::Item> {
76 match self {
77 BoundEnum::Iter { next, vec } => vec.get(*next).map(|ib| {
78 *next += 1;
79 *ib
80 }),
81 BoundEnum::IntoIter(iter) => iter.next(),
89 }
90 }
91}
impl<'t> TextTokens<'t> {
    /// Returns the flattened text buffer this iterator tokenizes.
    pub fn text(&self) -> &'t str {
        self.text.buffer
    }
}
97impl<'t> TextTokens<'t> {
98 fn new<'q, S: SentenceBreaker>(
99 text: InnerText<'q>,
100 breakers: Arc<Vec<InnerBound>>,
101 params: TokenizerParams<S>,
102 ) -> TextTokens<'q> {
103 fn btoc(txt: &str) -> Vec<usize> {
104 let mut v = Vec::with_capacity(txt.len());
105 v.resize(txt.len(), 0);
106 let mut max = 0;
107 for (ci, (bi, c)) in txt.char_indices().enumerate() {
108 for i in bi..bi + c.len_utf8() {
109 v[i] = ci;
110 }
111 max = ci;
112 }
113 v.push(max + 1);
114 v
115 }
116
117 let mut bounds = None;
118 if params.options.contains(&TokenizerOptions::WithSentences) {
119 let mut new_b = Vec::new();
120 let mut offset = 0;
121 let mut char_offset = 0;
122 let cnt = breakers.len();
123 for ib in breakers.iter() {
124 let InnerBound {
125 bytes,
126 chars,
127 breaker,
128 original: _,
129 } = ib;
130 if bytes.offset < offset {
131 continue;
132 }
133 match breaker {
134 Breaker::None | Breaker::Space | Breaker::Line | Breaker::Word => {}
135 Breaker::Sentence | Breaker::Paragraph | Breaker::Section => {
136 let txt = &text.buffer[offset..bytes.offset];
137 let btoc = btoc(txt);
139 for snip in params.sentence_breaker.break_text(txt) {
140 if text.buffer[offset + snip.offset..offset + snip.offset + snip.length]
142 .trim()
143 .len()
144 > 0
145 {
146 new_b.push(InnerBound {
147 bytes: Snip {
148 offset: offset + snip.offset + snip.length,
149 length: 0,
150 },
151 chars: Snip {
152 offset: char_offset + btoc[snip.offset + snip.length],
153 length: 0,
154 },
155 breaker: Breaker::Sentence,
156 original: None,
157 });
158 }
159 }
160 new_b.pop(); }
162 }
163 new_b.push(ib.clone());
164 offset = bytes.offset + bytes.length;
165 char_offset = chars.offset + chars.length;
166 }
168 let txt = &text.buffer[offset..];
169 for snip in params.sentence_breaker.break_text(txt) {
170 if text.buffer[offset + snip.offset..offset + snip.offset + snip.length]
173 .trim()
174 .len()
175 > 0
176 {
177 let btoc = btoc(txt);
178 new_b.push(InnerBound {
179 bytes: Snip {
180 offset: offset + snip.offset + snip.length,
181 length: 0,
182 },
183 chars: Snip {
184 offset: char_offset + btoc[snip.offset + snip.length],
185 length: 0,
186 },
187 breaker: Breaker::Sentence,
188 original: None,
189 });
190 }
191 }
192 new_b.pop(); if new_b.len() > cnt {
194 bounds = Some(BoundEnum::IntoIter(new_b.into_iter()));
195 }
196 }
197 let bounds = match bounds {
198 Some(b) => b,
199 None => BoundEnum::Iter {
200 next: 0,
201 vec: breakers,
202 },
203 };
204 TextTokens {
205 text,
206 bounds,
207 current_offset: 0,
208 current_char_offset: 0,
209 current_tokens: None,
210 options: params.options,
211
212 next_offset: 0,
213 next_char_offset: 0,
214 next_bound: None,
215 }
216 }
217}
impl<'t> Iterator for TextTokens<'t> {
    type Item = TextToken;

    /// Drives a small state machine:
    /// - while a segment tokenizer is active, shift each of its tokens into
    ///   whole-buffer coordinates and attach the original-source locality;
    /// - when it runs dry, advance the offsets and emit the pending boundary
    ///   token, if any;
    /// - with no active tokenizer, pull the next bound and start tokenizing
    ///   the segment that precedes it (or the remaining tail of the buffer).
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            match &mut self.current_tokens {
                Some(tokens) => match tokens.next() {
                    Some(local_token) => {
                        let (local, token) = local_token.into_inner();
                        // Rebase segment-relative coordinates onto the buffer.
                        let local = local.with_shift(self.current_char_offset, self.current_offset);
                        let Snip {
                            offset: first,
                            length: len,
                        } = local.chars();
                        // Zero-length tokens carry no locality; skip them.
                        if len > 0 {
                            let last = first + len - 1;
                            let original = match len == 1 {
                                // Multi-char token: merge the localities of its
                                // first and last chars; tokens whose endpoints
                                // cannot form a segment are dropped entirely.
                                false => match Local::from_segment(
                                    self.text.localities[first].original,
                                    self.text.localities[last].original,
                                ) {
                                    Ok(loc) => loc,
                                    Err(_) => continue,
                                },
                                true => self.text.localities[first].original,
                            };
                            break Some(TextToken {
                                locality: local,
                                original: Some(original),
                                token: token.into(),
                            });
                        }
                    }
                    None => {
                        // Segment exhausted: adopt the offsets of the next
                        // segment and emit the deferred boundary token, if any.
                        self.current_tokens = None;
                        self.current_offset = self.next_offset;
                        self.current_char_offset = self.next_char_offset;
                        if let Some(tok) = self.next_bound.take() {
                            break Some(tok);
                        }
                    }
                },
                None => {
                    let (txt, next_offset, opt_bound) = match self.bounds.next() {
                        Some(ib) => {
                            let InnerBound {
                                bytes,
                                chars,
                                breaker,
                                original,
                            } = &ib;
                            // Skip bounds inside the segment just consumed.
                            if bytes.offset < self.current_offset {
                                continue;
                            }
                            // Segment runs from the current position up to
                            // this bound; offsets resume after the bound.
                            let txt = &self.text.buffer[self.current_offset..bytes.offset];
                            let next_offset = bytes.offset + bytes.length;
                            let next_char_offset = chars.offset + chars.length;
                            // Only sentence/paragraph/section breaks surface
                            // as boundary tokens; the rest split silently.
                            let opt_bound = match match breaker {
                                Breaker::None | Breaker::Space | Breaker::Line | Breaker::Word => {
                                    None
                                }
                                Breaker::Sentence => Some(Bound::Sentence),
                                Breaker::Paragraph => Some(Bound::Paragraph),
                                Breaker::Section => Some(Bound::Section),
                            } {
                                Some(bound) => Some(TextToken {
                                    locality: ().localize(*chars, *bytes),
                                    original: *original,
                                    token: Token2::Bound(bound),
                                }),
                                None => None,
                            };
                            (txt, (next_offset, next_char_offset), opt_bound)
                        }
                        // No bounds left: tokenize the remaining tail, or stop
                        // when the whole buffer has been consumed.
                        None => match self.current_offset < self.text.buffer.len() {
                            true => {
                                let txt = &self.text.buffer[self.current_offset..];
                                let next_offset = self.text.buffer.len();
                                let next_char_offset = self.text.localities.len();
                                let opt_bound = None;
                                (txt, (next_offset, next_char_offset), opt_bound)
                            }
                            false => break None,
                        },
                    };
                    self.next_offset = next_offset.0;
                    self.next_char_offset = next_offset.1;
                    self.next_bound = opt_bound;
                    self.current_tokens = Some(Tokens::new(txt, &self.options));
                }
            }
        }
    }
}