use std::collections::BTreeSet;
use crate::{
    Tokens,
    TextToken, Token2, Bound,
    TokenizerOptions,
    TokenizerParams,
    SentenceBreaker,
    Text, TextStr,
    IntoTokenizer,
};

use text_parsing::{
    Breaker, Local, Snip,
    Localize,
};

use std::borrow::Cow;

/// A structural break inside the text buffer.
#[derive(Debug,Clone)]
pub(crate) struct InnerBound {
    /// Byte span of the break within the text buffer.
    pub bytes: Snip,
    /// Char span of the break within the text buffer.
    pub chars: Snip,
    /// Kind of break.
    pub breaker: Breaker,
    /// Location of the break in the original input, if known.
    pub original: Option<Local<()>>,
}

// Tokenizing a borrowed `Text` keeps its originals and breakers borrowed.
impl<'t> IntoTokenizer for &'t Text {
    type IntoTokens = TextTokens<'t>;

    fn into_tokenizer<S: SentenceBreaker>(self, params: TokenizerParams<S>) -> Self::IntoTokens {
        TextTokens::new(
            InnerText {
                buffer: &self.buffer,
                originals: Cow::Borrowed(&self.originals),
            },
            Cow::Borrowed(&self.breakers),
            params)
    }
}
// A `TextStr` owns its originals and breakers, so the tokenizer takes them over.
impl<'t> IntoTokenizer for TextStr<'t> {
    type IntoTokens = TextTokens<'t>;

    fn into_tokenizer<S: SentenceBreaker>(self, params: TokenizerParams<S>) -> Self::IntoTokens {
        TextTokens::new(
            InnerText {
                buffer: self.buffer,
                originals: Cow::Owned(self.originals),
            },
            Cow::Owned(self.breakers),
            params)
    }
}
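
// A minimal usage sketch (not from this file): given some already-built `Text`
// and `TokenizerParams` value, whose constructors are not shown here and are
// assumptions, the borrowed impl above is used like this:
//
//     let tokens: Vec<TextToken> = (&text)
//         .into_tokenizer(params)
//         .collect();
//
// Each yielded `TextToken` carries its locality in the buffer and, where known,
// the corresponding span in the original input.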

/// The text buffer plus the per-char mapping back to the original input.
struct InnerText<'t> {
    buffer: &'t str,
    originals: Cow<'t,Vec<Local<()>>>,
}

pub struct TextTokens<'t> {
    text: InnerText<'t>,

    bounds: BoundEnum<'t>,
    // Byte / char offsets of the fragment currently being tokenized.
    current_offset: usize,
    current_char_offset: usize,
    current_tokens: Option<Tokens<'t>>,

    options: BTreeSet<TokenizerOptions>,
    // Offsets of the fragment that follows the pending bound, plus the bound
    // token (if any) to emit once the current fragment is exhausted.
    next_offset: usize,
    next_char_offset: usize,
    next_bound: Option<TextToken>,
}
// Iterates bounds either borrowed from the precomputed breakers or owned,
// when synthetic sentence bounds had to be merged in.
enum BoundEnum<'t> {
    Iter(std::slice::Iter<'t,InnerBound>),
    IntoIter(std::vec::IntoIter<InnerBound>),
}
impl<'t> Iterator for BoundEnum<'t> {
    type Item = std::borrow::Cow<'t,InnerBound>;
    fn next(&mut self) -> Option<Self::Item> {
        match self {
            BoundEnum::Iter(iter) => iter.next().map(std::borrow::Cow::Borrowed),
            BoundEnum::IntoIter(iter) => iter.next().map(std::borrow::Cow::Owned),
        }
    }
}
impl<'t> TextTokens<'t> {
    /// The full text buffer being tokenized.
    pub fn text(&self) -> &'t str {
        self.text.buffer
    }
}
impl<'t> TextTokens<'t> {
    fn new<'q,S: SentenceBreaker>(text: InnerText<'q>, breakers: Cow<'q,Vec<InnerBound>>, params: TokenizerParams<S>) -> TextTokens<'q> {
        // Build a byte-index -> char-index table for `txt`, with one extra
        // trailing entry for the one-past-the-end position.
        fn btoc(txt: &str) -> Vec<usize> {
            let mut v = Vec::with_capacity(txt.len());
            v.resize(txt.len(),0);
            let mut max = 0;
            for (ci,(bi,c)) in txt.char_indices().enumerate() {
                for i in bi .. bi + c.len_utf8() {
                    v[i] = ci;
                }
                max = ci;
            }
            v.push(max+1);
            v
        }
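        // For example (a sketch, not part of the original code): for "aé" the
        // table is [0, 1, 1, 2]: byte 0 is char 0, the two UTF-8 bytes of 'é'
        // are char 1, and the trailing entry 2 marks one past the end.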

        // With `WithSentences` enabled, run the sentence breaker over each
        // stretch of text between structural breakers and insert zero-length
        // `Breaker::Sentence` bounds at the detected sentence ends.
        let mut bounds = None;
        if params.options.contains(&TokenizerOptions::WithSentences) {
            let mut new_b = Vec::new();
            let mut offset = 0;
            let mut char_offset = 0;
            let cnt = breakers.len();
            for ib in breakers.iter() {
                let InnerBound{ bytes, chars, breaker, original: _ } = ib;
                if bytes.offset < offset { continue; }
                match breaker {
                    Breaker::None | Breaker::Space | Breaker::Line | Breaker::Word => {},
                    Breaker::Sentence | Breaker::Paragraph | Breaker::Section => {
                        let txt = &text.buffer[offset .. bytes.offset];
                        let btoc = btoc(txt);
                        for snip in params.sentence_breaker.break_text(txt) {
                            if text.buffer[offset + snip.offset .. offset + snip.offset + snip.length].trim().len() > 0 {
                                new_b.push(InnerBound {
                                    bytes: Snip{ offset: offset + snip.offset + snip.length, length: 0 },
                                    chars: Snip{ offset: char_offset + btoc[snip.offset + snip.length], length: 0 },
                                    breaker: Breaker::Sentence,
                                    original: None,
                                });
                            }
                        }
                        // The structural breaker itself closes the last sentence,
                        // so drop the bound that was just pushed for it.
                        new_b.pop();
                    },
                }
                new_b.push(ib.clone());
                offset = bytes.offset + bytes.length;
                char_offset = chars.offset + chars.length;
            }
            // Handle the trailing text after the last structural breaker.
            let txt = &text.buffer[offset ..];
            let btoc = btoc(txt);
            for snip in params.sentence_breaker.break_text(txt) {
                if text.buffer[offset + snip.offset .. offset + snip.offset + snip.length].trim().len() > 0 {
                    new_b.push(InnerBound {
                        bytes: Snip{ offset: offset + snip.offset + snip.length, length: 0 },
                        chars: Snip{ offset: char_offset + btoc[snip.offset + snip.length], length: 0 },
                        breaker: Breaker::Sentence,
                        original: None,
                    });
                }
            }
            // The end of the text closes the last sentence, so the final bound
            // is dropped; keep the merged list only if it actually grew.
            new_b.pop();
            if new_b.len() > cnt {
                bounds = Some(BoundEnum::IntoIter(new_b.into_iter()));
            }
        }
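        // Sketch of the pass above (assuming, for illustration, a sentence
        // breaker that splits on "."): for a buffer "One. Two.<P>Three." with a
        // single Paragraph breaker at <P>, the merged list ends up as
        // [Sentence after "One.", Paragraph at <P>]; the bound after "Two." is
        // popped because the paragraph breaker already closes that stretch, and
        // the bound after "Three." is popped because it would sit at the very
        // end of the text.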
        // Fall back to the precomputed breakers when no sentence bounds were added.
        let bounds = match bounds {
            Some(b) => b,
            None => match breakers {
                Cow::Owned(b) => BoundEnum::IntoIter(b.into_iter()),
                Cow::Borrowed(b) => BoundEnum::Iter(b.iter()),
            },
        };
        TextTokens {
            text,
            bounds,
            current_offset: 0,
            current_char_offset: 0,
            current_tokens: None,
            options: params.options,

            next_offset: 0,
            next_char_offset: 0,
            next_bound: None,
        }
    }
}
impl<'t> Iterator for TextTokens<'t> {
    type Item = TextToken;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            match &mut self.current_tokens {
                // A fragment is being tokenized: shift each token's locality by
                // the fragment offsets and attach its original-text span.
                Some(tokens) => match tokens.next() {
                    Some(local_token) => {
                        let (local,token) = local_token.into_inner();
                        let local = local.with_shift(self.current_char_offset, self.current_offset);
                        let Snip { offset: first, length: len } = local.chars();
                        if len > 0 {
                            let last = first + len - 1;
                            let original = match len == 1 {
                                false => match Local::from_segment(self.text.originals[first],self.text.originals[last]) {
                                    Ok(loc) => loc,
                                    Err(_) => continue,
                                },
                                true => self.text.originals[first],
                            };
                            break Some(TextToken {
                                locality: local,
                                original: Some(original),
                                token: token.into(),
                            });
                        }
                    },
                    // The fragment is exhausted: advance past it and emit the
                    // pending bound token, if there is one.
                    None => {
                        self.current_tokens = None;
                        self.current_offset = self.next_offset;
                        self.current_char_offset = self.next_char_offset;
                        if let Some(tok) = self.next_bound.take() {
                            break Some(tok);
                        }
                    },
                },
                // No fragment in flight: pull the next bound, tokenize the text
                // up to it, and remember the bound token to emit afterwards.
                None => {
                    let (txt,next_offset,opt_bound) = match self.bounds.next() {
                        Some(ib) => {
                            let InnerBound{ bytes, chars, breaker, original } = ib.as_ref();
                            if bytes.offset < self.current_offset { continue; }
                            let txt = &self.text.buffer[self.current_offset .. bytes.offset];
                            let next_offset = bytes.offset + bytes.length;
                            let next_char_offset = chars.offset + chars.length;
                            let opt_bound = match match breaker {
                                Breaker::None | Breaker::Space | Breaker::Line | Breaker::Word => None,
                                Breaker::Sentence => Some(Bound::Sentence),
                                Breaker::Paragraph => Some(Bound::Paragraph),
                                Breaker::Section => Some(Bound::Section),
                            } {
                                Some(bound) => Some(TextToken {
                                    locality: ().localize(*chars,*bytes),
                                    original: *original,
                                    token: Token2::Bound(bound),
                                }),
                                None => None,
                            };
                            (txt,(next_offset,next_char_offset),opt_bound)
                        },
                        // No bounds left: tokenize whatever remains of the buffer.
                        None => match self.current_offset < self.text.buffer.len() {
                            true => {
                                let txt = &self.text.buffer[self.current_offset .. ];
                                let next_offset = self.text.buffer.len();
                                let next_char_offset = self.text.originals.len();
                                let opt_bound = None;
                                (txt,(next_offset,next_char_offset),opt_bound)
                            },
                            false => break None,
                        },
                    };
                    self.next_offset = next_offset.0;
                    self.next_char_offset = next_offset.1;
                    self.next_bound = opt_bound;
                    self.current_tokens = Some(Tokens::new(txt,&self.options));
                },
            }
        }
    }
}