// text_tokenizer/tokens.rs
1use std::collections::{BTreeSet, VecDeque};
2use std::str::FromStr;
3use unicode_properties::{GeneralCategory, GeneralCategoryGroup, UnicodeGeneralCategory};
4
5use text_parsing::Local;
6
7use crate::{
8 EMOJIMAP, Formatter, IntoTokenizer, Number, Numerical, SentenceBreaker, Separator, Special,
9 Struct, Token, TokenizerOptions, TokenizerParams, Unicode, Word,
10 wordbreaker::{BasicToken, WordBreaker, one_char_word},
11};
12
13impl<'t> IntoTokenizer for &'t str {
14 type IntoTokens = Tokens<'t>;
15
16 fn into_tokenizer<S: SentenceBreaker>(self, params: TokenizerParams<S>) -> Self::IntoTokens {
17 Tokens::new(self, ¶ms.options)
18 }
19}
20
21impl<'t> Iterator for Tokens<'t> {
22 type Item = Local<Token>;
23
24 fn next(&mut self) -> Option<Self::Item> {
25 loop {
26 if self.buffer.len() > 0 {
27 return self.next_from_buffer();
28 } else {
29 loop {
30 match self.bounds.next() {
31 Some(local_bt) => {
32 let sep = if let BasicToken::Separator(_) = local_bt.data() {
33 true
34 } else {
35 false
36 };
37 self.buffer.push_back(local_bt);
38 if sep {
39 return self.next();
40 }
41 }
42 None if self.buffer.len() > 0 => return self.next(),
43 None => return None,
44 }
45 }
46 }
47 }
48 }
49}
50
//#[derive(Debug)]
/// Iterator adapter turning a word-broken stream of `BasicToken`s into
/// high-level `Token`s (words, numbers, emoji, hashtags, mentions, …).
pub struct Tokens<'t> {
    /// Underlying word breaker producing positioned basic tokens.
    bounds: WordBreaker<'t>,
    /// Lookahead buffer; filled up to the next separator so multi-token
    /// structs (hashtag/mention) can be recognized before emission.
    buffer: VecDeque<Local<BasicToken<'t>>>,
    /// Whether `TokenizerOptions::StructTokens` was set, enabling
    /// hashtag/mention assembly in `next_from_buffer`.
    allow_structs: bool,
}
57impl<'t> Tokens<'t> {
58 pub(crate) fn new<'a>(s: &'a str, options: &BTreeSet<TokenizerOptions>) -> Tokens<'a> {
59 Tokens {
60 bounds: WordBreaker::new(s, &options),
61 buffer: VecDeque::new(),
62 allow_structs: if options.contains(&TokenizerOptions::StructTokens) {
63 true
64 } else {
65 false
66 },
67 }
68 }
69 fn basic_separator_to_pt(&mut self, c: char) -> Token {
70 Token::Special(Special::Separator(match c {
71 ' ' => Separator::Space,
72 '\n' => Separator::Newline,
73 '\t' => Separator::Tab,
74 _ => Separator::Char(c),
75 }))
76 }
77 fn basic_formater_to_pt(&mut self, c: char) -> Token {
78 Token::Unicode(Unicode::Formatter(match c {
79 '\u{200d}' => Formatter::Joiner,
80 _ => Formatter::Char(c),
81 }))
82 }
    /// Converts a digit-run basic token into a numeric `Word` token.
    ///
    /// Tries `i64` first; an integer whose text starts with `'0'` becomes
    /// `ZeroInteger` (with the "strings" feature, the original text is kept
    /// so leading zeros are not lost). Otherwise falls back to `f64`, and
    /// finally to a plain `Word` when nothing parses.
    fn basic_number_to_pt(&mut self, s: &str) -> Token {
        Token::Word(match i64::from_str(s) {
            Ok(n) => match s.chars().next() {
                // Leading zero: preserve the distinction from a plain integer.
                Some('0') => {
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Number(Number::ZeroInteger { i: n })
                    }
                    #[cfg(feature = "strings")]
                    {
                        Word::Number(Number::ZeroInteger {
                            i: n,
                            s: s.to_string(),
                        })
                    }
                }
                Some(_) | None => Word::Number(Number::Integer(n)),
            },
            // Not an i64 (too large, or has a fractional part): try f64.
            Err(_) => match f64::from_str(s) {
                Ok(n) => Word::Number(Number::Float(n)),
                // Unparseable as a number at all: fall back to a word token.
                Err(..) => {
                    #[cfg(feature = "strings")]
                    {
                        Word::Word(s.to_string())
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Word
                    }
                }
            },
        })
    }
    /// Classifies a "mixed" basic token (one the word breaker could not put
    /// into a cleaner category) into a `Token`.
    ///
    /// In one pass over the characters it computes:
    /// * `word` — every char is alphanumeric, a digit, punctuation, or a
    ///   backtick (U+0060);
    /// * `has_word_parts` — at least one such char was seen;
    /// * `same`/`one_c` — whether the text is a run of one repeated char
    ///   (`first` distinguishes "empty" from "uniform").
    ///
    /// A uniform run of a whitespace or format char is delegated to the
    /// separator/formatter converters; an all-word-ish text becomes a
    /// `StrangeWord`; otherwise the text (with U+FE0F variation selectors
    /// stripped) is looked up as an emoji, then as a single symbol char,
    /// and finally falls back to `StrangeWord` or an escaped
    /// `Unicode::String`.
    fn basic_mixed_to_pt(&mut self, s: &str) -> Token {
        let mut word = true;
        let mut has_word_parts = false;
        let mut first = true;
        let mut same = false;
        let mut one_c = ' ';
        for c in s.chars() {
            match c.is_alphanumeric()
                || c.is_digit(10)
                || (c.general_category_group() == GeneralCategoryGroup::Punctuation)
                || (c == '\u{0060}')
            {
                true => {
                    has_word_parts = true;
                }
                false => {
                    word = false;
                }
            }
            // Track whether all chars equal the first one.
            match first {
                true => {
                    one_c = c;
                    first = false;
                    same = true;
                }
                false => {
                    if one_c != c {
                        same = false;
                    }
                }
            }
        }
        // Non-empty uniform run of whitespace/format chars: treat as a single
        // separator or formatter character token.
        if !first
            && same
            && (one_c.is_whitespace() || (one_c.general_category() == GeneralCategory::Format))
        {
            if one_c.is_whitespace() {
                return self.basic_separator_to_pt(one_c);
            } else {
                return self.basic_formater_to_pt(one_c);
            }
        }
        if word {
            #[cfg(feature = "strings")]
            {
                Token::Word(Word::StrangeWord(s.to_string()))
            }
            #[cfg(not(feature = "strings"))]
            {
                Token::Word(Word::StrangeWord)
            }
        } else {
            // Strip emoji variation selectors (U+FE0F) before lookup.
            let rs = s.replace("\u{fe0f}", "");
            match EMOJIMAP.get(&rs as &str) {
                Some(em) => Token::Word(Word::Emoji(em)),
                None => match one_char_word(&rs) {
                    //Some(c) if c.general_category() == GeneralCategory::ModifierSymbol => Token::UnicodeModifier(c),
                    // A single symbol char: currency gets its own variant.
                    Some(c) if c.general_category_group() == GeneralCategoryGroup::Symbol => {
                        match c.general_category() {
                            GeneralCategory::CurrencySymbol => Token::Special(Special::Currency(c)),
                            _ => Token::Special(Special::Symbol(c)),
                        }
                    }
                    Some(_) | None => match has_word_parts {
                        true => {
                            #[cfg(feature = "strings")]
                            {
                                Token::Word(Word::StrangeWord(s.to_string()))
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Token::Word(Word::StrangeWord)
                            }
                        }
                        false => {
                            #[cfg(feature = "strings")]
                            {
                                // Encode each char as "uXXXX", joined by '_':
                                // escape_unicode yields "\u{XXXX}"; the slice
                                // [3..len-1] drops the "\u{" prefix and "}".
                                Token::Unicode(Unicode::String({
                                    let mut us = "".to_string();
                                    for c in rs.chars() {
                                        if us != "" {
                                            us += "_";
                                        }
                                        us += "u";
                                        let ns = format!("{}", c.escape_unicode());
                                        us += &ns[3..ns.len() - 1];
                                    }
                                    us
                                }))
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Token::Unicode(Unicode::String)
                            }
                        }
                    },
                },
            }
        }
    }
    /// Classifies an alphanumeric basic token into a `Word` variant:
    /// plain `Word`, `StrangeWord`, or one of the `Numerical` kinds
    /// (`DotSeparated`, `Measures`, `Alphanumeric`).
    ///
    /// One pass computes the flags that drive the classification table:
    /// * `digits` — contains at least one decimal digit;
    /// * `digits_begin_only` — the digits form exactly one leading run
    ///   (set while `start_digit` holds, reset by any later digit);
    /// * `dots` — contains `'.'`;
    /// * `alphas_and_apos` — contains a letter or `'\''`;
    /// * `other` — contains anything else.
    fn basic_alphanumeric_to_pt(&mut self, s: &str) -> Token {
        let mut digits = false;
        let mut digits_begin_only = false;
        let mut dots = false;
        let mut alphas_and_apos = false;
        let mut other = false;

        // True while we are still inside the leading digit run.
        let mut start_digit = true;
        for c in s.chars() {
            if start_digit && (!c.is_digit(10)) {
                start_digit = false;
            }
            match c {
                c @ _ if c.is_digit(10) => {
                    digits = true;
                    // A digit after the leading run cancels "begin only".
                    if start_digit {
                        digits_begin_only = true;
                    } else {
                        digits_begin_only = false;
                    }
                }
                c @ _ if c.is_alphabetic() => {
                    alphas_and_apos = true;
                }
                '\'' => {
                    alphas_and_apos = true;
                }
                '.' => {
                    dots = true;
                }
                _ => {
                    other = true;
                }
            }
        }
        Token::Word(
            match (digits, digits_begin_only, dots, alphas_and_apos, other) {
                // Digits and dots only, digits not confined to the start:
                // e.g. dates, IPs, version numbers.
                (true, false, true, false, false) => {
                    // TODO: Date, Ip, DotSeparated
                    #[cfg(feature = "strings")]
                    {
                        Word::Numerical(Numerical::DotSeparated(s.to_string()))
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Numerical(Numerical::DotSeparated)
                    }
                }
                // Leading digits followed by letters: e.g. "10km".
                (true, true, _, true, false) => {
                    // TODO: Countable or Measures
                    #[cfg(feature = "strings")]
                    {
                        Word::Numerical(Numerical::Measures(s.to_string()))
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Numerical(Numerical::Measures)
                    }
                }
                // Any other digit-containing mix: ids, numerical trash, etc.
                (true, _, _, _, _) => {
                    // Numerical trash, ids, etc.
                    #[cfg(feature = "strings")]
                    {
                        Word::Numerical(Numerical::Alphanumeric(s.to_string()))
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Numerical(Numerical::Alphanumeric)
                    }
                }
                // Letters/apostrophes only: an ordinary word.
                (false, false, _, true, false) => {
                    // Word
                    #[cfg(feature = "strings")]
                    {
                        Word::Word(s.to_string())
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Word
                    }
                }
                // No digits and not a clean word: strange.
                (false, false, _, _, _) => {
                    // Strange
                    #[cfg(feature = "strings")]
                    {
                        Word::StrangeWord(s.to_string())
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::StrangeWord
                    }
                }
                // `digits_begin_only` is only ever set together with `digits`.
                (false, true, _, _, _) => unreachable!(),
            },
        )
    }
    /// Wraps a punctuation character into its `Token` form.
    fn basic_punctuation_to_pt(&mut self, c: char) -> Token {
        Token::Special(Special::Punctuation(c))
    }
327 /*fn check_url(&mut self) -> Option<PositionalToken> {
328 if !self.allow_structs { return None; }
329 let check = if self.buffer.len()>3 {
330 match (&self.buffer[0],&self.buffer[1],&self.buffer[2]) {
331 (BasicToken::Alphanumeric("http"),BasicToken::Punctuation(":"),BasicToken::Punctuation("//")) |
332 (BasicToken::Alphanumeric("https"),BasicToken::Punctuation(":"),BasicToken::Punctuation("//")) => true,
333 _ => false,
334 }
335 } else { false };
336 if check {
337 let mut url = "".to_string();
338 let tag_bound = None;
339 loop {
340 if let Some(b) = tag_bound {
341 if (self.offset + url.len()) >= b { break; }
342 }
343 match self.buffer.pop_front() {
344 None => break,
345 Some(BasicToken::Separator(s)) => {
346 self.buffer.push_front(BasicToken::Separator(s));
347 break;
348 },
349 Some(BasicToken::Alphanumeric(s)) |
350 Some(BasicToken::Number(s)) |
351 Some(BasicToken::Punctuation(s)) |
352 Some(BasicToken::Formatter(s)) |
353 Some(BasicToken::Mixed(s)) => {
354 url += s;
355 },
356 }
357 }
358 let len = url.len();
359 let tok = PositionalToken {
360 offset: self.offset,
361 length: len,
362 token: Token::Url(url),
363 };
364 self.offset += len;
365 Some(tok)
366 } else { None }
367 }*/
368
    // allowed because of feature "strings"
    #[allow(unused_mut)]
    #[allow(unused_variables)]
    /// Tries to recognize a `#hashtag` at the front of the lookahead buffer.
    ///
    /// Requires struct tokens to be enabled and at least two buffered basic
    /// tokens. After a literal `#`, greedily absorbs following alphanumeric,
    /// number and `_` tokens, merging their `Local` spans (and, with the
    /// "strings" feature, accumulating their text into `buf`). Stops at any
    /// other token kind or when spans cannot be merged. On success the
    /// consumed tokens are popped and a `Struct::Hashtag` spanning the whole
    /// run is returned; otherwise the buffer is left untouched.
    fn check_hashtag(&mut self) -> Option<Local<Token>> {
        if !self.allow_structs || (self.buffer.len() < 2) {
            return None;
        }

        let (mut loc, bt) = self.buffer[0].into_inner();
        // Number of buffered tokens belonging to the hashtag (incl. '#').
        let mut ln = 1;
        let mut buf = String::new();
        match bt {
            BasicToken::Punctuation('#') => {
                while ln < self.buffer.len() {
                    let (nloc, nbt) = self.buffer[ln].into_inner();
                    match nbt {
                        BasicToken::Punctuation('_') => match Local::from_segment(loc, nloc) {
                            Ok(lc) => {
                                #[cfg(feature = "strings")]
                                {
                                    buf.push('_');
                                }
                                loc = lc;
                                ln += 1;
                            }
                            // Non-contiguous spans: stop extending.
                            Err(_) => break,
                        },
                        BasicToken::Alphanumeric(_s) | BasicToken::Number(_s) => {
                            match Local::from_segment(loc, nloc) {
                                Ok(lc) => {
                                    #[cfg(feature = "strings")]
                                    {
                                        buf += _s;
                                    }
                                    loc = lc;
                                    ln += 1;
                                }
                                Err(_) => break,
                            }
                        }
                        // Any other token ends the hashtag body.
                        BasicToken::Punctuation(..)
                        | BasicToken::Separator(..)
                        | BasicToken::Formatter(..)
                        | BasicToken::Mixed(..) => break,
                    }
                }
                // Only a bare '#' (ln == 1) is not a hashtag.
                match ln > 1 {
                    true => {
                        for _ in 0..ln {
                            self.buffer.pop_front();
                        }
                        Some(loc.local(Token::Struct({
                            #[cfg(feature = "strings")]
                            {
                                Struct::Hashtag(buf)
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Struct::Hashtag
                            }
                        })))
                    }
                    false => None,
                }
            }
            _ => None,
        }
    }
437
    // allowed because of feature "strings"
    #[allow(unused_mut)]
    #[allow(unused_variables)]
    /// Tries to recognize an `@mention` at the front of the lookahead buffer.
    ///
    /// Mirror image of `check_hashtag`, keyed on a leading `@` and producing
    /// `Struct::Mention`: absorbs following alphanumeric, number and `_`
    /// tokens, merging their `Local` spans (and, with the "strings" feature,
    /// accumulating their text). On success the consumed tokens are popped;
    /// otherwise the buffer is left untouched.
    fn check_mention(&mut self) -> Option<Local<Token>> {
        if !self.allow_structs || (self.buffer.len() < 2) {
            return None;
        }

        let (mut loc, bt) = self.buffer[0].into_inner();
        // Number of buffered tokens belonging to the mention (incl. '@').
        let mut ln = 1;
        let mut buf = String::new();
        match bt {
            BasicToken::Punctuation('@') => {
                while ln < self.buffer.len() {
                    let (nloc, nbt) = self.buffer[ln].into_inner();
                    match nbt {
                        BasicToken::Punctuation('_') => match Local::from_segment(loc, nloc) {
                            Ok(lc) => {
                                #[cfg(feature = "strings")]
                                {
                                    buf.push('_');
                                }
                                loc = lc;
                                ln += 1;
                            }
                            // Non-contiguous spans: stop extending.
                            Err(_) => break,
                        },
                        BasicToken::Alphanumeric(_s) | BasicToken::Number(_s) => {
                            match Local::from_segment(loc, nloc) {
                                Ok(lc) => {
                                    #[cfg(feature = "strings")]
                                    {
                                        buf += _s;
                                    }
                                    loc = lc;
                                    ln += 1;
                                }
                                Err(_) => break,
                            }
                        }
                        // Any other token ends the mention body.
                        BasicToken::Punctuation(..)
                        | BasicToken::Separator(..)
                        | BasicToken::Formatter(..)
                        | BasicToken::Mixed(..) => break,
                    }
                }
                // Only a bare '@' (ln == 1) is not a mention.
                match ln > 1 {
                    true => {
                        for _ in 0..ln {
                            self.buffer.pop_front();
                        }
                        Some(loc.local(Token::Struct({
                            #[cfg(feature = "strings")]
                            {
                                Struct::Mention(buf)
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Struct::Mention
                            }
                        })))
                    }
                    false => None,
                }
            }
            _ => None,
        }
    }
506 fn next_from_buffer(&mut self) -> Option<Local<Token>> {
507 //if let Some(t) = self.check_url() { return Some(t); }
508 if let Some(t) = self.check_hashtag() {
509 return Some(t);
510 }
511 if let Some(t) = self.check_mention() {
512 return Some(t);
513 }
514 match self.buffer.pop_front() {
515 Some(local_tok) => {
516 let (local, tok) = local_tok.into_inner();
517 Some(local.local(match tok {
518 BasicToken::Alphanumeric(s) => self.basic_alphanumeric_to_pt(s),
519 BasicToken::Number(s) => self.basic_number_to_pt(s),
520 BasicToken::Punctuation(s) => self.basic_punctuation_to_pt(s),
521 BasicToken::Mixed(s) => self.basic_mixed_to_pt(s),
522 BasicToken::Separator(s) => self.basic_separator_to_pt(s),
523 BasicToken::Formatter(s) => self.basic_formater_to_pt(s),
524 }))
525 }
526 None => None,
527 }
528 }
529}