1use std::collections::{BTreeSet, VecDeque};
2use unicode_properties::{GeneralCategory, GeneralCategoryGroup, UnicodeGeneralCategory};
3
4use text_parsing::Local;
5
6use crate::{
7 EMOJIMAP, Formatter, IntoTokenizer, Numerical, SentenceBreaker, Separator, Special, Struct,
8 Token, TokenizerOptions, TokenizerParams, Unicode, Word,
9 numbers::NumberChecker,
10 wordbreaker::{BasicToken, WordBreaker, one_char_word},
11};
12
13impl<'t> IntoTokenizer for &'t str {
14 type IntoTokens = Tokens<'t>;
15
16 fn into_tokenizer<S: SentenceBreaker>(self, params: TokenizerParams<S>) -> Self::IntoTokens {
17 Tokens::new(self, ¶ms.options)
18 }
19}
20
21impl<'t> Iterator for Tokens<'t> {
22 type Item = Local<Token>;
23
24 fn next(&mut self) -> Option<Self::Item> {
25 loop {
26 if self.buffer.len() > 0 {
27 return self.next_from_buffer();
28 } else {
29 loop {
30 match self.bounds.next() {
31 Some(local_bt) => {
32 let sep = if let BasicToken::Separator(_) = local_bt.data() {
33 true
34 } else {
35 false
36 };
37 self.buffer.push_back(local_bt);
38 if sep {
39 return self.next();
40 }
41 }
42 None if self.buffer.len() > 0 => return self.next(),
43 None => return None,
44 }
45 }
46 }
47 }
48 }
49}
50
/// Iterator adapter turning a word-broken stream into high-level [`Token`]s.
pub struct Tokens<'t> {
    // Underlying basic-token source (word boundaries).
    bounds: WordBreaker<'t>,
    // Look-ahead buffer; filled up to the next separator so that
    // multi-token structures (hashtags, mentions) can be detected.
    buffer: VecDeque<Local<BasicToken<'t>>>,
    // Whether `#tag` / `@name` sequences are merged into Struct tokens
    // (driven by `TokenizerOptions::StructTokens`).
    allow_structs: bool,
}
57impl<'t> Tokens<'t> {
58 pub(crate) fn new<'a>(s: &'a str, options: &BTreeSet<TokenizerOptions>) -> Tokens<'a> {
59 Tokens {
60 bounds: WordBreaker::new(s, &options),
61 buffer: VecDeque::new(),
62 allow_structs: if options.contains(&TokenizerOptions::StructTokens) {
63 true
64 } else {
65 false
66 },
67 }
68 }
69 fn basic_separator_to_pt(&mut self, c: char) -> Token {
70 Token::Special(Special::Separator(match c {
71 ' ' => Separator::Space,
72 '\n' => Separator::Newline,
73 '\t' => Separator::Tab,
74 _ => Separator::Char(c),
75 }))
76 }
77 fn basic_formater_to_pt(&mut self, c: char) -> Token {
78 Token::Unicode(Unicode::Formatter(match c {
79 '\u{200d}' => Formatter::Joiner,
80 _ => Formatter::Char(c),
81 }))
82 }
83 fn basic_number_to_pt(&mut self, _s: &str, num: NumberChecker) -> Token {
84 Token::Word(match num.into_number() {
85 Some(num) => Word::Number(num),
86 None => {
87 #[cfg(feature = "strings")]
88 {
89 Word::Word(_s.to_string())
90 }
91 #[cfg(not(feature = "strings"))]
92 {
93 Word::Word
94 }
95 }
96 })
97 }
    /// Classify a "mixed" run (characters of different classes) into a
    /// token: a repeated separator/formatter, a strange word, an emoji,
    /// a symbol/currency char, or a raw Unicode escape string.
    fn basic_mixed_to_pt(&mut self, s: &str) -> Token {
        // word: every char is alphanumeric / punctuation / backtick.
        let mut word = true;
        // has_word_parts: at least one such char exists.
        let mut has_word_parts = false;
        // first/same/one_c: detect a run of one repeated character.
        let mut first = true;
        let mut same = false;
        let mut one_c = ' ';
        for c in s.chars() {
            match c.is_alphanumeric()
                || c.is_digit(10)
                || (c.general_category_group() == GeneralCategoryGroup::Punctuation)
                || (c == '\u{0060}')
            {
                true => {
                    has_word_parts = true;
                }
                false => {
                    word = false;
                }
            }
            match first {
                true => {
                    one_c = c;
                    first = false;
                    same = true;
                }
                false => {
                    if one_c != c {
                        same = false;
                    }
                }
            }
        }
        // A non-empty run of one repeated whitespace/format char is
        // treated as a single separator/formatter token.
        if !first
            && same
            && (one_c.is_whitespace() || (one_c.general_category() == GeneralCategory::Format))
        {
            if one_c.is_whitespace() {
                return self.basic_separator_to_pt(one_c);
            } else {
                return self.basic_formater_to_pt(one_c);
            }
        }
        if word {
            // All chars are word-like, but the run was still "mixed":
            // classify as a strange word.
            #[cfg(feature = "strings")]
            {
                Token::Word(Word::StrangeWord(s.to_string()))
            }
            #[cfg(not(feature = "strings"))]
            {
                Token::Word(Word::StrangeWord)
            }
        } else {
            // Drop U+FE0F VARIATION SELECTOR-16 before emoji lookup.
            let rs = s.replace("\u{fe0f}", "");
            match EMOJIMAP.get(&rs as &str) {
                Some(em) => Token::Word(Word::Emoji(em)),
                None => match one_char_word(&rs) {
                    // A single symbol char: currency vs generic symbol.
                    Some(c) if c.general_category_group() == GeneralCategoryGroup::Symbol => {
                        match c.general_category() {
                            GeneralCategory::CurrencySymbol => Token::Special(Special::Currency(c)),
                            _ => Token::Special(Special::Symbol(c)),
                        }
                    }
                    Some(_) | None => match has_word_parts {
                        true => {
                            #[cfg(feature = "strings")]
                            {
                                Token::Word(Word::StrangeWord(s.to_string()))
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Token::Word(Word::StrangeWord)
                            }
                        }
                        false => {
                            // No word-like parts at all: emit the chars as
                            // an underscore-joined list of escapes, e.g.
                            // "u1f600_u200d" (strip `\u{` and `}` from
                            // char::escape_unicode output).
                            #[cfg(feature = "strings")]
                            {
                                Token::Unicode(Unicode::String({
                                    let mut us = "".to_string();
                                    for c in rs.chars() {
                                        if us != "" {
                                            us += "_";
                                        }
                                        us += "u";
                                        let ns = format!("{}", c.escape_unicode());
                                        us += &ns[3..ns.len() - 1];
                                    }
                                    us
                                }))
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Token::Unicode(Unicode::String)
                            }
                        }
                    },
                },
            }
        }
    }
    /// Classify an alphanumeric run into a word token by scanning its
    /// character classes: dot-separated numbers ("1.2.3"), measures
    /// ("10km"), alphanumerics ("a1b2"), plain words, or strange words.
    fn basic_alphanumeric_to_pt(&mut self, s: &str) -> Token {
        // digits: saw at least one ASCII digit anywhere.
        let mut digits = false;
        // digits_begin_only: every digit seen so far sits in the
        // leading digit prefix (true only while digits stay up front).
        let mut digits_begin_only = false;
        let mut dots = false;
        // alphas_and_apos: saw a letter or an apostrophe.
        let mut alphas_and_apos = false;
        // other: saw anything outside the classes above.
        let mut other = false;

        // start_digit stays true only while we are still inside the
        // leading run of digits.
        let mut start_digit = true;
        for c in s.chars() {
            if start_digit && (!c.is_digit(10)) {
                start_digit = false;
            }
            match c {
                c @ _ if c.is_digit(10) => {
                    digits = true;
                    // A digit after a non-digit kills "begin only".
                    if start_digit {
                        digits_begin_only = true;
                    } else {
                        digits_begin_only = false;
                    }
                }
                c @ _ if c.is_alphabetic() => {
                    alphas_and_apos = true;
                }
                '\'' => {
                    alphas_and_apos = true;
                }
                '.' => {
                    dots = true;
                }
                _ => {
                    other = true;
                }
            }
        }
        // Arm order matters: the broad (true, _, _, _, _) arm catches
        // every remaining digit-bearing combination.
        Token::Word(
            match (digits, digits_begin_only, dots, alphas_and_apos, other) {
                // Digits and dots only, digits not confined to the
                // front: version-like "1.2.3".
                (true, false, true, false, false) => {
                    #[cfg(feature = "strings")]
                    {
                        Word::Numerical(Numerical::DotSeparated(s.to_string()))
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Numerical(Numerical::DotSeparated)
                    }
                }
                // Leading digits then letters: measure-like "10km".
                (true, true, _, true, false) => {
                    #[cfg(feature = "strings")]
                    {
                        Word::Numerical(Numerical::Measures(s.to_string()))
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Numerical(Numerical::Measures)
                    }
                }
                // Any other digit-bearing mix.
                (true, _, _, _, _) => {
                    #[cfg(feature = "strings")]
                    {
                        Word::Numerical(Numerical::Alphanumeric(s.to_string()))
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Numerical(Numerical::Alphanumeric)
                    }
                }
                // Pure letters/apostrophes: an ordinary word.
                (false, false, _, true, false) => {
                    #[cfg(feature = "strings")]
                    {
                        Word::Word(s.to_string())
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Word
                    }
                }
                (false, false, _, _, _) => {
                    #[cfg(feature = "strings")]
                    {
                        Word::StrangeWord(s.to_string())
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::StrangeWord
                    }
                }
                // digits_begin_only is only ever set when digits is
                // also set, so this combination cannot occur.
                (false, true, _, _, _) => unreachable!(),
            },
        )
    }
306 fn basic_punctuation_to_pt(&mut self, c: char) -> Token {
307 Token::Special(Special::Punctuation(c))
308 }
309 fn basic_currency_to_pt(&mut self, c: char) -> Token {
310 Token::Special(Special::Currency(c))
311 }
    // `mut`/variables are unused when the "strings" feature is off.
    #[allow(unused_mut)]
    #[allow(unused_variables)]
    /// Try to merge a leading `#` plus following alphanumeric/underscore
    /// basic tokens from the buffer into a single Hashtag token.
    /// Returns `None` (consuming nothing) when structs are disabled,
    /// the buffer is too short, or no hashtag starts at the front.
    fn check_hashtag(&mut self) -> Option<Local<Token>> {
        if !self.allow_structs || (self.buffer.len() < 2) {
            return None;
        }

        // loc grows to cover the whole hashtag span; ln counts how many
        // buffered tokens belong to it (including the leading '#').
        let (mut loc, bt) = self.buffer[0].into_inner();
        let mut ln = 1;
        let mut buf = String::new();
        match bt {
            BasicToken::Punctuation('#') => {
                while ln < self.buffer.len() {
                    let (nloc, nbt) = self.buffer[ln].into_inner();
                    match nbt {
                        // Underscores join hashtag parts.
                        BasicToken::Punctuation('_') => match Local::from_segment(loc, nloc) {
                            Ok(lc) => {
                                #[cfg(feature = "strings")]
                                {
                                    buf.push('_');
                                }
                                loc = lc;
                                ln += 1;
                            }
                            // Non-adjacent segments: stop extending.
                            Err(_) => break,
                        },
                        BasicToken::Alphanumeric(_s) | BasicToken::Number(_s, _) => {
                            match Local::from_segment(loc, nloc) {
                                Ok(lc) => {
                                    #[cfg(feature = "strings")]
                                    {
                                        buf += _s;
                                    }
                                    loc = lc;
                                    ln += 1;
                                }
                                Err(_) => break,
                            }
                        }
                        // Any other token ends the hashtag.
                        BasicToken::Punctuation(..)
                        | BasicToken::CurrencySymbol(..)
                        | BasicToken::Separator(..)
                        | BasicToken::Formatter(..)
                        | BasicToken::Mixed(..) => break,
                    }
                }
                // ln > 1 means '#' was followed by at least one part:
                // consume the merged tokens and emit the struct token.
                match ln > 1 {
                    true => {
                        for _ in 0..ln {
                            self.buffer.pop_front();
                        }
                        Some(loc.local(Token::Struct({
                            #[cfg(feature = "strings")]
                            {
                                Struct::Hashtag(buf)
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Struct::Hashtag
                            }
                        })))
                    }
                    // A bare '#': leave the buffer untouched.
                    false => None,
                }
            }
            _ => None,
        }
    }
423
    // `mut`/variables are unused when the "strings" feature is off.
    #[allow(unused_mut)]
    #[allow(unused_variables)]
    /// Try to merge a leading `@` plus following alphanumeric/underscore
    /// basic tokens from the buffer into a single Mention token.
    /// Mirrors `check_hashtag`, differing only in the trigger character
    /// and the produced `Struct` variant.
    fn check_mention(&mut self) -> Option<Local<Token>> {
        if !self.allow_structs || (self.buffer.len() < 2) {
            return None;
        }

        // loc grows to cover the whole mention span; ln counts how many
        // buffered tokens belong to it (including the leading '@').
        let (mut loc, bt) = self.buffer[0].into_inner();
        let mut ln = 1;
        let mut buf = String::new();
        match bt {
            BasicToken::Punctuation('@') => {
                while ln < self.buffer.len() {
                    let (nloc, nbt) = self.buffer[ln].into_inner();
                    match nbt {
                        // Underscores join mention parts.
                        BasicToken::Punctuation('_') => match Local::from_segment(loc, nloc) {
                            Ok(lc) => {
                                #[cfg(feature = "strings")]
                                {
                                    buf.push('_');
                                }
                                loc = lc;
                                ln += 1;
                            }
                            // Non-adjacent segments: stop extending.
                            Err(_) => break,
                        },
                        BasicToken::Alphanumeric(_s) | BasicToken::Number(_s, _) => {
                            match Local::from_segment(loc, nloc) {
                                Ok(lc) => {
                                    #[cfg(feature = "strings")]
                                    {
                                        buf += _s;
                                    }
                                    loc = lc;
                                    ln += 1;
                                }
                                Err(_) => break,
                            }
                        }
                        // Any other token ends the mention.
                        BasicToken::Punctuation(..)
                        | BasicToken::CurrencySymbol(..)
                        | BasicToken::Separator(..)
                        | BasicToken::Formatter(..)
                        | BasicToken::Mixed(..) => break,
                    }
                }
                // ln > 1 means '@' was followed by at least one part:
                // consume the merged tokens and emit the struct token.
                match ln > 1 {
                    true => {
                        for _ in 0..ln {
                            self.buffer.pop_front();
                        }
                        Some(loc.local(Token::Struct({
                            #[cfg(feature = "strings")]
                            {
                                Struct::Mention(buf)
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Struct::Mention
                            }
                        })))
                    }
                    // A bare '@': leave the buffer untouched.
                    false => None,
                }
            }
            _ => None,
        }
    }
493 fn next_from_buffer(&mut self) -> Option<Local<Token>> {
494 if let Some(t) = self.check_hashtag() {
496 return Some(t);
497 }
498 if let Some(t) = self.check_mention() {
499 return Some(t);
500 }
501 match self.buffer.pop_front() {
502 Some(local_tok) => {
503 let (local, tok) = local_tok.into_inner();
504 Some(local.local(match tok {
505 BasicToken::Alphanumeric(s) => self.basic_alphanumeric_to_pt(s),
506 BasicToken::Number(s, num) => self.basic_number_to_pt(s, num),
507 BasicToken::Punctuation(s) => self.basic_punctuation_to_pt(s),
508 BasicToken::CurrencySymbol(s) => self.basic_currency_to_pt(s),
509 BasicToken::Mixed(s) => self.basic_mixed_to_pt(s),
510 BasicToken::Separator(s) => self.basic_separator_to_pt(s),
511 BasicToken::Formatter(s) => self.basic_formater_to_pt(s),
512 }))
513 }
514 None => None,
515 }
516 }
517}