1use std::collections::{BTreeSet, VecDeque};
2use std::str::FromStr;
3use unicode_properties::{GeneralCategory, GeneralCategoryGroup, UnicodeGeneralCategory};
4
5use text_parsing::Local;
6
7use crate::{
8 EMOJIMAP, Formatter, IntoTokenizer, Number, Numerical, SentenceBreaker, Separator, Special,
9 Struct, Token, TokenizerOptions, TokenizerParams, Unicode, Word,
10 wordbreaker::{BasicToken, WordBreaker, one_char_word},
11};
12
13impl<'t> IntoTokenizer for &'t str {
14 type IntoTokens = Tokens<'t>;
15
16 fn into_tokenizer<S: SentenceBreaker>(self, params: TokenizerParams<S>) -> Self::IntoTokens {
17 Tokens::new(self, ¶ms.options)
18 }
19}
20
21impl<'t> Iterator for Tokens<'t> {
22 type Item = Local<Token>;
23
24 fn next(&mut self) -> Option<Self::Item> {
25 loop {
26 if self.buffer.len() > 0 {
27 return self.next_from_buffer();
28 } else {
29 loop {
30 match self.bounds.next() {
31 Some(local_bt) => {
32 let sep = if let BasicToken::Separator(_) = local_bt.data() {
33 true
34 } else {
35 false
36 };
37 self.buffer.push_back(local_bt);
38 if sep {
39 return self.next();
40 }
41 }
42 None if self.buffer.len() > 0 => return self.next(),
43 None => return None,
44 }
45 }
46 }
47 }
48 }
49}
50
51pub struct Tokens<'t> {
53 bounds: WordBreaker<'t>,
54 buffer: VecDeque<Local<BasicToken<'t>>>,
55 allow_structs: bool,
56}
57impl<'t> Tokens<'t> {
58 pub(crate) fn new<'a>(s: &'a str, options: &BTreeSet<TokenizerOptions>) -> Tokens<'a> {
59 Tokens {
60 bounds: WordBreaker::new(s, &options),
61 buffer: VecDeque::new(),
62 allow_structs: if options.contains(&TokenizerOptions::StructTokens) {
63 true
64 } else {
65 false
66 },
67 }
68 }
69 fn basic_separator_to_pt(&mut self, c: char) -> Token {
70 Token::Special(Special::Separator(match c {
71 ' ' => Separator::Space,
72 '\n' => Separator::Newline,
73 '\t' => Separator::Tab,
74 _ => Separator::Char(c),
75 }))
76 }
77 fn basic_formater_to_pt(&mut self, c: char) -> Token {
78 Token::Unicode(Unicode::Formatter(match c {
79 '\u{200d}' => Formatter::Joiner,
80 _ => Formatter::Char(c),
81 }))
82 }
83 fn basic_number_to_pt(&mut self, s: &str) -> Token {
84 Token::Word(match i64::from_str(s) {
85 Ok(n) => match s.chars().next() {
86 Some('0') => {
87 #[cfg(not(feature = "strings"))]
88 {
89 Word::Number(Number::ZeroInteger { i: n })
90 }
91 #[cfg(feature = "strings")]
92 {
93 Word::Number(Number::ZeroInteger {
94 i: n,
95 s: s.to_string(),
96 })
97 }
98 }
99 Some(_) | None => Word::Number(Number::Integer(n)),
100 },
101 Err(_) => match f64::from_str(s) {
102 Ok(n) => Word::Number(Number::Float(n)),
103 Err(..) => {
104 #[cfg(feature = "strings")]
105 {
106 Word::Word(s.to_string())
107 }
108 #[cfg(not(feature = "strings"))]
109 {
110 Word::Word
111 }
112 }
113 },
114 })
115 }
116 fn basic_mixed_to_pt(&mut self, s: &str) -> Token {
117 let mut word = true;
118 let mut has_word_parts = false;
119 let mut first = true;
120 let mut same = false;
121 let mut one_c = ' ';
122 for c in s.chars() {
123 match c.is_alphanumeric()
124 || c.is_digit(10)
125 || (c.general_category_group() == GeneralCategoryGroup::Punctuation)
126 || (c == '\u{0060}')
127 {
128 true => {
129 has_word_parts = true;
130 }
131 false => {
132 word = false;
133 }
134 }
135 match first {
136 true => {
137 one_c = c;
138 first = false;
139 same = true;
140 }
141 false => {
142 if one_c != c {
143 same = false;
144 }
145 }
146 }
147 }
148 if !first
149 && same
150 && (one_c.is_whitespace() || (one_c.general_category() == GeneralCategory::Format))
151 {
152 if one_c.is_whitespace() {
153 return self.basic_separator_to_pt(one_c);
154 } else {
155 return self.basic_formater_to_pt(one_c);
156 }
157 }
158 if word {
159 #[cfg(feature = "strings")]
160 {
161 Token::Word(Word::StrangeWord(s.to_string()))
162 }
163 #[cfg(not(feature = "strings"))]
164 {
165 Token::Word(Word::StrangeWord)
166 }
167 } else {
168 let rs = s.replace("\u{fe0f}", "");
169 match EMOJIMAP.get(&rs as &str) {
170 Some(em) => Token::Word(Word::Emoji(em)),
171 None => match one_char_word(&rs) {
172 Some(c) if c.general_category_group() == GeneralCategoryGroup::Symbol => {
174 match c.general_category() {
175 GeneralCategory::CurrencySymbol => Token::Special(Special::Currency(c)),
176 _ => Token::Special(Special::Symbol(c)),
177 }
178 }
179 Some(_) | None => match has_word_parts {
180 true => {
181 #[cfg(feature = "strings")]
182 {
183 Token::Word(Word::StrangeWord(s.to_string()))
184 }
185 #[cfg(not(feature = "strings"))]
186 {
187 Token::Word(Word::StrangeWord)
188 }
189 }
190 false => {
191 #[cfg(feature = "strings")]
192 {
193 Token::Unicode(Unicode::String({
194 let mut us = "".to_string();
195 for c in rs.chars() {
196 if us != "" {
197 us += "_";
198 }
199 us += "u";
200 let ns = format!("{}", c.escape_unicode());
201 us += &ns[3..ns.len() - 1];
202 }
203 us
204 }))
205 }
206 #[cfg(not(feature = "strings"))]
207 {
208 Token::Unicode(Unicode::String)
209 }
210 }
211 },
212 },
213 }
214 }
215 }
216 fn basic_alphanumeric_to_pt(&mut self, s: &str) -> Token {
217 let mut digits = false;
230 let mut digits_begin_only = false;
231 let mut dots = false;
232 let mut alphas_and_apos = false;
233 let mut other = false;
234
235 let mut start_digit = true;
236 for c in s.chars() {
237 if start_digit && (!c.is_digit(10)) {
238 start_digit = false;
239 }
240 match c {
241 c @ _ if c.is_digit(10) => {
242 digits = true;
243 if start_digit {
244 digits_begin_only = true;
245 } else {
246 digits_begin_only = false;
247 }
248 }
249 c @ _ if c.is_alphabetic() => {
250 alphas_and_apos = true;
251 }
252 '\'' => {
253 alphas_and_apos = true;
254 }
255 '.' => {
256 dots = true;
257 }
258 _ => {
259 other = true;
260 }
261 }
262 }
263 Token::Word(
264 match (digits, digits_begin_only, dots, alphas_and_apos, other) {
265 (true, false, true, false, false) => {
266 #[cfg(feature = "strings")]
268 {
269 Word::Numerical(Numerical::DotSeparated(s.to_string()))
270 }
271 #[cfg(not(feature = "strings"))]
272 {
273 Word::Numerical(Numerical::DotSeparated)
274 }
275 }
276 (true, true, _, true, false) => {
277 #[cfg(feature = "strings")]
279 {
280 Word::Numerical(Numerical::Measures(s.to_string()))
281 }
282 #[cfg(not(feature = "strings"))]
283 {
284 Word::Numerical(Numerical::Measures)
285 }
286 }
287 (true, _, _, _, _) => {
288 #[cfg(feature = "strings")]
290 {
291 Word::Numerical(Numerical::Alphanumeric(s.to_string()))
292 }
293 #[cfg(not(feature = "strings"))]
294 {
295 Word::Numerical(Numerical::Alphanumeric)
296 }
297 }
298 (false, false, _, true, false) => {
299 #[cfg(feature = "strings")]
301 {
302 Word::Word(s.to_string())
303 }
304 #[cfg(not(feature = "strings"))]
305 {
306 Word::Word
307 }
308 }
309 (false, false, _, _, _) => {
310 #[cfg(feature = "strings")]
312 {
313 Word::StrangeWord(s.to_string())
314 }
315 #[cfg(not(feature = "strings"))]
316 {
317 Word::StrangeWord
318 }
319 }
320 (false, true, _, _, _) => unreachable!(),
321 },
322 )
323 }
324 fn basic_punctuation_to_pt(&mut self, c: char) -> Token {
325 Token::Special(Special::Punctuation(c))
326 }
327 fn basic_currency_to_pt(&mut self, c: char) -> Token {
328 Token::Special(Special::Currency(c))
329 }
330 #[allow(unused_mut)]
374 #[allow(unused_variables)]
375 fn check_hashtag(&mut self) -> Option<Local<Token>> {
376 if !self.allow_structs || (self.buffer.len() < 2) {
377 return None;
378 }
379
380 let (mut loc, bt) = self.buffer[0].into_inner();
381 let mut ln = 1;
382 let mut buf = String::new();
383 match bt {
384 BasicToken::Punctuation('#') => {
385 while ln < self.buffer.len() {
386 let (nloc, nbt) = self.buffer[ln].into_inner();
387 match nbt {
388 BasicToken::Punctuation('_') => match Local::from_segment(loc, nloc) {
389 Ok(lc) => {
390 #[cfg(feature = "strings")]
391 {
392 buf.push('_');
393 }
394 loc = lc;
395 ln += 1;
396 }
397 Err(_) => break,
398 },
399 BasicToken::Alphanumeric(_s) | BasicToken::Number(_s) => {
400 match Local::from_segment(loc, nloc) {
401 Ok(lc) => {
402 #[cfg(feature = "strings")]
403 {
404 buf += _s;
405 }
406 loc = lc;
407 ln += 1;
408 }
409 Err(_) => break,
410 }
411 }
412 BasicToken::Punctuation(..)
413 | BasicToken::CurrencySymbol(..)
414 | BasicToken::Separator(..)
415 | BasicToken::Formatter(..)
416 | BasicToken::Mixed(..) => break,
417 }
418 }
419 match ln > 1 {
420 true => {
421 for _ in 0..ln {
422 self.buffer.pop_front();
423 }
424 Some(loc.local(Token::Struct({
425 #[cfg(feature = "strings")]
426 {
427 Struct::Hashtag(buf)
428 }
429 #[cfg(not(feature = "strings"))]
430 {
431 Struct::Hashtag
432 }
433 })))
434 }
435 false => None,
436 }
437 }
438 _ => None,
439 }
440 }
441
442 #[allow(unused_mut)]
444 #[allow(unused_variables)]
445 fn check_mention(&mut self) -> Option<Local<Token>> {
446 if !self.allow_structs || (self.buffer.len() < 2) {
447 return None;
448 }
449
450 let (mut loc, bt) = self.buffer[0].into_inner();
451 let mut ln = 1;
452 let mut buf = String::new();
453 match bt {
454 BasicToken::Punctuation('@') => {
455 while ln < self.buffer.len() {
456 let (nloc, nbt) = self.buffer[ln].into_inner();
457 match nbt {
458 BasicToken::Punctuation('_') => match Local::from_segment(loc, nloc) {
459 Ok(lc) => {
460 #[cfg(feature = "strings")]
461 {
462 buf.push('_');
463 }
464 loc = lc;
465 ln += 1;
466 }
467 Err(_) => break,
468 },
469 BasicToken::Alphanumeric(_s) | BasicToken::Number(_s) => {
470 match Local::from_segment(loc, nloc) {
471 Ok(lc) => {
472 #[cfg(feature = "strings")]
473 {
474 buf += _s;
475 }
476 loc = lc;
477 ln += 1;
478 }
479 Err(_) => break,
480 }
481 }
482 BasicToken::Punctuation(..)
483 | BasicToken::CurrencySymbol(..)
484 | BasicToken::Separator(..)
485 | BasicToken::Formatter(..)
486 | BasicToken::Mixed(..) => break,
487 }
488 }
489 match ln > 1 {
490 true => {
491 for _ in 0..ln {
492 self.buffer.pop_front();
493 }
494 Some(loc.local(Token::Struct({
495 #[cfg(feature = "strings")]
496 {
497 Struct::Mention(buf)
498 }
499 #[cfg(not(feature = "strings"))]
500 {
501 Struct::Mention
502 }
503 })))
504 }
505 false => None,
506 }
507 }
508 _ => None,
509 }
510 }
511 fn next_from_buffer(&mut self) -> Option<Local<Token>> {
512 if let Some(t) = self.check_hashtag() {
514 return Some(t);
515 }
516 if let Some(t) = self.check_mention() {
517 return Some(t);
518 }
519 match self.buffer.pop_front() {
520 Some(local_tok) => {
521 let (local, tok) = local_tok.into_inner();
522 Some(local.local(match tok {
523 BasicToken::Alphanumeric(s) => self.basic_alphanumeric_to_pt(s),
524 BasicToken::Number(s) => self.basic_number_to_pt(s),
525 BasicToken::Punctuation(s) => self.basic_punctuation_to_pt(s),
526 BasicToken::CurrencySymbol(s) => self.basic_currency_to_pt(s),
527 BasicToken::Mixed(s) => self.basic_mixed_to_pt(s),
528 BasicToken::Separator(s) => self.basic_separator_to_pt(s),
529 BasicToken::Formatter(s) => self.basic_formater_to_pt(s),
530 }))
531 }
532 None => None,
533 }
534 }
535}