1use std::collections::{BTreeSet, VecDeque};
2use std::str::FromStr;
3use unicode_properties::{GeneralCategory, GeneralCategoryGroup, UnicodeGeneralCategory};
4
5use text_parsing::Local;
6
7use crate::{
8 EMOJIMAP, Formatter, IntoTokenizer, Number, Numerical, SentenceBreaker, Separator, Special,
9 Struct, Token, TokenizerOptions, TokenizerParams, Unicode, Word,
10 wordbreaker::{BasicToken, WordBreaker, one_char_word},
11};
12
13impl<'t> IntoTokenizer for &'t str {
14 type IntoTokens = Tokens<'t>;
15
16 fn into_tokenizer<S: SentenceBreaker>(self, params: TokenizerParams<S>) -> Self::IntoTokens {
17 Tokens::new(self, ¶ms.options)
18 }
19}
20
21impl<'t> Iterator for Tokens<'t> {
22 type Item = Local<Token>;
23
24 fn next(&mut self) -> Option<Self::Item> {
25 loop {
26 if self.buffer.len() > 0 {
27 return self.next_from_buffer();
28 } else {
29 loop {
30 match self.bounds.next() {
31 Some(local_bt) => {
32 let sep = if let BasicToken::Separator(_) = local_bt.data() {
33 true
34 } else {
35 false
36 };
37 self.buffer.push_back(local_bt);
38 if sep {
39 return self.next();
40 }
41 }
42 None if self.buffer.len() > 0 => return self.next(),
43 None => return None,
44 }
45 }
46 }
47 }
48 }
49}
50
51pub struct Tokens<'t> {
53 bounds: WordBreaker<'t>,
54 buffer: VecDeque<Local<BasicToken<'t>>>,
55 allow_structs: bool,
56}
57impl<'t> Tokens<'t> {
58 pub(crate) fn new<'a>(s: &'a str, options: &BTreeSet<TokenizerOptions>) -> Tokens<'a> {
59 Tokens {
60 bounds: WordBreaker::new(s, &options),
61 buffer: VecDeque::new(),
62 allow_structs: if options.contains(&TokenizerOptions::StructTokens) {
63 true
64 } else {
65 false
66 },
67 }
68 }
69 fn basic_separator_to_pt(&mut self, c: char) -> Token {
70 Token::Special(Special::Separator(match c {
71 ' ' => Separator::Space,
72 '\n' => Separator::Newline,
73 '\t' => Separator::Tab,
74 _ => Separator::Char(c),
75 }))
76 }
77 fn basic_formater_to_pt(&mut self, c: char) -> Token {
78 Token::Unicode(Unicode::Formatter(match c {
79 '\u{200d}' => Formatter::Joiner,
80 _ => Formatter::Char(c),
81 }))
82 }
83 fn basic_number_to_pt(&mut self, s: &str) -> Token {
84 Token::Word(match i64::from_str(s) {
85 Ok(n) => match s.chars().next() {
86 Some('0') => {
87 #[cfg(not(feature = "strings"))]
88 {
89 Word::Number(Number::ZeroInteger { i: n })
90 }
91 #[cfg(feature = "strings")]
92 {
93 Word::Number(Number::ZeroInteger {
94 i: n,
95 s: s.to_string(),
96 })
97 }
98 }
99 Some(_) | None => Word::Number(Number::Integer(n)),
100 },
101 Err(_) => match f64::from_str(s) {
102 Ok(n) => Word::Number(Number::Float(n)),
103 Err(..) => {
104 #[cfg(feature = "strings")]
105 {
106 Word::Word(s.to_string())
107 }
108 #[cfg(not(feature = "strings"))]
109 {
110 Word::Word
111 }
112 }
113 },
114 })
115 }
116 fn basic_mixed_to_pt(&mut self, s: &str) -> Token {
117 let mut word = true;
118 let mut has_word_parts = false;
119 let mut first = true;
120 let mut same = false;
121 let mut one_c = ' ';
122 for c in s.chars() {
123 match c.is_alphanumeric()
124 || c.is_digit(10)
125 || (c.general_category_group() == GeneralCategoryGroup::Punctuation)
126 || (c == '\u{0060}')
127 {
128 true => {
129 has_word_parts = true;
130 }
131 false => {
132 word = false;
133 }
134 }
135 match first {
136 true => {
137 one_c = c;
138 first = false;
139 same = true;
140 }
141 false => {
142 if one_c != c {
143 same = false;
144 }
145 }
146 }
147 }
148 if !first
149 && same
150 && (one_c.is_whitespace() || (one_c.general_category() == GeneralCategory::Format))
151 {
152 if one_c.is_whitespace() {
153 return self.basic_separator_to_pt(one_c);
154 } else {
155 return self.basic_formater_to_pt(one_c);
156 }
157 }
158 if word {
159 #[cfg(feature = "strings")]
160 {
161 Token::Word(Word::StrangeWord(s.to_string()))
162 }
163 #[cfg(not(feature = "strings"))]
164 {
165 Token::Word(Word::StrangeWord)
166 }
167 } else {
168 let rs = s.replace("\u{fe0f}", "");
169 match EMOJIMAP.get(&rs as &str) {
170 Some(em) => Token::Word(Word::Emoji(em)),
171 None => match one_char_word(&rs) {
172 Some(c) if c.general_category_group() == GeneralCategoryGroup::Symbol => {
174 match c.general_category() {
175 GeneralCategory::CurrencySymbol => Token::Special(Special::Currency(c)),
176 _ => Token::Special(Special::Symbol(c)),
177 }
178 }
179 Some(_) | None => match has_word_parts {
180 true => {
181 #[cfg(feature = "strings")]
182 {
183 Token::Word(Word::StrangeWord(s.to_string()))
184 }
185 #[cfg(not(feature = "strings"))]
186 {
187 Token::Word(Word::StrangeWord)
188 }
189 }
190 false => {
191 #[cfg(feature = "strings")]
192 {
193 Token::Unicode(Unicode::String({
194 let mut us = "".to_string();
195 for c in rs.chars() {
196 if us != "" {
197 us += "_";
198 }
199 us += "u";
200 let ns = format!("{}", c.escape_unicode());
201 us += &ns[3..ns.len() - 1];
202 }
203 us
204 }))
205 }
206 #[cfg(not(feature = "strings"))]
207 {
208 Token::Unicode(Unicode::String)
209 }
210 }
211 },
212 },
213 }
214 }
215 }
216 fn basic_alphanumeric_to_pt(&mut self, s: &str) -> Token {
217 let mut digits = false;
230 let mut digits_begin_only = false;
231 let mut dots = false;
232 let mut alphas_and_apos = false;
233 let mut other = false;
234
235 let mut start_digit = true;
236 for c in s.chars() {
237 if start_digit && (!c.is_digit(10)) {
238 start_digit = false;
239 }
240 match c {
241 c @ _ if c.is_digit(10) => {
242 digits = true;
243 if start_digit {
244 digits_begin_only = true;
245 } else {
246 digits_begin_only = false;
247 }
248 }
249 c @ _ if c.is_alphabetic() => {
250 alphas_and_apos = true;
251 }
252 '\'' => {
253 alphas_and_apos = true;
254 }
255 '.' => {
256 dots = true;
257 }
258 _ => {
259 other = true;
260 }
261 }
262 }
263 Token::Word(
264 match (digits, digits_begin_only, dots, alphas_and_apos, other) {
265 (true, false, true, false, false) => {
266 #[cfg(feature = "strings")]
268 {
269 Word::Numerical(Numerical::DotSeparated(s.to_string()))
270 }
271 #[cfg(not(feature = "strings"))]
272 {
273 Word::Numerical(Numerical::DotSeparated)
274 }
275 }
276 (true, true, _, true, false) => {
277 #[cfg(feature = "strings")]
279 {
280 Word::Numerical(Numerical::Measures(s.to_string()))
281 }
282 #[cfg(not(feature = "strings"))]
283 {
284 Word::Numerical(Numerical::Measures)
285 }
286 }
287 (true, _, _, _, _) => {
288 #[cfg(feature = "strings")]
290 {
291 Word::Numerical(Numerical::Alphanumeric(s.to_string()))
292 }
293 #[cfg(not(feature = "strings"))]
294 {
295 Word::Numerical(Numerical::Alphanumeric)
296 }
297 }
298 (false, false, _, true, false) => {
299 #[cfg(feature = "strings")]
301 {
302 Word::Word(s.to_string())
303 }
304 #[cfg(not(feature = "strings"))]
305 {
306 Word::Word
307 }
308 }
309 (false, false, _, _, _) => {
310 #[cfg(feature = "strings")]
312 {
313 Word::StrangeWord(s.to_string())
314 }
315 #[cfg(not(feature = "strings"))]
316 {
317 Word::StrangeWord
318 }
319 }
320 (false, true, _, _, _) => unreachable!(),
321 },
322 )
323 }
324 fn basic_punctuation_to_pt(&mut self, c: char) -> Token {
325 Token::Special(Special::Punctuation(c))
326 }
327 fn check_hashtag(&mut self) -> Option<Local<Token>> {
369 if !self.allow_structs || (self.buffer.len() < 2) {
370 return None;
371 }
372
373 let (mut loc, bt) = self.buffer[0].into_inner();
374 let mut ln = 1;
375 let mut buf = String::new();
376 match bt {
377 BasicToken::Punctuation('#') => {
378 while ln < self.buffer.len() {
379 let (nloc, nbt) = self.buffer[ln].into_inner();
380 match nbt {
381 BasicToken::Punctuation('_') => match Local::from_segment(loc, nloc) {
382 Ok(lc) => {
383 #[cfg(feature = "strings")]
384 {
385 buf.push('_');
386 }
387 loc = lc;
388 ln += 1;
389 }
390 Err(_) => break,
391 },
392 BasicToken::Alphanumeric(_s) | BasicToken::Number(_s) => {
393 match Local::from_segment(loc, nloc) {
394 Ok(lc) => {
395 #[cfg(feature = "strings")]
396 {
397 buf += _s;
398 }
399 loc = lc;
400 ln += 1;
401 }
402 Err(_) => break,
403 }
404 }
405 BasicToken::Punctuation(..)
406 | BasicToken::Separator(..)
407 | BasicToken::Formatter(..)
408 | BasicToken::Mixed(..) => break,
409 }
410 }
411 match ln > 1 {
412 true => {
413 for _ in 0..ln {
414 self.buffer.pop_front();
415 }
416 Some(loc.local(Token::Struct({
417 #[cfg(feature = "strings")]
418 {
419 Struct::Hashtag(buf)
420 }
421 #[cfg(not(feature = "strings"))]
422 {
423 Struct::Hashtag
424 }
425 })))
426 }
427 false => None,
428 }
429 }
430 _ => None,
431 }
432 }
433 fn check_mention(&mut self) -> Option<Local<Token>> {
434 if !self.allow_structs || (self.buffer.len() < 2) {
435 return None;
436 }
437
438 let (mut loc, bt) = self.buffer[0].into_inner();
439 let mut ln = 1;
440 let mut buf = String::new();
441 match bt {
442 BasicToken::Punctuation('@') => {
443 while ln < self.buffer.len() {
444 let (nloc, nbt) = self.buffer[ln].into_inner();
445 match nbt {
446 BasicToken::Punctuation('_') => match Local::from_segment(loc, nloc) {
447 Ok(lc) => {
448 #[cfg(feature = "strings")]
449 {
450 buf.push('_');
451 }
452 loc = lc;
453 ln += 1;
454 }
455 Err(_) => break,
456 },
457 BasicToken::Alphanumeric(_s) | BasicToken::Number(_s) => {
458 match Local::from_segment(loc, nloc) {
459 Ok(lc) => {
460 #[cfg(feature = "strings")]
461 {
462 buf += _s;
463 }
464 loc = lc;
465 ln += 1;
466 }
467 Err(_) => break,
468 }
469 }
470 BasicToken::Punctuation(..)
471 | BasicToken::Separator(..)
472 | BasicToken::Formatter(..)
473 | BasicToken::Mixed(..) => break,
474 }
475 }
476 match ln > 1 {
477 true => {
478 for _ in 0..ln {
479 self.buffer.pop_front();
480 }
481 Some(loc.local(Token::Struct({
482 #[cfg(feature = "strings")]
483 {
484 Struct::Mention(buf)
485 }
486 #[cfg(not(feature = "strings"))]
487 {
488 Struct::Mention
489 }
490 })))
491 }
492 false => None,
493 }
494 }
495 _ => None,
496 }
497 }
498 fn next_from_buffer(&mut self) -> Option<Local<Token>> {
499 if let Some(t) = self.check_hashtag() {
501 return Some(t);
502 }
503 if let Some(t) = self.check_mention() {
504 return Some(t);
505 }
506 match self.buffer.pop_front() {
507 Some(local_tok) => {
508 let (local, tok) = local_tok.into_inner();
509 Some(local.local(match tok {
510 BasicToken::Alphanumeric(s) => self.basic_alphanumeric_to_pt(s),
511 BasicToken::Number(s) => self.basic_number_to_pt(s),
512 BasicToken::Punctuation(s) => self.basic_punctuation_to_pt(s),
513 BasicToken::Mixed(s) => self.basic_mixed_to_pt(s),
514 BasicToken::Separator(s) => self.basic_separator_to_pt(s),
515 BasicToken::Formatter(s) => self.basic_formater_to_pt(s),
516 }))
517 }
518 None => None,
519 }
520 }
521}