text_tokenizer/tokens.rs

use std::collections::{BTreeSet, VecDeque};
use std::str::FromStr;
use unicode_properties::{GeneralCategory, GeneralCategoryGroup, UnicodeGeneralCategory};

use text_parsing::Local;

use crate::{
    EMOJIMAP, Formatter, IntoTokenizer, Number, Numerical, SentenceBreaker, Separator, Special,
    Struct, Token, TokenizerOptions, TokenizerParams, Unicode, Word,
    wordbreaker::{BasicToken, WordBreaker, one_char_word},
};

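/// Tokenizing a `&str` produces a `Tokens` iterator configured by the given `TokenizerParams`.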
impl<'t> IntoTokenizer for &'t str {
    type IntoTokens = Tokens<'t>;

    fn into_tokenizer<S: SentenceBreaker>(self, params: TokenizerParams<S>) -> Self::IntoTokens {
        Tokens::new(self, &params.options)
    }
}

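/// Yields located tokens one at a time, refilling the internal buffer from the
/// underlying word breaker whenever it runs empty.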
impl<'t> Iterator for Tokens<'t> {
    type Item = Local<Token>;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if !self.buffer.is_empty() {
                return self.next_from_buffer();
            }
            // refill the buffer until a separator is seen or the input ends
            loop {
                match self.bounds.next() {
                    Some(local_bt) => {
                        let sep = matches!(local_bt.data(), BasicToken::Separator(_));
                        self.buffer.push_back(local_bt);
                        if sep {
                            break;
                        }
                    }
                    None if !self.buffer.is_empty() => break,
                    None => return None,
                }
            }
        }
    }
}

//#[derive(Debug)]
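/// Iterator adapter that converts the low-level `BasicToken` stream produced by
/// `WordBreaker` into high-level `Token`s. Tokens are buffered up to the next
/// separator so that multi-token structures (hashtags, mentions) can be recognized.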
pub struct Tokens<'t> {
    bounds: WordBreaker<'t>,
    buffer: VecDeque<Local<BasicToken<'t>>>,
    allow_structs: bool,
}
impl<'t> Tokens<'t> {
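    /// Builds a tokenizer over `s`; the `StructTokens` option enables hashtag and mention detection.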
    pub(crate) fn new<'a>(s: &'a str, options: &BTreeSet<TokenizerOptions>) -> Tokens<'a> {
        Tokens {
            bounds: WordBreaker::new(s, options),
            buffer: VecDeque::new(),
            allow_structs: options.contains(&TokenizerOptions::StructTokens),
        }
    }
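    /// Converts a separator character into `Special::Separator` (space, newline, tab, or other).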
    fn basic_separator_to_pt(&mut self, c: char) -> Token {
        Token::Special(Special::Separator(match c {
            ' ' => Separator::Space,
            '\n' => Separator::Newline,
            '\t' => Separator::Tab,
            _ => Separator::Char(c),
        }))
    }
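    /// Converts a formatting character into `Unicode::Formatter`; U+200D becomes `Formatter::Joiner`.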
    fn basic_formatter_to_pt(&mut self, c: char) -> Token {
        Token::Unicode(Unicode::Formatter(match c {
            '\u{200d}' => Formatter::Joiner,
            _ => Formatter::Char(c),
        }))
    }
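    /// Parses a digit token: integers with a leading `0` become `Number::ZeroInteger`, other
    /// integers `Number::Integer`, otherwise a float is tried; unparsable input falls back to a word.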
    fn basic_number_to_pt(&mut self, s: &str) -> Token {
        Token::Word(match i64::from_str(s) {
            Ok(n) => match s.chars().next() {
                Some('0') => {
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Number(Number::ZeroInteger { i: n })
                    }
                    #[cfg(feature = "strings")]
                    {
                        Word::Number(Number::ZeroInteger {
                            i: n,
                            s: s.to_string(),
                        })
                    }
                }
                Some(_) | None => Word::Number(Number::Integer(n)),
            },
            Err(_) => match f64::from_str(s) {
                Ok(n) => Word::Number(Number::Float(n)),
                Err(..) => {
                    #[cfg(feature = "strings")]
                    {
                        Word::Word(s.to_string())
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Word
                    }
                }
            },
        })
    }
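    /// Classifies a mixed token: a run of a single repeated whitespace or formatting character is
    /// folded back into a separator/formatter token; purely word-like content becomes a strange word;
    /// otherwise the text (with U+FE0F removed) is looked up as an emoji, a lone symbol becomes a
    /// currency/symbol token, and everything else ends up as a strange word or an escaped
    /// `Unicode::String`.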
    fn basic_mixed_to_pt(&mut self, s: &str) -> Token {
        let mut word = true;
        let mut has_word_parts = false;
        let mut first = true;
        let mut same = false;
        let mut one_c = ' ';
        for c in s.chars() {
            if c.is_alphanumeric()
                || c.is_digit(10)
                || (c.general_category_group() == GeneralCategoryGroup::Punctuation)
                || (c == '\u{0060}')
            {
                has_word_parts = true;
            } else {
                word = false;
            }
            if first {
                one_c = c;
                first = false;
                same = true;
            } else if one_c != c {
                same = false;
            }
        }
        if !first
            && same
            && (one_c.is_whitespace() || (one_c.general_category() == GeneralCategory::Format))
        {
            if one_c.is_whitespace() {
                return self.basic_separator_to_pt(one_c);
            } else {
                return self.basic_formatter_to_pt(one_c);
            }
        }
        if word {
            #[cfg(feature = "strings")]
            {
                Token::Word(Word::StrangeWord(s.to_string()))
            }
            #[cfg(not(feature = "strings"))]
            {
                Token::Word(Word::StrangeWord)
            }
        } else {
            let rs = s.replace("\u{fe0f}", "");
            match EMOJIMAP.get(&rs as &str) {
                Some(em) => Token::Word(Word::Emoji(em)),
                None => match one_char_word(&rs) {
                    //Some(c) if c.general_category() == GeneralCategory::ModifierSymbol => Token::UnicodeModifier(c),
                    Some(c) if c.general_category_group() == GeneralCategoryGroup::Symbol => {
                        match c.general_category() {
                            GeneralCategory::CurrencySymbol => Token::Special(Special::Currency(c)),
                            _ => Token::Special(Special::Symbol(c)),
                        }
                    }
                    Some(_) | None => match has_word_parts {
                        true => {
                            #[cfg(feature = "strings")]
                            {
                                Token::Word(Word::StrangeWord(s.to_string()))
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Token::Word(Word::StrangeWord)
                            }
                        }
                        false => {
                            #[cfg(feature = "strings")]
                            {
                                Token::Unicode(Unicode::String({
                                    let mut us = "".to_string();
                                    for c in rs.chars() {
                                        if !us.is_empty() {
                                            us += "_";
                                        }
                                        us += "u";
                                        let ns = format!("{}", c.escape_unicode());
                                        us += &ns[3..ns.len() - 1];
                                    }
                                    us
                                }))
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Token::Unicode(Unicode::String)
                            }
                        }
                    },
                },
            }
        }
    }
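    /// Classifies an alphanumeric token by the character classes it contains: digits separated by
    /// dots map to `Numerical::DotSeparated`, leading digits followed by letters to
    /// `Numerical::Measures`, other digit mixes to `Numerical::Alphanumeric`, plain letters (and
    /// apostrophes) to `Word::Word`, and anything else to `Word::StrangeWord`.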
    fn basic_alphanumeric_to_pt(&mut self, s: &str) -> Token {
        /*
        Word
        StrangeWord
        pub enum Numerical {
            Date(String),
            Ip(String),
            DotSeparated(String),
            Countable(String),
            Measures(String),
            Alphanumeric(String),
        }*/
        //let mut wrd = true;
        let mut digits = false;
        let mut digits_begin_only = false;
        let mut dots = false;
        let mut alphas_and_apos = false;
        let mut other = false;

        let mut start_digit = true;
        for c in s.chars() {
            if start_digit && (!c.is_digit(10)) {
                start_digit = false;
            }
            match c {
                c if c.is_digit(10) => {
                    digits = true;
                    if start_digit {
                        digits_begin_only = true;
                    } else {
                        digits_begin_only = false;
                    }
                }
                c if c.is_alphabetic() => {
                    alphas_and_apos = true;
                }
                '\'' => {
                    alphas_and_apos = true;
                }
                '.' => {
                    dots = true;
                }
                _ => {
                    other = true;
                }
            }
        }
        Token::Word(
            match (digits, digits_begin_only, dots, alphas_and_apos, other) {
                (true, false, true, false, false) => {
                    // TODO: Date, Ip, DotSeparated
                    #[cfg(feature = "strings")]
                    {
                        Word::Numerical(Numerical::DotSeparated(s.to_string()))
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Numerical(Numerical::DotSeparated)
                    }
                }
                (true, true, _, true, false) => {
                    // TODO: Countable or Measures
                    #[cfg(feature = "strings")]
                    {
                        Word::Numerical(Numerical::Measures(s.to_string()))
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Numerical(Numerical::Measures)
                    }
                }
                (true, _, _, _, _) => {
                    // Numerical trash, ids, etc.
                    #[cfg(feature = "strings")]
                    {
                        Word::Numerical(Numerical::Alphanumeric(s.to_string()))
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Numerical(Numerical::Alphanumeric)
                    }
                }
                (false, false, _, true, false) => {
                    // Word
                    #[cfg(feature = "strings")]
                    {
                        Word::Word(s.to_string())
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Word
                    }
                }
                (false, false, _, _, _) => {
                    // Strange
                    #[cfg(feature = "strings")]
                    {
                        Word::StrangeWord(s.to_string())
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::StrangeWord
                    }
                }
                (false, true, _, _, _) => unreachable!(),
            },
        )
    }
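    /// Wraps a punctuation character as `Special::Punctuation`.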
    fn basic_punctuation_to_pt(&mut self, c: char) -> Token {
        Token::Special(Special::Punctuation(c))
    }
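    /// Wraps a currency symbol as `Special::Currency`.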
    fn basic_currency_to_pt(&mut self, c: char) -> Token {
        Token::Special(Special::Currency(c))
    }
    /*fn check_url(&mut self) -> Option<PositionalToken> {
            if !self.allow_structs { return None; }
            let check = if self.buffer.len()>3 {
                match (&self.buffer[0],&self.buffer[1],&self.buffer[2]) {
                    (BasicToken::Alphanumeric("http"),BasicToken::Punctuation(":"),BasicToken::Punctuation("//")) |
                    (BasicToken::Alphanumeric("https"),BasicToken::Punctuation(":"),BasicToken::Punctuation("//")) => true,
                    _ => false,
                }
            } else { false };
            if check {
                let mut url = "".to_string();
                let tag_bound = None;
                loop {
                    if let Some(b) = tag_bound {
                        if (self.offset + url.len()) >= b { break; }
                    }
                    match self.buffer.pop_front() {
                        None => break,
                        Some(BasicToken::Separator(s)) => {
                            self.buffer.push_front(BasicToken::Separator(s));
                            break;
                        },
                        Some(BasicToken::Alphanumeric(s)) |
                        Some(BasicToken::Number(s)) |
                        Some(BasicToken::Punctuation(s)) |
                        Some(BasicToken::Formatter(s)) |
                        Some(BasicToken::Mixed(s)) => {
                            url += s;
                        },
                    }
                }
                let len = url.len();
                let tok = PositionalToken {
                    offset: self.offset,
                    length: len,
                    token: Token::Url(url),
                };
                self.offset += len;
                Some(tok)
            } else { None }
    }*/

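    /// Tries to recognize a hashtag at the front of the buffer: a `#` followed by alphanumeric,
    /// number, or `_` tokens. On success the consumed tokens are popped and a single
    /// `Struct::Hashtag` covering their joint location is returned.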
    // `buf` and its `mut` binding are only used when the "strings" feature is enabled
    #[allow(unused_mut)]
    #[allow(unused_variables)]
    fn check_hashtag(&mut self) -> Option<Local<Token>> {
        if !self.allow_structs || (self.buffer.len() < 2) {
            return None;
        }

        let (mut loc, bt) = self.buffer[0].into_inner();
        let mut ln = 1;
        let mut buf = String::new();
        match bt {
            BasicToken::Punctuation('#') => {
                while ln < self.buffer.len() {
                    let (nloc, nbt) = self.buffer[ln].into_inner();
                    match nbt {
                        BasicToken::Punctuation('_') => match Local::from_segment(loc, nloc) {
                            Ok(lc) => {
                                #[cfg(feature = "strings")]
                                {
                                    buf.push('_');
                                }
                                loc = lc;
                                ln += 1;
                            }
                            Err(_) => break,
                        },
                        BasicToken::Alphanumeric(_s) | BasicToken::Number(_s) => {
                            match Local::from_segment(loc, nloc) {
                                Ok(lc) => {
                                    #[cfg(feature = "strings")]
                                    {
                                        buf += _s;
                                    }
                                    loc = lc;
                                    ln += 1;
                                }
                                Err(_) => break,
                            }
                        }
                        BasicToken::Punctuation(..)
                        | BasicToken::CurrencySymbol(..)
                        | BasicToken::Separator(..)
                        | BasicToken::Formatter(..)
                        | BasicToken::Mixed(..) => break,
                    }
                }
                match ln > 1 {
                    true => {
                        for _ in 0..ln {
                            self.buffer.pop_front();
                        }
                        Some(loc.local(Token::Struct({
                            #[cfg(feature = "strings")]
                            {
                                Struct::Hashtag(buf)
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Struct::Hashtag
                            }
                        })))
                    }
                    false => None,
                }
            }
            _ => None,
        }
    }

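    /// Tries to recognize a mention at the front of the buffer: an `@` followed by alphanumeric,
    /// number, or `_` tokens, merged into a single `Struct::Mention`.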
    // `buf` and its `mut` binding are only used when the "strings" feature is enabled
    #[allow(unused_mut)]
    #[allow(unused_variables)]
    fn check_mention(&mut self) -> Option<Local<Token>> {
        if !self.allow_structs || (self.buffer.len() < 2) {
            return None;
        }

        let (mut loc, bt) = self.buffer[0].into_inner();
        let mut ln = 1;
        let mut buf = String::new();
        match bt {
            BasicToken::Punctuation('@') => {
                while ln < self.buffer.len() {
                    let (nloc, nbt) = self.buffer[ln].into_inner();
                    match nbt {
                        BasicToken::Punctuation('_') => match Local::from_segment(loc, nloc) {
                            Ok(lc) => {
                                #[cfg(feature = "strings")]
                                {
                                    buf.push('_');
                                }
                                loc = lc;
                                ln += 1;
                            }
                            Err(_) => break,
                        },
                        BasicToken::Alphanumeric(_s) | BasicToken::Number(_s) => {
                            match Local::from_segment(loc, nloc) {
                                Ok(lc) => {
                                    #[cfg(feature = "strings")]
                                    {
                                        buf += _s;
                                    }
                                    loc = lc;
                                    ln += 1;
                                }
                                Err(_) => break,
                            }
                        }
                        BasicToken::Punctuation(..)
                        | BasicToken::CurrencySymbol(..)
                        | BasicToken::Separator(..)
                        | BasicToken::Formatter(..)
                        | BasicToken::Mixed(..) => break,
                    }
                }
                match ln > 1 {
                    true => {
                        for _ in 0..ln {
                            self.buffer.pop_front();
                        }
                        Some(loc.local(Token::Struct({
                            #[cfg(feature = "strings")]
                            {
                                Struct::Mention(buf)
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Struct::Mention
                            }
                        })))
                    }
                    false => None,
                }
            }
            _ => None,
        }
    }
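    /// Converts the next buffered basic token into a `Token`, after first checking whether the
    /// buffer starts with a hashtag or a mention.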
    fn next_from_buffer(&mut self) -> Option<Local<Token>> {
        //if let Some(t) = self.check_url() { return Some(t); }
        if let Some(t) = self.check_hashtag() {
            return Some(t);
        }
        if let Some(t) = self.check_mention() {
            return Some(t);
        }
        match self.buffer.pop_front() {
            Some(local_tok) => {
                let (local, tok) = local_tok.into_inner();
                Some(local.local(match tok {
                    BasicToken::Alphanumeric(s) => self.basic_alphanumeric_to_pt(s),
                    BasicToken::Number(s) => self.basic_number_to_pt(s),
                    BasicToken::Punctuation(s) => self.basic_punctuation_to_pt(s),
                    BasicToken::CurrencySymbol(s) => self.basic_currency_to_pt(s),
                    BasicToken::Mixed(s) => self.basic_mixed_to_pt(s),
                    BasicToken::Separator(s) => self.basic_separator_to_pt(s),
                    BasicToken::Formatter(s) => self.basic_formatter_to_pt(s),
                }))
            }
            None => None,
        }
    }
}