text_tokenizer/tokens.rs

use std::collections::{BTreeSet, VecDeque};
use unicode_properties::{GeneralCategory, GeneralCategoryGroup, UnicodeGeneralCategory};

use text_parsing::Local;

use crate::{
    EMOJIMAP, Formatter, IntoTokenizer, Numerical, SentenceBreaker, Separator, Special, Struct,
    Token, TokenizerOptions, TokenizerParams, Unicode, Word,
    numbers::NumberChecker,
    wordbreaker::{BasicToken, WordBreaker, one_char_word},
};

impl<'t> IntoTokenizer for &'t str {
    type IntoTokens = Tokens<'t>;

    fn into_tokenizer<S: SentenceBreaker>(self, params: TokenizerParams<S>) -> Self::IntoTokens {
        Tokens::new(self, &params.options)
    }
}

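// Iteration strategy (a descriptive note on the code below): `BasicToken`s are
// pulled from the `WordBreaker` into `buffer` until a separator or the end of
// input is reached, and only then is the buffer drained through
// `next_from_buffer`, so that multi-token structures such as hashtags and
// mentions can be recognised before individual tokens are emitted.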
impl<'t> Iterator for Tokens<'t> {
    type Item = Local<Token>;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if !self.buffer.is_empty() {
                return self.next_from_buffer();
            }
            // Refill the buffer up to (and including) the next separator;
            // the outer loop then drains it.
            loop {
                match self.bounds.next() {
                    Some(local_bt) => {
                        let sep = matches!(local_bt.data(), BasicToken::Separator(_));
                        self.buffer.push_back(local_bt);
                        if sep {
                            break;
                        }
                    }
                    None if !self.buffer.is_empty() => break,
                    None => return None,
                }
            }
        }
    }
}
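// A hedged usage sketch of the `IntoTokenizer` impl above. How a
// `TokenizerParams<S>` is built is not shown in this file, so the
// `TokenizerParams::default()` call below is an assumption, not a confirmed API:
//
//     let params = TokenizerParams::default(); // assumed constructor
//     for tok in "word #tag @user".into_tokenizer(params) {
//         // each `tok` is a `Local<Token>`, i.e. a token plus its source span
//     }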

//#[derive(Debug)]
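/// Streaming tokenizer over a borrowed `&str`.
///
/// `bounds` produces low-level `BasicToken`s, `buffer` holds the basic tokens
/// of the current separator-delimited run, and `allow_structs` (set from
/// `TokenizerOptions::StructTokens`) enables hashtag and mention detection.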
pub struct Tokens<'t> {
    bounds: WordBreaker<'t>,
    buffer: VecDeque<Local<BasicToken<'t>>>,
    allow_structs: bool,
}
impl<'t> Tokens<'t> {
    pub(crate) fn new<'a>(s: &'a str, options: &BTreeSet<TokenizerOptions>) -> Tokens<'a> {
        Tokens {
            bounds: WordBreaker::new(s, options),
            buffer: VecDeque::new(),
            allow_structs: options.contains(&TokenizerOptions::StructTokens),
        }
    }
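    /// Maps a separator character onto `Special::Separator`, special-casing
    /// space, newline and tab; any other character becomes `Separator::Char(c)`.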
    fn basic_separator_to_pt(&mut self, c: char) -> Token {
        Token::Special(Special::Separator(match c {
            ' ' => Separator::Space,
            '\n' => Separator::Newline,
            '\t' => Separator::Tab,
            _ => Separator::Char(c),
        }))
    }
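    /// Maps a formatter character onto `Unicode::Formatter`; U+200D (the
    /// zero-width joiner) becomes `Formatter::Joiner`, anything else
    /// `Formatter::Char(c)`.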
    fn basic_formatter_to_pt(&mut self, c: char) -> Token {
        Token::Unicode(Unicode::Formatter(match c {
            '\u{200d}' => Formatter::Joiner,
            _ => Formatter::Char(c),
        }))
    }
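    /// Converts a numeric run using its accompanying `NumberChecker`; if no
    /// number can be extracted, the text falls back to a plain `Word::Word`.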
    fn basic_number_to_pt(&mut self, _s: &str, num: NumberChecker) -> Token {
        Token::Word(match num.into_number() {
            Some(num) => Word::Number(num),
            None => {
                #[cfg(feature = "strings")]
                {
                    Word::Word(_s.to_string())
                }
                #[cfg(not(feature = "strings"))]
                {
                    Word::Word
                }
            }
        })
    }
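    /// Classifies a "mixed" run: a run consisting of a single repeated
    /// whitespace or format character is re-dispatched as a separator or
    /// formatter token; otherwise the run (with U+FE0F variation selectors
    /// stripped) is tried as an emoji via `EMOJIMAP`, as a single symbol or
    /// currency character, as a strange word, or falls back to a
    /// `Unicode::String` of escaped code points.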
    fn basic_mixed_to_pt(&mut self, s: &str) -> Token {
        let mut word = true;
        let mut has_word_parts = false;
        let mut first = true;
        let mut same = false;
        let mut one_c = ' ';
        for c in s.chars() {
            match c.is_alphanumeric()
                || c.is_digit(10)
                || (c.general_category_group() == GeneralCategoryGroup::Punctuation)
                || (c == '\u{0060}')
            {
                true => {
                    has_word_parts = true;
                }
                false => {
                    word = false;
                }
            }
            match first {
                true => {
                    one_c = c;
                    first = false;
                    same = true;
                }
                false => {
                    if one_c != c {
                        same = false;
                    }
                }
            }
        }
        if !first
            && same
            && (one_c.is_whitespace() || (one_c.general_category() == GeneralCategory::Format))
        {
            if one_c.is_whitespace() {
                return self.basic_separator_to_pt(one_c);
            } else {
                return self.basic_formatter_to_pt(one_c);
            }
        }
        if word {
            #[cfg(feature = "strings")]
            {
                Token::Word(Word::StrangeWord(s.to_string()))
            }
            #[cfg(not(feature = "strings"))]
            {
                Token::Word(Word::StrangeWord)
            }
        } else {
            let rs = s.replace("\u{fe0f}", "");
            match EMOJIMAP.get(&rs as &str) {
                Some(em) => Token::Word(Word::Emoji(em)),
                None => match one_char_word(&rs) {
                    //Some(c) if c.general_category() == GeneralCategory::ModifierSymbol => Token::UnicodeModifier(c),
                    Some(c) if c.general_category_group() == GeneralCategoryGroup::Symbol => {
                        match c.general_category() {
                            GeneralCategory::CurrencySymbol => Token::Special(Special::Currency(c)),
                            _ => Token::Special(Special::Symbol(c)),
                        }
                    }
                    Some(_) | None => match has_word_parts {
                        true => {
                            #[cfg(feature = "strings")]
                            {
                                Token::Word(Word::StrangeWord(s.to_string()))
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Token::Word(Word::StrangeWord)
                            }
                        }
                        false => {
                            #[cfg(feature = "strings")]
                            {
                                Token::Unicode(Unicode::String({
                                    let mut us = String::new();
                                    for c in rs.chars() {
                                        if !us.is_empty() {
                                            us += "_";
                                        }
                                        us += "u";
                                        let ns = format!("{}", c.escape_unicode());
                                        us += &ns[3..ns.len() - 1];
                                    }
                                    us
                                }))
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Token::Unicode(Unicode::String)
                            }
                        }
                    },
                },
            }
        }
    }
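    /// Classifies an alphanumeric run into `Word`, `StrangeWord` or one of the
    /// `Numerical` variants, based on which character classes occur in it
    /// (digits, digits only at the start, dots, letters/apostrophes, other).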
    fn basic_alphanumeric_to_pt(&mut self, s: &str) -> Token {
        /*
        Word
        StrangeWord
        pub enum Numerical {
            Date(String),
            Ip(String),
            DotSeparated(String),
            Countable(String),
            Measures(String),
            Alphanumeric(String),
        }*/
        //let mut wrd = true;
        let mut digits = false;
        let mut digits_begin_only = false;
        let mut dots = false;
        let mut alphas_and_apos = false;
        let mut other = false;

        let mut start_digit = true;
        for c in s.chars() {
            if start_digit && (!c.is_digit(10)) {
                start_digit = false;
            }
            match c {
                c if c.is_digit(10) => {
                    digits = true;
                    digits_begin_only = start_digit;
                }
                c if c.is_alphabetic() => {
                    alphas_and_apos = true;
                }
                '\'' => {
                    alphas_and_apos = true;
                }
                '.' => {
                    dots = true;
                }
                _ => {
                    other = true;
                }
            }
        }
        Token::Word(
            match (digits, digits_begin_only, dots, alphas_and_apos, other) {
                (true, false, true, false, false) => {
                    // TODO: Date, Ip, DotSeparated
                    #[cfg(feature = "strings")]
                    {
                        Word::Numerical(Numerical::DotSeparated(s.to_string()))
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Numerical(Numerical::DotSeparated)
                    }
                }
                (true, true, _, true, false) => {
                    // TODO: Countable or Measures
                    #[cfg(feature = "strings")]
                    {
                        Word::Numerical(Numerical::Measures(s.to_string()))
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Numerical(Numerical::Measures)
                    }
                }
                (true, _, _, _, _) => {
                    // Numerical trash, ids, etc.
                    #[cfg(feature = "strings")]
                    {
                        Word::Numerical(Numerical::Alphanumeric(s.to_string()))
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Numerical(Numerical::Alphanumeric)
                    }
                }
                (false, false, _, true, false) => {
                    // Word
                    #[cfg(feature = "strings")]
                    {
                        Word::Word(s.to_string())
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Word
                    }
                }
                (false, false, _, _, _) => {
                    // Strange
                    #[cfg(feature = "strings")]
                    {
                        Word::StrangeWord(s.to_string())
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::StrangeWord
                    }
                }
                // `digits_begin_only` can only be set together with `digits`.
                (false, true, _, _, _) => unreachable!(),
            },
        )
    }
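    /// Wraps a punctuation character as `Special::Punctuation`.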
    fn basic_punctuation_to_pt(&mut self, c: char) -> Token {
        Token::Special(Special::Punctuation(c))
    }
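    /// Wraps a currency symbol as `Special::Currency`.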
    fn basic_currency_to_pt(&mut self, c: char) -> Token {
        Token::Special(Special::Currency(c))
    }
    /*fn check_url(&mut self) -> Option<PositionalToken> {
            if !self.allow_structs { return None; }
            let check = if self.buffer.len()>3 {
                match (&self.buffer[0],&self.buffer[1],&self.buffer[2]) {
                    (BasicToken::Alphanumeric("http"),BasicToken::Punctuation(":"),BasicToken::Punctuation("//")) |
                    (BasicToken::Alphanumeric("https"),BasicToken::Punctuation(":"),BasicToken::Punctuation("//")) => true,
                    _ => false,
                }
            } else { false };
            if check {
                let mut url = "".to_string();
                let tag_bound = None;
                loop {
                    if let Some(b) = tag_bound {
                        if (self.offset + url.len()) >= b { break; }
                    }
                    match self.buffer.pop_front() {
                        None => break,
                        Some(BasicToken::Separator(s)) => {
                            self.buffer.push_front(BasicToken::Separator(s));
                            break;
                        },
                        Some(BasicToken::Alphanumeric(s)) |
                        Some(BasicToken::Number(s)) |
                        Some(BasicToken::Punctuation(s)) |
                        Some(BasicToken::Formatter(s)) |
                        Some(BasicToken::Mixed(s)) => {
                            url += s;
                        },
                    }
                }
                let len = url.len();
                let tok = PositionalToken {
                    offset: self.offset,
                    length: len,
                    token: Token::Url(url),
                };
                self.offset += len;
                Some(tok)
            } else { None }
    }*/

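    /// Tries to recognise a hashtag at the front of the buffer: a `#`
    /// punctuation token followed by alphanumeric, number or `_` tokens. On
    /// success the consumed tokens are popped, their spans are merged via
    /// `Local::from_segment`, and a single `Struct::Hashtag` token covering
    /// the whole range is returned.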
    // allowed because of feature "strings"
    #[allow(unused_mut)]
    #[allow(unused_variables)]
    fn check_hashtag(&mut self) -> Option<Local<Token>> {
        if !self.allow_structs || (self.buffer.len() < 2) {
            return None;
        }

        let (mut loc, bt) = self.buffer[0].into_inner();
        let mut ln = 1;
        let mut buf = String::new();
        match bt {
            BasicToken::Punctuation('#') => {
                while ln < self.buffer.len() {
                    let (nloc, nbt) = self.buffer[ln].into_inner();
                    match nbt {
                        BasicToken::Punctuation('_') => match Local::from_segment(loc, nloc) {
                            Ok(lc) => {
                                #[cfg(feature = "strings")]
                                {
                                    buf.push('_');
                                }
                                loc = lc;
                                ln += 1;
                            }
                            Err(_) => break,
                        },
                        BasicToken::Alphanumeric(_s) | BasicToken::Number(_s, _) => {
                            match Local::from_segment(loc, nloc) {
                                Ok(lc) => {
                                    #[cfg(feature = "strings")]
                                    {
                                        buf += _s;
                                    }
                                    loc = lc;
                                    ln += 1;
                                }
                                Err(_) => break,
                            }
                        }
                        BasicToken::Punctuation(..)
                        | BasicToken::CurrencySymbol(..)
                        | BasicToken::Separator(..)
                        | BasicToken::Formatter(..)
                        | BasicToken::Mixed(..) => break,
                    }
                }
                match ln > 1 {
                    true => {
                        for _ in 0..ln {
                            self.buffer.pop_front();
                        }
                        Some(loc.local(Token::Struct({
                            #[cfg(feature = "strings")]
                            {
                                Struct::Hashtag(buf)
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Struct::Hashtag
                            }
                        })))
                    }
                    false => None,
                }
            }
            _ => None,
        }
    }

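    /// Tries to recognise a mention at the front of the buffer: an `@`
    /// punctuation token followed by alphanumeric, number or `_` tokens,
    /// producing a single `Struct::Mention` token over the merged span.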
    // allowed because of feature "strings"
    #[allow(unused_mut)]
    #[allow(unused_variables)]
    fn check_mention(&mut self) -> Option<Local<Token>> {
        if !self.allow_structs || (self.buffer.len() < 2) {
            return None;
        }

        let (mut loc, bt) = self.buffer[0].into_inner();
        let mut ln = 1;
        let mut buf = String::new();
        match bt {
            BasicToken::Punctuation('@') => {
                while ln < self.buffer.len() {
                    let (nloc, nbt) = self.buffer[ln].into_inner();
                    match nbt {
                        BasicToken::Punctuation('_') => match Local::from_segment(loc, nloc) {
                            Ok(lc) => {
                                #[cfg(feature = "strings")]
                                {
                                    buf.push('_');
                                }
                                loc = lc;
                                ln += 1;
                            }
                            Err(_) => break,
                        },
                        BasicToken::Alphanumeric(_s) | BasicToken::Number(_s, _) => {
                            match Local::from_segment(loc, nloc) {
                                Ok(lc) => {
                                    #[cfg(feature = "strings")]
                                    {
                                        buf += _s;
                                    }
                                    loc = lc;
                                    ln += 1;
                                }
                                Err(_) => break,
                            }
                        }
                        BasicToken::Punctuation(..)
                        | BasicToken::CurrencySymbol(..)
                        | BasicToken::Separator(..)
                        | BasicToken::Formatter(..)
                        | BasicToken::Mixed(..) => break,
                    }
                }
                match ln > 1 {
                    true => {
                        for _ in 0..ln {
                            self.buffer.pop_front();
                        }
                        Some(loc.local(Token::Struct({
                            #[cfg(feature = "strings")]
                            {
                                Struct::Mention(buf)
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Struct::Mention
                            }
                        })))
                    }
                    false => None,
                }
            }
            _ => None,
        }
    }
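    /// Pops the next token from the buffer, first giving hashtag and mention
    /// detection a chance to consume several basic tokens at once.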
    fn next_from_buffer(&mut self) -> Option<Local<Token>> {
        //if let Some(t) = self.check_url() { return Some(t); }
        if let Some(t) = self.check_hashtag() {
            return Some(t);
        }
        if let Some(t) = self.check_mention() {
            return Some(t);
        }
        match self.buffer.pop_front() {
            Some(local_tok) => {
                let (local, tok) = local_tok.into_inner();
                Some(local.local(match tok {
                    BasicToken::Alphanumeric(s) => self.basic_alphanumeric_to_pt(s),
                    BasicToken::Number(s, num) => self.basic_number_to_pt(s, num),
                    BasicToken::Punctuation(s) => self.basic_punctuation_to_pt(s),
                    BasicToken::CurrencySymbol(s) => self.basic_currency_to_pt(s),
                    BasicToken::Mixed(s) => self.basic_mixed_to_pt(s),
                    BasicToken::Separator(s) => self.basic_separator_to_pt(s),
                    BasicToken::Formatter(s) => self.basic_formatter_to_pt(s),
                }))
            }
            None => None,
        }
    }
}