text_tokenizer/
tokens.rs

use std::collections::{BTreeSet, VecDeque};
use std::str::FromStr;
use unicode_properties::{GeneralCategory, GeneralCategoryGroup, UnicodeGeneralCategory};

use text_parsing::Local;

use crate::{
    wordbreaker::{one_char_word, BasicToken, WordBreaker},
    Formatter, IntoTokenizer, Number, Numerical, SentenceBreaker, Separator, Special, Struct,
    Token, TokenizerOptions, TokenizerParams, Unicode, Word, EMOJIMAP,
};

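// Tokenizing a borrowed `&str` just wraps it in the `Tokens` iterator defined below.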
impl<'t> IntoTokenizer for &'t str {
    type IntoTokens = Tokens<'t>;

    fn into_tokenizer<S: SentenceBreaker>(self, params: TokenizerParams<S>) -> Self::IntoTokens {
        Tokens::new(self, &params.options)
    }
}

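// The iterator pulls `BasicToken`s from the word breaker into a buffer until a separator
// (or end of input) closes the current run, then drains the buffer as high-level tokens.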
impl<'t> Iterator for Tokens<'t> {
    type Item = Local<Token>;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if !self.buffer.is_empty() {
                return self.next_from_buffer();
            }
            loop {
                match self.bounds.next() {
                    Some(local_bt) => {
                        let sep = matches!(local_bt.data(), BasicToken::Separator(_));
                        self.buffer.push_back(local_bt);
                        if sep {
                            break;
                        }
                    }
                    None if !self.buffer.is_empty() => break,
                    None => return None,
                }
            }
        }
    }
}

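/// Token stream over a borrowed string: low-level `BasicToken`s produced by the
/// `WordBreaker` are buffered and converted into `Token`s on demand.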
//#[derive(Debug)]
pub struct Tokens<'t> {
    bounds: WordBreaker<'t>,
    buffer: VecDeque<Local<BasicToken<'t>>>,
    allow_structs: bool,
}
impl<'t> Tokens<'t> {
    pub(crate) fn new<'a>(s: &'a str, options: &BTreeSet<TokenizerOptions>) -> Tokens<'a> {
        Tokens {
            bounds: WordBreaker::new(s, &options),
            buffer: VecDeque::new(),
            allow_structs: options.contains(&TokenizerOptions::StructTokens),
        }
    }
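    // One-character separators and Unicode formatters map directly onto their Token variants.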
    fn basic_separator_to_pt(&mut self, c: char) -> Token {
        Token::Special(Special::Separator(match c {
            ' ' => Separator::Space,
            '\n' => Separator::Newline,
            '\t' => Separator::Tab,
            _ => Separator::Char(c),
        }))
    }
    fn basic_formatter_to_pt(&mut self, c: char) -> Token {
        Token::Unicode(Unicode::Formatter(match c {
            '\u{200d}' => Formatter::Joiner,
            _ => Formatter::Char(c),
        }))
    }
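    // Integers with a leading zero are tagged `ZeroInteger` (keeping the raw text under the
    // `strings` feature), since the leading zero is lost in the parsed value; everything else
    // becomes Integer, Float, or a fallback Word.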
    fn basic_number_to_pt(&mut self, s: &str) -> Token {
        Token::Word(match i64::from_str(s) {
            Ok(n) => match s.chars().next() {
                Some('0') => {
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Number(Number::ZeroInteger { i: n })
                    }
                    #[cfg(feature = "strings")]
                    {
                        Word::Number(Number::ZeroInteger {
                            i: n,
                            s: s.to_string(),
                        })
                    }
                }
                Some(_) | None => Word::Number(Number::Integer(n)),
            },
            Err(_) => match f64::from_str(s) {
                Ok(n) => Word::Number(Number::Float(n)),
                Err(..) => {
                    #[cfg(feature = "strings")]
                    {
                        Word::Word(s.to_string())
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Word
                    }
                }
            },
        })
    }
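    // Classifies a mixed-content run: a run of one repeated whitespace/format character
    // degrades to a separator or formatter token; otherwise try the emoji map, then a
    // single symbol, and finally fall back to StrangeWord or a `u<hex>` escape string,
    // depending on whether any word-like characters were seen.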
    fn basic_mixed_to_pt(&mut self, s: &str) -> Token {
        let mut word = true;
        let mut has_word_parts = false;
        let mut first = true;
        let mut same = false;
        let mut one_c = ' ';
        for c in s.chars() {
            if c.is_alphanumeric()
                || c.is_digit(10)
                || (c.general_category_group() == GeneralCategoryGroup::Punctuation)
                || (c == '\u{0060}')
            {
                has_word_parts = true;
            } else {
                word = false;
            }
            if first {
                one_c = c;
                first = false;
                same = true;
            } else if one_c != c {
                same = false;
            }
        }
        if !first
            && same
            && (one_c.is_whitespace() || (one_c.general_category() == GeneralCategory::Format))
        {
            if one_c.is_whitespace() {
                return self.basic_separator_to_pt(one_c);
            } else {
                return self.basic_formatter_to_pt(one_c);
            }
        }
        if word {
            #[cfg(feature = "strings")]
            {
                Token::Word(Word::StrangeWord(s.to_string()))
            }
            #[cfg(not(feature = "strings"))]
            {
                Token::Word(Word::StrangeWord)
            }
        } else {
            let rs = s.replace('\u{fe0f}', "");
            match EMOJIMAP.get(&rs as &str) {
                Some(em) => Token::Word(Word::Emoji(em)),
                None => match one_char_word(&rs) {
                    //Some(c) if c.general_category() == GeneralCategory::ModifierSymbol => Token::UnicodeModifier(c),
                    Some(c) if c.general_category_group() == GeneralCategoryGroup::Symbol => {
                        Token::Special(Special::Symbol(c))
                    }
                    Some(_) | None => {
                        if has_word_parts {
                            #[cfg(feature = "strings")]
                            {
                                Token::Word(Word::StrangeWord(s.to_string()))
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Token::Word(Word::StrangeWord)
                            }
                        } else {
                            #[cfg(feature = "strings")]
                            {
                                Token::Unicode(Unicode::String({
                                    // Build an identifier like "u1f600_ufe0f" from the escaped
                                    // code points of the remaining characters.
                                    let mut us = String::new();
                                    for c in rs.chars() {
                                        if !us.is_empty() {
                                            us += "_";
                                        }
                                        us += "u";
                                        let ns = c.escape_unicode().to_string();
                                        us += &ns[3..ns.len() - 1];
                                    }
                                    us
                                }))
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Token::Unicode(Unicode::String)
                            }
                        }
                    }
                },
            }
        }
    }
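    // Collects flags (digits, leading-digit run, dots, letters/apostrophes, other) in one pass
    // and maps the combination onto a `Word` variant: dot-separated numbers, measures, generic
    // alphanumerics, plain words, or StrangeWord.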
    fn basic_alphanumeric_to_pt(&mut self, s: &str) -> Token {
        /*
        Word
        StrangeWord
        pub enum Numerical {
            Date(String),
            Ip(String),
            DotSeparated(String),
            Countable(String),
            Measures(String),
            Alphanumeric(String),
        }*/
        //let mut wrd = true;
        let mut digits = false;
        let mut digits_begin_only = false;
        let mut dots = false;
        let mut alphas_and_apos = false;
        let mut other = false;

        let mut start_digit = true;
        for c in s.chars() {
            if start_digit && !c.is_digit(10) {
                start_digit = false;
            }
            match c {
                c if c.is_digit(10) => {
                    digits = true;
                    digits_begin_only = start_digit;
                }
                c if c.is_alphabetic() => {
                    alphas_and_apos = true;
                }
                '\'' => {
                    alphas_and_apos = true;
                }
                '.' => {
                    dots = true;
                }
                _ => {
                    other = true;
                }
            }
        }
        Token::Word(
            match (digits, digits_begin_only, dots, alphas_and_apos, other) {
                (true, false, true, false, false) => {
                    // TODO: Date, Ip, DotSeparated
                    #[cfg(feature = "strings")]
                    {
                        Word::Numerical(Numerical::DotSeparated(s.to_string()))
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Numerical(Numerical::DotSeparated)
                    }
                }
                (true, true, _, true, false) => {
                    // TODO: Countable or Measures
                    #[cfg(feature = "strings")]
                    {
                        Word::Numerical(Numerical::Measures(s.to_string()))
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Numerical(Numerical::Measures)
                    }
                }
                (true, _, _, _, _) => {
                    // Numerical trash, ids, etc.
                    #[cfg(feature = "strings")]
                    {
                        Word::Numerical(Numerical::Alphanumeric(s.to_string()))
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Numerical(Numerical::Alphanumeric)
                    }
                }
                (false, false, _, true, false) => {
                    // Word
                    #[cfg(feature = "strings")]
                    {
                        Word::Word(s.to_string())
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Word
                    }
                }
                (false, false, _, _, _) => {
                    // Strange
                    #[cfg(feature = "strings")]
                    {
                        Word::StrangeWord(s.to_string())
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::StrangeWord
                    }
                }
                (false, true, _, _, _) => unreachable!(),
            },
        )
    }
    fn basic_punctuation_to_pt(&mut self, c: char) -> Token {
        Token::Special(Special::Punctuation(c))
    }
    /*fn check_url(&mut self) -> Option<PositionalToken> {
        if !self.allow_structs { return None; }
        let check = if self.buffer.len()>3 {
            match (&self.buffer[0],&self.buffer[1],&self.buffer[2]) {
                (BasicToken::Alphanumeric("http"),BasicToken::Punctuation(":"),BasicToken::Punctuation("//")) |
                (BasicToken::Alphanumeric("https"),BasicToken::Punctuation(":"),BasicToken::Punctuation("//")) => true,
                _ => false,
            }
        } else { false };
        if check {
            let mut url = "".to_string();
            let tag_bound = None;
            loop {
                if let Some(b) = tag_bound {
                    if (self.offset + url.len()) >= b { break; }
                }
                match self.buffer.pop_front() {
                    None => break,
                    Some(BasicToken::Separator(s)) => {
                        self.buffer.push_front(BasicToken::Separator(s));
                        break;
                    },
                    Some(BasicToken::Alphanumeric(s)) |
                    Some(BasicToken::Number(s)) |
                    Some(BasicToken::Punctuation(s)) |
                    Some(BasicToken::Formatter(s)) |
                    Some(BasicToken::Mixed(s)) => {
                        url += s;
                    },
                }
            }
            let len = url.len();
            let tok = PositionalToken {
                offset: self.offset,
                length: len,
                token: Token::Url(url),
            };
            self.offset += len;
            Some(tok)
        } else { None }
    }*/
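    // When struct tokens are enabled, a '#' punctuation token followed immediately by an
    // alphanumeric or numeric token collapses into a single Hashtag token.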
    fn check_hashtag(&mut self) -> Option<Local<Token>> {
        if !self.allow_structs || (self.buffer.len() < 2) {
            return None;
        }

        let (loc1, s1) = self.buffer[0].into_inner();
        let (loc2, s2) = self.buffer[1].into_inner();
        match (s1, s2) {
            (BasicToken::Punctuation('#'), BasicToken::Alphanumeric(_s))
            | (BasicToken::Punctuation('#'), BasicToken::Number(_s)) => {
                match Local::from_segment(loc1, loc2) {
                    Ok(local) => {
                        self.buffer.pop_front();
                        self.buffer.pop_front();

                        Some(local.local(Token::Struct({
                            #[cfg(feature = "strings")]
                            {
                                Struct::Hashtag(_s.to_string())
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Struct::Hashtag
                            }
                        })))
                    }
                    Err(_) => None,
                }
            }
            _ => None,
        }
    }
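    // Same shape as check_hashtag, but collapsing an '@' prefix into a Mention token.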
    fn check_mention(&mut self) -> Option<Local<Token>> {
        if !self.allow_structs || (self.buffer.len() < 2) {
            return None;
        }

        let (loc1, s1) = self.buffer[0].into_inner();
        let (loc2, s2) = self.buffer[1].into_inner();
        match (s1, s2) {
            (BasicToken::Punctuation('@'), BasicToken::Alphanumeric(_s))
            | (BasicToken::Punctuation('@'), BasicToken::Number(_s)) => {
                match Local::from_segment(loc1, loc2) {
                    Ok(local) => {
                        self.buffer.pop_front();
                        self.buffer.pop_front();

                        Some(local.local(Token::Struct({
                            #[cfg(feature = "strings")]
                            {
                                Struct::Mention(_s.to_string())
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Struct::Mention
                            }
                        })))
                    }
                    Err(_) => None,
                }
            }
            _ => None,
        }
    }
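    // Emits the next token from the buffered run: multi-token structs (hashtag, mention)
    // are tried first, then the front BasicToken is converted one-to-one.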
    fn next_from_buffer(&mut self) -> Option<Local<Token>> {
        //if let Some(t) = self.check_url() { return Some(t); }
        if let Some(t) = self.check_hashtag() {
            return Some(t);
        }
        if let Some(t) = self.check_mention() {
            return Some(t);
        }
        match self.buffer.pop_front() {
            Some(local_tok) => {
                let (local, tok) = local_tok.into_inner();
                Some(local.local(match tok {
                    BasicToken::Alphanumeric(s) => self.basic_alphanumeric_to_pt(s),
                    BasicToken::Number(s) => self.basic_number_to_pt(s),
                    BasicToken::Punctuation(s) => self.basic_punctuation_to_pt(s),
                    BasicToken::Mixed(s) => self.basic_mixed_to_pt(s),
                    BasicToken::Separator(s) => self.basic_separator_to_pt(s),
                    BasicToken::Formatter(s) => self.basic_formatter_to_pt(s),
                }))
            }
            None => None,
        }
    }
}