use std::collections::{BTreeSet, VecDeque};
use std::str::FromStr;
use unicode_properties::{GeneralCategory, GeneralCategoryGroup, UnicodeGeneralCategory};

use text_parsing::Local;

use crate::{
    wordbreaker::{one_char_word, BasicToken, WordBreaker},
    Formatter, IntoTokenizer, Number, Numerical, SentenceBreaker, Separator, Special, Struct,
    Token, TokenizerOptions, TokenizerParams, Unicode, Word, EMOJIMAP,
};

impl<'t> IntoTokenizer for &'t str {
    type IntoTokens = Tokens<'t>;

    fn into_tokenizer<S: SentenceBreaker>(self, params: TokenizerParams<S>) -> Self::IntoTokens {
        Tokens::new(self, &params.options)
    }
}

impl<'t> Iterator for Tokens<'t> {
    type Item = Local<Token>;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if !self.buffer.is_empty() {
                return self.next_from_buffer();
            }
            // Refill the buffer from the word breaker: accumulate basic tokens
            // until a separator closes the current group or the input ends.
            loop {
                match self.bounds.next() {
                    Some(local_bt) => {
                        let sep = matches!(local_bt.data(), BasicToken::Separator(_));
                        self.buffer.push_back(local_bt);
                        if sep {
                            break;
                        }
                    }
                    None if !self.buffer.is_empty() => break,
                    None => return None,
                }
            }
        }
    }
}

/// Iterator over `Local<Token>` items produced from an input string.
pub struct Tokens<'t> {
    bounds: WordBreaker<'t>,
    buffer: VecDeque<Local<BasicToken<'t>>>,
    allow_structs: bool,
}

impl<'t> Tokens<'t> {
    pub(crate) fn new<'a>(s: &'a str, options: &BTreeSet<TokenizerOptions>) -> Tokens<'a> {
        Tokens {
            bounds: WordBreaker::new(s, &options),
            buffer: VecDeque::new(),
            allow_structs: options.contains(&TokenizerOptions::StructTokens),
        }
    }
    fn basic_separator_to_pt(&mut self, c: char) -> Token {
        Token::Special(Special::Separator(match c {
            ' ' => Separator::Space,
            '\n' => Separator::Newline,
            '\t' => Separator::Tab,
            _ => Separator::Char(c),
        }))
    }
    fn basic_formatter_to_pt(&mut self, c: char) -> Token {
        Token::Unicode(Unicode::Formatter(match c {
            '\u{200d}' => Formatter::Joiner,
            _ => Formatter::Char(c),
        }))
    }
    fn basic_number_to_pt(&mut self, s: &str) -> Token {
        Token::Word(match i64::from_str(s) {
            Ok(n) => Word::Number(Number::Integer(n)),
            Err(_) => match f64::from_str(s) {
                Ok(n) => Word::Number(Number::Float(n)),
                Err(_) => {
                    #[cfg(feature = "strings")]
                    {
                        Word::Word(s.to_string())
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Word
                    }
                }
            },
        })
    }
    fn basic_mixed_to_pt(&mut self, s: &str) -> Token {
        // Classify a mixed run of characters: a "strange" word, an emoji,
        // a single symbol, or an opaque unicode string.
        let mut word = true;
        let mut has_word_parts = false;
        let mut first = true;
        let mut same = false;
        let mut one_c = ' ';
        for c in s.chars() {
            if c.is_alphanumeric()
                || c.is_digit(10)
                || (c.general_category_group() == GeneralCategoryGroup::Punctuation)
                || (c == '\u{0060}')
            {
                has_word_parts = true;
            } else {
                word = false;
            }
            if first {
                one_c = c;
                first = false;
                same = true;
            } else if one_c != c {
                same = false;
            }
        }
        // A run of a single repeated whitespace or format character degrades
        // to a separator or formatter token.
        if !first
            && same
            && (one_c.is_whitespace() || (one_c.general_category() == GeneralCategory::Format))
        {
            if one_c.is_whitespace() {
                return self.basic_separator_to_pt(one_c);
            } else {
                return self.basic_formatter_to_pt(one_c);
            }
        }
        if word {
            #[cfg(feature = "strings")]
            {
                Token::Word(Word::StrangeWord(s.to_string()))
            }
            #[cfg(not(feature = "strings"))]
            {
                Token::Word(Word::StrangeWord)
            }
        } else {
            // Drop variation selectors before looking the run up in the emoji map.
            let rs = s.replace("\u{fe0f}", "");
            match EMOJIMAP.get(&rs as &str) {
                Some(em) => Token::Word(Word::Emoji(em)),
                None => match one_char_word(&rs) {
                    Some(c) if c.general_category_group() == GeneralCategoryGroup::Symbol => {
                        Token::Special(Special::Symbol(c))
                    }
                    Some(_) | None => {
                        if has_word_parts {
                            #[cfg(feature = "strings")]
                            {
                                Token::Word(Word::StrangeWord(s.to_string()))
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Token::Word(Word::StrangeWord)
                            }
                        } else {
                            #[cfg(feature = "strings")]
                            {
                                Token::Unicode(Unicode::String({
                                    // Encode the remaining characters as "uXX_uYY",
                                    // i.e. their unicode escapes without the "\u{...}" frame.
                                    let mut us = String::new();
                                    for c in rs.chars() {
                                        if !us.is_empty() {
                                            us += "_";
                                        }
                                        us += "u";
                                        let ns = format!("{}", c.escape_unicode());
                                        us += &ns[3..ns.len() - 1];
                                    }
                                    us
                                }))
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Token::Unicode(Unicode::String)
                            }
                        }
                    }
                },
            }
        }
    }
    fn basic_alphanumeric_to_pt(&mut self, s: &str) -> Token {
        // Flags describing the composition of the run.
        let mut digits = false;
        let mut digits_begin_only = false;
        let mut dots = false;
        let mut alphas_and_apos = false;
        let mut other = false;

        // `start_digit` stays true only while we are still inside a leading run of digits.
        let mut start_digit = true;
        for c in s.chars() {
            if start_digit && !c.is_digit(10) {
                start_digit = false;
            }
            match c {
                c if c.is_digit(10) => {
                    digits = true;
                    digits_begin_only = start_digit;
                }
                c if c.is_alphabetic() => {
                    alphas_and_apos = true;
                }
                '\'' => {
                    alphas_and_apos = true;
                }
                '.' => {
                    dots = true;
                }
                _ => {
                    other = true;
                }
            }
        }
        Token::Word(
            match (digits, digits_begin_only, dots, alphas_and_apos, other) {
                // Digits and dots only, e.g. "1.2.3".
                (true, false, true, false, false) => {
                    #[cfg(feature = "strings")]
                    {
                        Word::Numerical(Numerical::DotSeparated(s.to_string()))
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Numerical(Numerical::DotSeparated)
                    }
                }
                // A leading number followed by letters, e.g. "10kg".
                (true, true, _, true, false) => {
                    #[cfg(feature = "strings")]
                    {
                        Word::Numerical(Numerical::Measures(s.to_string()))
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Numerical(Numerical::Measures)
                    }
                }
                // Any other mix that contains digits.
                (true, _, _, _, _) => {
                    #[cfg(feature = "strings")]
                    {
                        Word::Numerical(Numerical::Alphanumeric(s.to_string()))
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Numerical(Numerical::Alphanumeric)
                    }
                }
                // Letters and apostrophes only.
                (false, false, _, true, false) => {
                    #[cfg(feature = "strings")]
                    {
                        Word::Word(s.to_string())
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Word
                    }
                }
                (false, false, _, _, _) => {
                    #[cfg(feature = "strings")]
                    {
                        Word::StrangeWord(s.to_string())
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::StrangeWord
                    }
                }
                // `digits_begin_only` cannot be set without `digits`.
                (false, true, _, _, _) => unreachable!(),
            },
        )
    }
    fn basic_punctuation_to_pt(&mut self, c: char) -> Token {
        Token::Special(Special::Punctuation(c))
    }
    fn check_hashtag(&mut self) -> Option<Local<Token>> {
        if !self.allow_structs || (self.buffer.len() < 2) {
            return None;
        }

        // A '#' followed by an alphanumeric or numeric run forms a hashtag.
        let (loc1, s1) = self.buffer[0].into_inner();
        let (loc2, s2) = self.buffer[1].into_inner();
        match (s1, s2) {
            (BasicToken::Punctuation('#'), BasicToken::Alphanumeric(_s))
            | (BasicToken::Punctuation('#'), BasicToken::Number(_s)) => {
                match Local::from_segment(loc1, loc2) {
                    Ok(local) => {
                        self.buffer.pop_front();
                        self.buffer.pop_front();

                        Some(local.local(Token::Struct({
                            #[cfg(feature = "strings")]
                            {
                                Struct::Hashtag(_s.to_string())
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Struct::Hashtag
                            }
                        })))
                    }
                    Err(_) => None,
                }
            }
            _ => None,
        }
    }
    fn check_mention(&mut self) -> Option<Local<Token>> {
        if !self.allow_structs || (self.buffer.len() < 2) {
            return None;
        }

        // An '@' followed by an alphanumeric or numeric run forms a mention.
        let (loc1, s1) = self.buffer[0].into_inner();
        let (loc2, s2) = self.buffer[1].into_inner();
        match (s1, s2) {
            (BasicToken::Punctuation('@'), BasicToken::Alphanumeric(_s))
            | (BasicToken::Punctuation('@'), BasicToken::Number(_s)) => {
                match Local::from_segment(loc1, loc2) {
                    Ok(local) => {
                        self.buffer.pop_front();
                        self.buffer.pop_front();

                        Some(local.local(Token::Struct({
                            #[cfg(feature = "strings")]
                            {
                                Struct::Mention(_s.to_string())
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Struct::Mention
                            }
                        })))
                    }
                    Err(_) => None,
                }
            }
            _ => None,
        }
    }
    fn next_from_buffer(&mut self) -> Option<Local<Token>> {
        // Multi-token structures are checked first, then the front basic token
        // is converted into a full token.
        if let Some(t) = self.check_hashtag() {
            return Some(t);
        }
        if let Some(t) = self.check_mention() {
            return Some(t);
        }
        match self.buffer.pop_front() {
            Some(local_tok) => {
                let (local, tok) = local_tok.into_inner();
                Some(local.local(match tok {
                    BasicToken::Alphanumeric(s) => self.basic_alphanumeric_to_pt(s),
                    BasicToken::Number(s) => self.basic_number_to_pt(s),
                    BasicToken::Punctuation(s) => self.basic_punctuation_to_pt(s),
                    BasicToken::Mixed(s) => self.basic_mixed_to_pt(s),
                    BasicToken::Separator(s) => self.basic_separator_to_pt(s),
                    BasicToken::Formatter(s) => self.basic_formatter_to_pt(s),
                }))
            }
            None => None,
        }
    }
}
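
// A minimal usage sketch, kept as a comment because the parameter constructors
// are not defined in this file: it assumes `TokenizerParams` can be built with
// a hypothetical `TokenizerParams::default()` and that `Token` and `Local`
// implement `Debug`. Adjust to the crate's actual API before using.
//
// use crate::{IntoTokenizer, TokenizerParams};
//
// fn dump_tokens(text: &str) {
//     let params = TokenizerParams::default(); // assumption, not verified
//     for local_tok in text.into_tokenizer(params) {
//         let (local, token) = local_tok.into_inner();
//         println!("{:?} -> {:?}", local, token);
//     }
// }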