use std::collections::{BTreeSet, VecDeque};
use std::str::FromStr;
use unicode_properties::{GeneralCategory, GeneralCategoryGroup, UnicodeGeneralCategory};

use text_parsing::Local;

use crate::{
    wordbreaker::{one_char_word, BasicToken, WordBreaker},
    Formatter, IntoTokenizer, Number, Numerical, SentenceBreaker, Separator, Special, Struct,
    Token, TokenizerOptions, TokenizerParams, Unicode, Word, EMOJIMAP,
};

impl<'t> IntoTokenizer for &'t str {
    type IntoTokens = Tokens<'t>;

    fn into_tokenizer<S: SentenceBreaker>(self, params: TokenizerParams<S>) -> Self::IntoTokens {
        Tokens::new(self, &params.options)
    }
}

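// The iterator first drains the internal buffer; when it is empty, basic
// tokens are pulled from the `WordBreaker` and buffered until a separator
// (or the end of input) is reached, so that multi-token structures such as
// hashtags and mentions can be detected before anything is emitted.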
impl<'t> Iterator for Tokens<'t> {
    type Item = Local<Token>;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if !self.buffer.is_empty() {
                return self.next_from_buffer();
            } else {
                loop {
                    match self.bounds.next() {
                        Some(local_bt) => {
                            let sep = matches!(local_bt.data(), BasicToken::Separator(_));
                            self.buffer.push_back(local_bt);
                            if sep {
                                return self.next();
                            }
                        }
                        None if !self.buffer.is_empty() => return self.next(),
                        None => return None,
                    }
                }
            }
        }
    }
}

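/// Token stream over a `&str`: wraps a `WordBreaker` and buffers its basic
/// tokens so that adjacent ones can be merged (hashtags, mentions) before
/// being converted into `Token`s.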
pub struct Tokens<'t> {
    bounds: WordBreaker<'t>,
    buffer: VecDeque<Local<BasicToken<'t>>>,
    allow_structs: bool,
}
impl<'t> Tokens<'t> {
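    /// Builds a tokenizer over `s`; the `StructTokens` option enables
    /// hashtag and mention detection.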
    pub(crate) fn new<'a>(s: &'a str, options: &BTreeSet<TokenizerOptions>) -> Tokens<'a> {
        Tokens {
            bounds: WordBreaker::new(s, options),
            buffer: VecDeque::new(),
            allow_structs: options.contains(&TokenizerOptions::StructTokens),
        }
    }
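    /// Maps a separator character onto the dedicated `Separator` variants,
    /// falling back to `Separator::Char` for anything else.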
    fn basic_separator_to_pt(&mut self, c: char) -> Token {
        Token::Special(Special::Separator(match c {
            ' ' => Separator::Space,
            '\n' => Separator::Newline,
            '\t' => Separator::Tab,
            _ => Separator::Char(c),
        }))
    }
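    /// Maps a formatting character (e.g. U+200D zero-width joiner) onto a
    /// `Unicode::Formatter` token.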
    fn basic_formatter_to_pt(&mut self, c: char) -> Token {
        Token::Unicode(Unicode::Formatter(match c {
            '\u{200d}' => Formatter::Joiner,
            _ => Formatter::Char(c),
        }))
    }
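    /// Converts a digit-only token: integers with a leading zero become
    /// `Number::ZeroInteger`, other integers `Number::Integer`; if `i64`
    /// parsing fails, falls back to `f64` (`Number::Float`) and finally to a
    /// plain word.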
    fn basic_number_to_pt(&mut self, s: &str) -> Token {
        Token::Word(match i64::from_str(s) {
            Ok(n) => match s.chars().next() {
                Some('0') => {
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Number(Number::ZeroInteger { i: n })
                    }
                    #[cfg(feature = "strings")]
                    {
                        Word::Number(Number::ZeroInteger {
                            i: n,
                            s: s.to_string(),
                        })
                    }
                }
                Some(_) | None => Word::Number(Number::Integer(n)),
            },
            Err(_) => match f64::from_str(s) {
                Ok(n) => Word::Number(Number::Float(n)),
                Err(..) => {
                    #[cfg(feature = "strings")]
                    {
                        Word::Word(s.to_string())
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Word
                    }
                }
            },
        })
    }
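    /// Classifies a mixed run of characters: a run of one repeated
    /// whitespace or format character collapses to a separator or formatter
    /// token, emoji are looked up in `EMOJIMAP` after stripping U+FE0F, lone
    /// symbol characters become `Special::Symbol`, and everything else
    /// degrades to `StrangeWord` or a `Unicode::String` escape.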
    fn basic_mixed_to_pt(&mut self, s: &str) -> Token {
        let mut word = true;
        let mut has_word_parts = false;
        let mut first = true;
        let mut same = false;
        let mut one_c = ' ';
        for c in s.chars() {
            if c.is_alphanumeric()
                || c.is_digit(10)
                || (c.general_category_group() == GeneralCategoryGroup::Punctuation)
                || (c == '\u{0060}')
            {
                has_word_parts = true;
            } else {
                word = false;
            }
            if first {
                one_c = c;
                first = false;
                same = true;
            } else if one_c != c {
                same = false;
            }
        }
        if !first
            && same
            && (one_c.is_whitespace() || (one_c.general_category() == GeneralCategory::Format))
        {
            if one_c.is_whitespace() {
                return self.basic_separator_to_pt(one_c);
            } else {
                return self.basic_formatter_to_pt(one_c);
            }
        }
        if word {
            #[cfg(feature = "strings")]
            {
                Token::Word(Word::StrangeWord(s.to_string()))
            }
            #[cfg(not(feature = "strings"))]
            {
                Token::Word(Word::StrangeWord)
            }
        } else {
            let rs = s.replace("\u{fe0f}", "");
            match EMOJIMAP.get(&rs as &str) {
                Some(em) => Token::Word(Word::Emoji(em)),
                None => match one_char_word(&rs) {
                    Some(c) if c.general_category_group() == GeneralCategoryGroup::Symbol => {
                        Token::Special(Special::Symbol(c))
                    }
                    Some(_) | None => {
                        if has_word_parts {
                            #[cfg(feature = "strings")]
                            {
                                Token::Word(Word::StrangeWord(s.to_string()))
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Token::Word(Word::StrangeWord)
                            }
                        } else {
                            #[cfg(feature = "strings")]
                            {
                                Token::Unicode(Unicode::String({
                                    let mut us = String::new();
                                    for c in rs.chars() {
                                        if !us.is_empty() {
                                            us += "_";
                                        }
                                        us += "u";
                                        let ns = c.escape_unicode().to_string();
                                        us += &ns[3..ns.len() - 1];
                                    }
                                    us
                                }))
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Token::Unicode(Unicode::String)
                            }
                        }
                    }
                },
            }
        }
    }
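    /// Classifies an alphanumeric run by scanning its characters: digit/dot
    /// mixtures become `Numerical::DotSeparated` (e.g. versions, IPs), a
    /// digit prefix followed by letters becomes `Numerical::Measures`
    /// (e.g. "10kg"), any other digit-containing run is
    /// `Numerical::Alphanumeric`, and digit-free runs are plain or strange
    /// words.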
    fn basic_alphanumeric_to_pt(&mut self, s: &str) -> Token {
        let mut digits = false;
        let mut digits_begin_only = false;
        let mut dots = false;
        let mut alphas_and_apos = false;
        let mut other = false;

        let mut start_digit = true;
        for c in s.chars() {
            if start_digit && !c.is_digit(10) {
                start_digit = false;
            }
            match c {
                c if c.is_digit(10) => {
                    digits = true;
                    digits_begin_only = start_digit;
                }
                c if c.is_alphabetic() => {
                    alphas_and_apos = true;
                }
                '\'' => {
                    alphas_and_apos = true;
                }
                '.' => {
                    dots = true;
                }
                _ => {
                    other = true;
                }
            }
        }
        Token::Word(
            match (digits, digits_begin_only, dots, alphas_and_apos, other) {
                (true, false, true, false, false) => {
                    #[cfg(feature = "strings")]
                    {
                        Word::Numerical(Numerical::DotSeparated(s.to_string()))
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Numerical(Numerical::DotSeparated)
                    }
                }
                (true, true, _, true, false) => {
                    #[cfg(feature = "strings")]
                    {
                        Word::Numerical(Numerical::Measures(s.to_string()))
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Numerical(Numerical::Measures)
                    }
                }
                (true, _, _, _, _) => {
                    #[cfg(feature = "strings")]
                    {
                        Word::Numerical(Numerical::Alphanumeric(s.to_string()))
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Numerical(Numerical::Alphanumeric)
                    }
                }
                (false, false, _, true, false) => {
                    #[cfg(feature = "strings")]
                    {
                        Word::Word(s.to_string())
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::Word
                    }
                }
                (false, false, _, _, _) => {
                    #[cfg(feature = "strings")]
                    {
                        Word::StrangeWord(s.to_string())
                    }
                    #[cfg(not(feature = "strings"))]
                    {
                        Word::StrangeWord
                    }
                }
                (false, true, _, _, _) => unreachable!(),
            },
        )
    }
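    /// Wraps a punctuation character as `Special::Punctuation`.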
    fn basic_punctuation_to_pt(&mut self, c: char) -> Token {
        Token::Special(Special::Punctuation(c))
    }
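    /// If struct tokens are enabled and the buffer starts with `#` followed
    /// by an alphanumeric or numeric token, merges the two into a single
    /// `Struct::Hashtag` token spanning both segments.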
    fn check_hashtag(&mut self) -> Option<Local<Token>> {
        if !self.allow_structs || (self.buffer.len() < 2) {
            return None;
        }

        let (loc1, s1) = self.buffer[0].into_inner();
        let (loc2, s2) = self.buffer[1].into_inner();
        match (s1, s2) {
            (BasicToken::Punctuation('#'), BasicToken::Alphanumeric(_s))
            | (BasicToken::Punctuation('#'), BasicToken::Number(_s)) => {
                match Local::from_segment(loc1, loc2) {
                    Ok(local) => {
                        self.buffer.pop_front();
                        self.buffer.pop_front();

                        Some(local.local(Token::Struct({
                            #[cfg(feature = "strings")]
                            {
                                Struct::Hashtag(_s.to_string())
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Struct::Hashtag
                            }
                        })))
                    }
                    Err(_) => None,
                }
            }
            _ => None,
        }
    }
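    /// Same as `check_hashtag`, but for an `@` prefix producing
    /// `Struct::Mention`.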
    fn check_mention(&mut self) -> Option<Local<Token>> {
        if !self.allow_structs || (self.buffer.len() < 2) {
            return None;
        }

        let (loc1, s1) = self.buffer[0].into_inner();
        let (loc2, s2) = self.buffer[1].into_inner();
        match (s1, s2) {
            (BasicToken::Punctuation('@'), BasicToken::Alphanumeric(_s))
            | (BasicToken::Punctuation('@'), BasicToken::Number(_s)) => {
                match Local::from_segment(loc1, loc2) {
                    Ok(local) => {
                        self.buffer.pop_front();
                        self.buffer.pop_front();

                        Some(local.local(Token::Struct({
                            #[cfg(feature = "strings")]
                            {
                                Struct::Mention(_s.to_string())
                            }
                            #[cfg(not(feature = "strings"))]
                            {
                                Struct::Mention
                            }
                        })))
                    }
                    Err(_) => None,
                }
            }
            _ => None,
        }
    }
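    /// Emits the next token from the buffer: multi-token structures are
    /// tried first, then the front basic token is converted according to its
    /// kind.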
    fn next_from_buffer(&mut self) -> Option<Local<Token>> {
        if let Some(t) = self.check_hashtag() {
            return Some(t);
        }
        if let Some(t) = self.check_mention() {
            return Some(t);
        }
        match self.buffer.pop_front() {
            Some(local_tok) => {
                let (local, tok) = local_tok.into_inner();
                Some(local.local(match tok {
                    BasicToken::Alphanumeric(s) => self.basic_alphanumeric_to_pt(s),
                    BasicToken::Number(s) => self.basic_number_to_pt(s),
                    BasicToken::Punctuation(s) => self.basic_punctuation_to_pt(s),
                    BasicToken::Mixed(s) => self.basic_mixed_to_pt(s),
                    BasicToken::Separator(s) => self.basic_separator_to_pt(s),
                    BasicToken::Formatter(s) => self.basic_formatter_to_pt(s),
                }))
            }
            None => None,
        }
    }
}