//! text_tokenizer/lib.rs

1use std::sync::Arc;
2use text_parsing::{Breaker, IntoSource, Local, Localize, Snip, Source, SourceEvent};
3
4mod emoji;
5pub use emoji::EMOJIMAP;
6
7mod breakers;
8pub use breakers::{SentenceBreaker, UnicodeSentenceBreaker};
9
10mod wordbreaker;
11
12mod options;
13pub use options::{IntoTokenizer, TokenizerOptions, TokenizerParams};
14
15mod tokens;
16pub use tokens::Tokens;
17
18mod text_tokens;
19use text_tokens::InnerBound;
20pub use text_tokens::TextTokens;
21
/// Crate-level error type.
#[derive(Debug)]
pub enum Error {
    /// Error bubbled up from the underlying `text_parsing` source.
    TextParser(text_parsing::Error),
}

/// Tolerance used when comparing `Number`s as `f64`s (see `impl Ord for Number`).
const EPS: f64 = 1e-8;
28
/// Parsed numeric token value.
///
/// With the `strings` feature, `ZeroInteger` also keeps the original
/// spelling, so the type owns a `String` and cannot be `Copy` — the
/// original derive of `Copy` here was a compile error (`String` is not
/// `Copy`), matching the other `strings`-feature enums which are
/// `Clone`-only.
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd)]
pub enum Number {
    Integer(i64),
    Float(f64),
    // String is an integer, but with the leading zeroes, for example: "007"
    ZeroInteger { i: i64, s: String },
}

/// Parsed numeric token value (payload-free build; bitwise-copyable).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
pub enum Number {
    Integer(i64),
    Float(f64),
    /// Integer that had leading zeroes in the source text (e.g. "007").
    ZeroInteger { i: i64 },
}
45
46impl Number {
47    pub fn as_f64(&self) -> f64 {
48        match self {
49            Number::Integer(i) => *i as f64,
50            Number::Float(f) => *f,
51            Number::ZeroInteger { i, .. } => *i as f64,
52        }
53    }
54}
55impl Ord for Number {
56    fn cmp(&self, other: &Number) -> std::cmp::Ordering {
57        let s = self.as_f64();
58        let o = other.as_f64();
59        let d = s - o;
60        match d.abs() < EPS {
61            true => std::cmp::Ordering::Equal,
62            false => {
63                if d > 0.0 {
64                    return std::cmp::Ordering::Greater;
65                }
66                if d < 0.0 {
67                    return std::cmp::Ordering::Less;
68                }
69                std::cmp::Ordering::Equal
70            }
71        }
72    }
73}
74impl Eq for Number {}
75
/// Whitespace-like separators between tokens.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
pub enum Separator {
    Space,
    Tab,
    Newline,
    /// Any other separator character not covered by the named variants.
    Char(char),
}

/// Unicode formatting (non-printing) characters.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
pub enum Formatter {
    Char(char),
    Joiner, // u{200d} (zero-width joiner)
}

/// Non-word tokens: punctuation, symbols and separators.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Ord, PartialOrd)]
pub enum Special {
    Punctuation(char),
    Symbol(char),
    Separator(Separator),
}
96
/// Word-like tokens; payloads carry the matched text (`strings` feature).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Word {
    Word(String),
    /// Word with unusual content (tests show mixed-script words and words
    /// containing soft hyphens land here).
    StrangeWord(String),
    Numerical(Numerical),
    Number(Number),
    /// Emoji, as a static string (presumably a value from `EMOJIMAP`).
    Emoji(&'static str),
}

/// Numeric-ish words that are not plain numbers (`strings` feature).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub enum Numerical {
    //Date(String),
    //Ip(String),
    //Countable(String),
    DotSeparated(String),
    Measures(String),
    /// Mixed digits and letters, e.g. "hashtag2".
    Alphanumeric(String),
}

/// Social-media style structures (`strings` feature).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Struct {
    Hashtag(String),
    Mention(String),
    //Url(String),
}

/// Other unicode content (`strings` feature).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Unicode {
    String(String),
    Formatter(Formatter),
}
132
/// Payload-free counterpart of the `strings`-feature `Word`; variants
/// mark the token kind without owning the matched text.
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Word {
    Word,
    StrangeWord,
    Numerical(Numerical),
    Number(Number),
    Emoji(&'static str),
}

/// Payload-free counterpart of the `strings`-feature `Numerical`.
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Numerical {
    //Date,
    //Ip,
    //Countable,
    DotSeparated,
    Measures,
    Alphanumeric,
}

/// Payload-free counterpart of the `strings`-feature `Struct`.
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Struct {
    Hashtag,
    Mention,
    //Url,
}

/// Payload-free counterpart of the `strings`-feature `Unicode`.
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Unicode {
    String,
    Formatter(Formatter),
}
168
/// Final token kind produced by the tokenizer (`strings` feature:
/// payloads own their text, so `Clone` but not `Copy`).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),
}

/// Final token kind produced by the tokenizer (payload-free build).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),
}
186
187/*pub trait IntoTokens<T> {
188    type IntoTokens: IntoTokenizer<IntoTokens = T>;
189}
190
191impl<'s> IntoTokenSource<Token2> for &'s str {
192    type IntoTokens = TextStr<'s>;
193
194    fn (self) -> Result<TextStr<'s>,Error> {
195        TextStr::new(self)
196    }
197
198}*/
199
/// Borrowed counterpart of `Text`: keeps the caller's `&str` as the
/// buffer instead of owning a normalized copy, while still carrying the
/// computed localities and breaker bounds.
#[derive(Debug)]
pub struct TextStr<'s> {
    buffer: &'s str,
    localities: Arc<Vec<TextLocality>>,
    breakers: Arc<Vec<InnerBound>>,
}
206impl<'s> TextStr<'s> {
207    pub fn new<'a>(s: &'a str) -> Result<TextStr<'a>, Error> {
208        let text = inner_new(s.into_source(), false)?;
209        Ok(TextStr {
210            buffer: s,
211            localities: text.localities,
212            breakers: text.breakers,
213        })
214    }
215}
216
/// Core constructor shared by `Text::new`, `TextStr::new` and the
/// `TryFrom` impls: drains `source`, producing the normalized buffer
/// (only populated when `with_buffer` is true), per-char localities and
/// word/sentence boundary records.
fn inner_new<S: Source>(mut source: S, with_buffer: bool) -> Result<Text, Error> {
    let mut buffer = String::new();
    let mut localities = Vec::new();
    let mut breakers = Vec::new();
    // Byte length of the normalized text so far; tracked separately so it
    // stays correct even when `with_buffer` is false and `buffer` is empty.
    let mut buffer_len = 0;

    while let Some(local_se) = source.next_char().map_err(Error::TextParser)? {
        let (local, se) = local_se.into_inner();
        let c = match se {
            SourceEvent::Char(c) => match c {
                // Normalize backtick to apostrophe (see the `apostrophe` test).
                '\u{0060}' => '\u{0027}',
                _ => c,
            },
            SourceEvent::Breaker(b) => {
                // Map each breaker event to a representative char; `Some(b)`
                // marks breakers that must also be recorded as inner bounds.
                let (c, opt_b) = match b {
                    Breaker::None => continue,
                    Breaker::Space => (' ', None),
                    Breaker::Line => ('\n', None),
                    Breaker::Word => ('\u{200B}', Some(b)), // zero width space
                    Breaker::Sentence | Breaker::Paragraph | Breaker::Section => ('\n', Some(b)),
                };
                if let Some(b) = opt_b {
                    // Record the bound at the current buffer position, in both
                    // byte and char coordinates.
                    let br = InnerBound {
                        bytes: Snip {
                            offset: buffer_len,
                            length: c.len_utf8(),
                        },
                        chars: Snip {
                            offset: localities.len(),
                            length: 1,
                        },
                        breaker: b,
                        original: Some(local),
                    };
                    //println!("BR: {:?}",br);
                    breakers.push(br);
                }
                c
            }
        };

        // Where this char lands in the normalized buffer;
        // `localities.len()` doubles as the current char index.
        let buf_local = ().localize(
            Snip {
                // chars
                offset: localities.len(),
                length: 1,
            },
            Snip {
                // bytes
                offset: buffer_len,
                length: c.len_utf8(),
            },
        );
        if with_buffer {
            buffer.push(c);
        }
        buffer_len += c.len_utf8();
        localities.push(TextLocality {
            buffer: buf_local,
            original: local,
        });
    }
    Ok(Text {
        buffer: Arc::new(buffer),
        localities: Arc::new(localities),
        breakers: Arc::new(breakers),
    })
}
285
/// Per-character mapping between the normalized buffer and the original input.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub struct TextLocality {
    /// Locality of the char inside the normalized buffer.
    pub buffer: Local<()>,
    /// Locality of the char in the original source.
    pub original: Local<()>,
}

/// Owned, normalized text plus the metadata needed to map tokens back
/// to the original input. All fields are `Arc`s, so sharing is cheap
/// (see `shared_text`).
#[derive(Debug)]
pub struct Text {
    buffer: Arc<String>,
    localities: Arc<Vec<TextLocality>>,
    breakers: Arc<Vec<InnerBound>>,
}
298impl Text {
299    pub fn new<S: Source>(source: S) -> Result<Text, Error> {
300        inner_new(source, true)
301    }
302    pub fn token_text<'s>(&'s self, token: &TextToken) -> &'s str {
303        let Snip {
304            offset: begin,
305            length: len,
306        } = token.locality.bytes();
307        let end = begin + len;
308        &self.buffer[begin..end]
309    }
310    pub fn text(&self) -> &str {
311        self.buffer.as_ref()
312    }
313    pub fn original_locality(&self, idx: usize) -> Option<Local<()>> {
314        self.localities.get(idx).map(|tl| tl.original)
315    }
316    pub fn localities(&self) -> &Vec<TextLocality> {
317        self.localities.as_ref()
318    }
319    pub fn shared_text(&self) -> Text {
320        Text {
321            buffer: self.buffer.clone(),
322            localities: self.localities.clone(),
323            breakers: self.breakers.clone(),
324        }
325    }
326}
327
impl TryFrom<String> for Text {
    type Error = Error;

    /// Builds a `Text` reusing `s` as the buffer: parsing runs with
    /// `with_buffer = false`, then the owned string is swapped in.
    /// NOTE(review): this assumes parsing leaves the char stream
    /// unchanged — `inner_new` normalizes backticks to apostrophes, so
    /// for inputs containing `` ` `` the stored buffer and the computed
    /// localities/token classes can disagree; confirm this is intended.
    fn try_from(s: String) -> Result<Text, Error> {
        let mut text = inner_new((&s).into_source(), false)?;
        text.buffer = Arc::new(s);
        Ok(text)
    }
}

impl TryFrom<&str> for Text {
    type Error = Error;

    /// Builds an owned `Text` by collecting a normalized copy of `s`
    /// (unlike `TextStr::new`, which borrows).
    fn try_from(s: &str) -> Result<Text, Error> {
        Text::new(s.into_source())
    }
}
345
/// Boundary kinds surfaced between tokens (see `Token2::Bound`).
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Bound {
    Sentence,
    Paragraph,
    Section,
}
352
/// A token located both in the normalized buffer (`locality`) and,
/// when it maps back to the input, in the original source (`original`).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub struct TextToken {
    locality: Local<()>,
    original: Option<Local<()>>,
    pub token: Token2,
}

/// Payload-free build: `Token2` is `Copy`, so the whole token is too.
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub struct TextToken {
    locality: Local<()>,
    original: Option<Local<()>>,
    pub token: Token2,
}
368
#[cfg(test)]
impl TextToken {
    /// Test helper: original locality + plain `Token`; `None` when the
    /// token has no original locality or is a bound.
    fn into_original_token_1(self) -> Option<Local<Token>> {
        let original = self.original?;
        self.token.into_token().map(|t| original.local(t))
    }
}
378
379impl TextToken {
380    pub fn local(&self) -> Local<()> {
381        self.locality
382    }
383    pub fn original(&self) -> Option<Local<()>> {
384        self.original
385    }
386    pub fn into_position(mut self) -> TextToken {
387        self.locality = self.locality.into_position();
388        self.original = self.original.map(|or| or.into_position());
389        self
390    }
391    pub fn try_as_token(&self) -> Result<Token, Bound> {
392        self.token.try_as_token()
393    }
394    pub fn as_original_token(&self) -> Option<Local<&Token2>> {
395        self.original.map(|original| original.local(&self.token))
396    }
397    pub fn into_original_token(self) -> Option<Local<Token2>> {
398        self.original.map(|original| original.local(self.token))
399    }
400    pub fn original_str<'s>(&self, original: &'s str) -> Result<&'s str, OriginalError> {
401        match self.original {
402            Some(local) => {
403                let Snip {
404                    offset: begin,
405                    length: len,
406                } = local.bytes();
407                let end = begin + len;
408                match original.get(begin..end) {
409                    Some(s) => Ok(s),
410                    None => Err(OriginalError::InvalidSnip),
411                }
412            }
413            None => Err(OriginalError::NoOriginal),
414        }
415    }
416
417    pub fn test_token(lt: Local<Token2>) -> TextToken {
418        let (local, token) = lt.into_inner();
419        TextToken {
420            locality: local,
421            original: Some(local.local(())),
422            token,
423        }
424    }
425    pub fn test_new(token: Token2, local: Local<()>, original: Option<Local<()>>) -> TextToken {
426        TextToken {
427            locality: local,
428            original,
429            token,
430        }
431    }
432}
433
434/*pub trait TokenExt: Iterator<Item = TextToken> + Sized {
435    fn merge_separators(self) -> Merger<Self>;
436}
437
438impl<T> TokenExt for T where T: Iterator<Item = TextToken> {
439    fn merge_separators(self) -> Merger<Self> {
440        Merger {
441            tokens: self,
442        }
443    }
444}
445
446pub struct Merger<T>
447where T: Iterator<Item = TextToken>
448{
449    tokens: T,
450}
451impl<T> Iterator for Merger<T>
452where T: Iterator<Item = TextToken>
453{
454    type Item = TextToken;
455    fn next(&mut self) -> Option<Self::Item> {
456        self.tokens.next()
457    }
458}*/
459
/// Errors from `TextToken::original_str`.
#[derive(Debug)]
pub enum OriginalError {
    /// The token has no original locality to map back to.
    NoOriginal,
    /// The stored snip is not a valid range of the provided string
    /// (wrong source string, or out of bounds).
    InvalidSnip,
}
465
466/*#[derive(Debug,Clone,PartialEq)]
467pub enum ExtToken {
468    Token(Local<Token>),
469    Breaker(Local<Bound>),
470    Bound(Bound),
471}*/
472
/// Tokenizer output item: either a plain `Token` payload or a `Bound`
/// (sentence/paragraph/section boundary) interleaved into the stream.
/// (`strings` feature: payloads own their text, so `Clone` but not `Copy`.)
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token2 {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),

    Bound(Bound),
}
/// Payload-free counterpart; bitwise-copyable.
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token2 {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),

    Bound(Bound),
}
/// Every plain `Token` embeds into `Token2`; only `Token2::Bound` has
/// no `Token` counterpart.
impl From<Token> for Token2 {
    fn from(t: Token) -> Token2 {
        match t {
            Token::Word(w) => Token2::Word(w),
            Token::Struct(s) => Token2::Struct(s),
            Token::Special(s) => Token2::Special(s),
            Token::Unicode(u) => Token2::Unicode(u),
        }
    }
}
impl Token2 {
    /// Borrowing conversion; with payload-free tokens a bitwise copy is enough.
    #[cfg(not(feature = "strings"))]
    fn try_as_token(&self) -> Result<Token, Bound> {
        (*self).try_into_token()
    }

    /// Borrowing conversion; string payloads force a clone here.
    #[cfg(feature = "strings")]
    fn try_as_token(&self) -> Result<Token, Bound> {
        self.clone().try_into_token()
    }

    /// `Ok` for real tokens, `Err` carrying the bound kind for `Bound`.
    fn try_into_token(self) -> Result<Token, Bound> {
        match self {
            Token2::Word(w) => Ok(Token::Word(w)),
            Token2::Struct(s) => Ok(Token::Struct(s)),
            Token2::Special(s) => Ok(Token::Special(s)),
            Token2::Unicode(u) => Ok(Token::Unicode(u)),
            Token2::Bound(b) => Err(b),
        }
    }
}
#[cfg(test)]
impl Token2 {
    /// Test helper: like `try_into_token`, but flattens `Bound` to `None`.
    fn into_token(self) -> Option<Token> {
        self.try_into_token().ok()
    }
}
536
/// Ad-hoc inspection harness kept from the v0.5 migration; the
/// `#[test]` attribute is commented out, so `basic` never runs in the
/// test harness.
#[cfg(test)]
mod test_v0_5 {
    use super::*;
    use text_parsing::{entities, tagger, IntoPipeParser, IntoSource, ParserExt, SourceExt};

    //#[test]
    fn basic() {
        /*let uws = "Oxana Putan shared the quick (\"brown\") fox can't jump 32.3 feet, right?4pda etc. qeq U.S.A  asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово";

        /*let result = vec![
            PositionalToken { source: uws, offset: 0, length: 7, token: Token::Word("l'oreal".to_string()) },
            PositionalToken { source: uws, offset: 7, length: 1, token: Token::Punctuation(";".to_string()) },
            PositionalToken { source: uws, offset: 8, length: 1, token: Token::Separator(Separator::Space) },
            PositionalToken { source: uws, offset: 9, length: 7, token: Token::Word("l'oreal".to_string()) },
        ];*/
        let text = Text::new({
            uws.into_source()
                .into_separator()
                .merge_separators()
        }).unwrap();*/

        // HTML-ish input run through the tag/entity parsing pipeline.
        let uws = "<p>Oxana Putan shared the quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc.</p><p> qeq U.S.A  asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово</p>";
        let text = Text::new({
            uws.into_source()
                .pipe(tagger::Builder::new().create().into_breaker())
                .pipe(entities::Builder::new().create().into_piped())
                .into_separator()
        })
        .unwrap();
        let lib_res = text
            .into_tokenizer({
                TokenizerParams::default()
                    .add_option(TokenizerOptions::SplitDot)
                    .add_option(TokenizerOptions::SplitUnderscore)
                    .add_option(TokenizerOptions::SplitColon)
                    .with_default_sentences()
            })
            .collect::<Vec<_>>();

        // Dump every token with its char/byte snips and the original slice.
        for tok in lib_res {
            println!(
                "C{:?}, B{:?}, {:?} -> {:?}",
                tok.original.map(|loc| loc.chars()),
                tok.original.map(|loc| loc.bytes()),
                tok.token,
                tok.original_str(uws)
            );
        }

        // Deliberate panic so the dump above is shown if run manually.
        panic!()
    }
}
589
590#[cfg(test)]
591#[cfg(feature = "strings")]
592mod test {
593    use super::*;
594    use text_parsing::{
595        entities, tagger, IntoPipeParser, IntoSource, Localize, ParserExt, SourceExt,
596    };
597
598    /*
599    #[allow(dead_code)]
600    fn print_pt(tok: &PositionalToken) -> String {
601        let mut r = match &tok.token {
602            Token::BBCode{ left, right } => {
603                let left = print_pts(left);
604                let right = print_pts(right);
605                format!("PositionalToken {{ offset: {}, length: {}, token: Token::BBCode {{ left: vec![\n{}], right: vec![\n{}] }} }},",tok.offset,tok.length,left,right)
606            },
607            _ => format!("PositionalToken {{ offset: {}, length: {}, token: Token::{:?} }},",tok.offset,tok.length,tok.token),
608        };
609        r = r.replace("\")","\".to_string())");
610        r
611    }
612    #[allow(dead_code)]
613    fn print_pts(lib_res: &Vec<PositionalToken>) -> String {
614        let mut r = String::new();
615        for tok in lib_res {
616            r += &print_pt(&tok);
617            r += "\n";
618        }
619        r
620    }
621    #[allow(dead_code)]
622    fn print_result(lib_res: &Vec<PositionalToken>) {
623        let mut r = print_pts(lib_res);
624        r = r.replace("Separator(","Separator(Separator::");
625        r = r.replace("UnicodeFormatter(","UnicodeFormatter(Formatter::");
626        r = r.replace("Number(","Number(Number::");
627        r = r.replace("Numerical(","Numerical(Numerical::");
628        println!("{}",r);
629    }
630
631    #[allow(dead_code)]
632    fn print_ct(tok: &CharToken) -> String {
633        let mut r = format!("CharToken {{ byte_offset: {}, byte_length: {}, char_offset: {}, char_length: {}, token: Token::{:?} }},",tok.byte_offset,tok.byte_length,tok.char_offset,tok.char_length,tok.token);
634        r = r.replace("\")","\".to_string())");
635        r
636    }
637
638    #[allow(dead_code)]
639    fn print_cts(lib_res: &Vec<CharToken>) -> String {
640        let mut r = String::new();
641        for tok in lib_res {
642            r += &print_ct(&tok);
643            r += "\n";
644        }
645        r
646    }
647
648    #[allow(dead_code)]
649    fn print_cresult(lib_res: &Vec<CharToken>) {
650        let mut r = print_cts(lib_res);
651        r = r.replace("Separator(","Separator(Separator::");
652        r = r.replace("UnicodeFormatter(","UnicodeFormatter(Formatter::");
653        r = r.replace("Number(","Number(Number::");
654        r = r.replace("Numerical(","Numerical(Numerical::");
655        println!("{}",r);
656    }*/
657
658    #[derive(Debug, Clone)]
659    struct CharToken {
660        byte_offset: usize,
661        byte_length: usize,
662        char_offset: usize,
663        char_length: usize,
664        token: Token,
665    }
666    impl Into<Local<Token>> for CharToken {
667        fn into(self) -> Local<Token> {
668            self.token.localize(
669                Snip {
670                    offset: self.char_offset,
671                    length: self.char_length,
672                },
673                Snip {
674                    offset: self.byte_offset,
675                    length: self.byte_length,
676                },
677            )
678        }
679    }
680
681    #[derive(Debug, Clone)]
682    struct PositionalToken {
683        source: &'static str,
684        offset: usize,
685        length: usize,
686        token: Token,
687    }
688    impl Into<Local<Token>> for PositionalToken {
689        fn into(self) -> Local<Token> {
690            self.token.localize(
691                Snip {
692                    offset: self.source[..self.offset].chars().count(),
693                    length: self.source[self.offset..self.offset + self.length]
694                        .chars()
695                        .count(),
696                },
697                Snip {
698                    offset: self.offset,
699                    length: self.length,
700                },
701            )
702        }
703    }
704
705    fn check_results(result: &Vec<PositionalToken>, lib_res: &Vec<Local<Token>>, _uws: &str) {
706        assert_eq!(result.len(), lib_res.len());
707        for i in 0..result.len() {
708            let res: Local<Token> = result[i].clone().into();
709            assert_eq!(res, lib_res[i]);
710        }
711    }
712
713    fn check_cresults(result: &Vec<CharToken>, lib_res: &Vec<Local<Token>>, _uws: &str) {
714        assert_eq!(result.len(), lib_res.len());
715        for i in 0..result.len() {
716            let res: Local<Token> = result[i].clone().into();
717            assert_eq!(res, lib_res[i]);
718        }
719    }
720
721    fn check<T: Clone + std::fmt::Debug + Into<Local<Token>>>(
722        res: &Vec<T>,
723        lib: &Vec<Local<Token>>,
724        _uws: &str,
725    ) {
726        let mut lib = lib.iter();
727        let mut res = res.iter().map(|r| {
728            let res: Local<Token> = r.clone().into();
729            res
730        });
731        let mut diff = Vec::new();
732        loop {
733            match (lib.next(), res.next()) {
734                (Some(lw), Some(rw)) => {
735                    if *lw != rw {
736                        diff.push(format!("LIB:  {:?}", lw));
737                        diff.push(format!("TEST: {:?}", rw));
738                        diff.push("".to_string())
739                    }
740                }
741                (Some(lw), None) => {
742                    diff.push(format!("LIB:  {:?}", lw));
743                    diff.push("TEST: ----".to_string());
744                    diff.push("".to_string())
745                }
746                (None, Some(rw)) => {
747                    diff.push("LIB:  ----".to_string());
748                    diff.push(format!("TEST: {:?}", rw));
749                    diff.push("".to_string())
750                }
751                (None, None) => break,
752            }
753        }
754        if diff.len() > 0 {
755            for ln in &diff {
756                println!("{}", ln);
757            }
758            panic!("Diff count: {}", diff.len() / 3);
759        }
760    }
761
762    #[test]
763    fn spaces() {
764        let uws = "    spaces    too   many   apces   ";
765        let result = vec![
766            PositionalToken {
767                source: uws,
768                offset: 0,
769                length: 4,
770                token: Token::Special(Special::Separator(Separator::Space)),
771            },
772            PositionalToken {
773                source: uws,
774                offset: 4,
775                length: 6,
776                token: Token::Word(Word::Word("spaces".to_string())),
777            },
778            PositionalToken {
779                source: uws,
780                offset: 10,
781                length: 4,
782                token: Token::Special(Special::Separator(Separator::Space)),
783            },
784            PositionalToken {
785                source: uws,
786                offset: 14,
787                length: 3,
788                token: Token::Word(Word::Word("too".to_string())),
789            },
790            PositionalToken {
791                source: uws,
792                offset: 17,
793                length: 3,
794                token: Token::Special(Special::Separator(Separator::Space)),
795            },
796            PositionalToken {
797                source: uws,
798                offset: 20,
799                length: 4,
800                token: Token::Word(Word::Word("many".to_string())),
801            },
802            PositionalToken {
803                source: uws,
804                offset: 24,
805                length: 3,
806                token: Token::Special(Special::Separator(Separator::Space)),
807            },
808            PositionalToken {
809                source: uws,
810                offset: 27,
811                length: 5,
812                token: Token::Word(Word::Word("apces".to_string())),
813            },
814            PositionalToken {
815                source: uws,
816                offset: 32,
817                length: 3,
818                token: Token::Special(Special::Separator(Separator::Space)),
819            },
820        ];
821        let lib_res = uws
822            .into_tokenizer(TokenizerParams::v1())
823            .collect::<Vec<_>>();
824        check_results(&result, &lib_res, uws);
825        //panic!()
826    }
827
828    #[test]
829    fn numbers() {
830        let uws = "(() -2\n()  -2";
831        let result = vec![
832            PositionalToken {
833                source: uws,
834                offset: 0,
835                length: 1,
836                token: Token::Special(Special::Punctuation('(')),
837            },
838            PositionalToken {
839                source: uws,
840                offset: 1,
841                length: 1,
842                token: Token::Special(Special::Punctuation('(')),
843            },
844            PositionalToken {
845                source: uws,
846                offset: 2,
847                length: 1,
848                token: Token::Special(Special::Punctuation(')')),
849            },
850            PositionalToken {
851                source: uws,
852                offset: 3,
853                length: 1,
854                token: Token::Special(Special::Separator(Separator::Space)),
855            },
856            PositionalToken {
857                source: uws,
858                offset: 4,
859                length: 2,
860                token: Token::Word(Word::Number(Number::Integer(-2))),
861            },
862            PositionalToken {
863                source: uws,
864                offset: 6,
865                length: 1,
866                token: Token::Special(Special::Separator(Separator::Newline)),
867            },
868            PositionalToken {
869                source: uws,
870                offset: 7,
871                length: 1,
872                token: Token::Special(Special::Punctuation('(')),
873            },
874            PositionalToken {
875                source: uws,
876                offset: 8,
877                length: 1,
878                token: Token::Special(Special::Punctuation(')')),
879            },
880            PositionalToken {
881                source: uws,
882                offset: 9,
883                length: 2,
884                token: Token::Special(Special::Separator(Separator::Space)),
885            },
886            PositionalToken {
887                source: uws,
888                offset: 11,
889                length: 2,
890                token: Token::Word(Word::Number(Number::Integer(-2))),
891            },
892        ];
893        let lib_res = uws
894            .into_tokenizer({
895                TokenizerParams::default()
896                    .add_option(TokenizerOptions::SplitDot)
897                    .add_option(TokenizerOptions::SplitUnderscore)
898                    .add_option(TokenizerOptions::SplitColon)
899                    .add_option(TokenizerOptions::MergeWhites)
900            })
901            .collect::<Vec<_>>();
902        check_results(&result, &lib_res, uws);
903    }
904
905    #[test]
906    fn word_with_inner_hyphens() {
907        let uws = "Опро­сы по­ка­зы­ва­ют";
908        let result = vec![
909            PositionalToken {
910                source: uws,
911                offset: 0,
912                length: 14,
913                token: Token::Word(Word::StrangeWord("Опро­сы".to_string())),
914            },
915            PositionalToken {
916                source: uws,
917                offset: 14,
918                length: 1,
919                token: Token::Special(Special::Separator(Separator::Space)),
920            },
921            PositionalToken {
922                source: uws,
923                offset: 15,
924                length: 28,
925                token: Token::Word(Word::StrangeWord("по­ка­зы­ва­ют".to_string())),
926            },
927        ];
928        let lib_res = uws
929            .into_tokenizer(TokenizerParams::v1())
930            .collect::<Vec<_>>();
931        check_results(&result, &lib_res, uws);
932    }
933
934    #[test]
935    fn mixed_but_word() {
936        let uws = "L’Oreal";
937        let result = vec![PositionalToken {
938            source: uws,
939            offset: 0,
940            length: 9,
941            token: Token::Word(Word::StrangeWord("L’Oreal".to_string())),
942        }];
943        let lib_res = uws
944            .into_tokenizer(TokenizerParams::v1())
945            .collect::<Vec<_>>();
946        check_results(&result, &lib_res, uws);
947    }
948
949    #[test]
950    fn hashtags() {
951        let uws = "#hashtag#hashtag2";
952        let result = vec![
953            PositionalToken {
954                source: uws,
955                offset: 0,
956                length: 1,
957                token: Token::Special(Special::Punctuation('#')),
958            },
959            PositionalToken {
960                source: uws,
961                offset: 1,
962                length: 7,
963                token: Token::Word(Word::Word("hashtag".to_string())),
964            },
965            PositionalToken {
966                source: uws,
967                offset: 8,
968                length: 1,
969                token: Token::Special(Special::Punctuation('#')),
970            },
971            PositionalToken {
972                source: uws,
973                offset: 9,
974                length: 8,
975                token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
976                    "hashtag2".to_string(),
977                ))),
978            },
979        ];
980        let lib_res = uws
981            .into_tokenizer(TokenizerParams::v1())
982            .collect::<Vec<_>>();
983        check_results(&result, &lib_res, uws);
984    }
985
986    #[test]
987    fn apostrophe() {
988        let uws = "l'oreal; l\u{0060}oreal";
989        let result = vec![
990            PositionalToken {
991                source: uws,
992                offset: 0,
993                length: 7,
994                token: Token::Word(Word::Word("l'oreal".to_string())),
995            },
996            PositionalToken {
997                source: uws,
998                offset: 7,
999                length: 1,
1000                token: Token::Special(Special::Punctuation(';')),
1001            },
1002            PositionalToken {
1003                source: uws,
1004                offset: 8,
1005                length: 1,
1006                token: Token::Special(Special::Separator(Separator::Space)),
1007            },
1008            PositionalToken {
1009                source: uws,
1010                offset: 9,
1011                length: 7,
1012                token: Token::Word(Word::Word("l'oreal".to_string())),
1013            },
1014        ];
1015        let text = Text::new(uws.into_source()).unwrap();
1016        let lib_res = text
1017            .into_tokenizer(TokenizerParams::v1())
1018            .filter_map(|tt| tt.into_original_token_1())
1019            .collect::<Vec<_>>();
1020        check_results(&result, &lib_res, uws);
1021    }
1022
1023    #[test]
1024    fn char_tokens() {
1025        let uws = "[Oxana Putan|1712640565] shared the quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A  asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово";
1026        let result = vec![
1027            CharToken {
1028                byte_offset: 0,
1029                byte_length: 1,
1030                char_offset: 0,
1031                char_length: 1,
1032                token: Token::Special(Special::Punctuation('[')),
1033            },
1034            CharToken {
1035                byte_offset: 1,
1036                byte_length: 5,
1037                char_offset: 1,
1038                char_length: 5,
1039                token: Token::Word(Word::Word("Oxana".to_string())),
1040            },
1041            CharToken {
1042                byte_offset: 6,
1043                byte_length: 1,
1044                char_offset: 6,
1045                char_length: 1,
1046                token: Token::Special(Special::Separator(Separator::Space)),
1047            },
1048            CharToken {
1049                byte_offset: 7,
1050                byte_length: 5,
1051                char_offset: 7,
1052                char_length: 5,
1053                token: Token::Word(Word::Word("Putan".to_string())),
1054            },
1055            CharToken {
1056                byte_offset: 12,
1057                byte_length: 1,
1058                char_offset: 12,
1059                char_length: 1,
1060                token: Token::Special(Special::Punctuation('|')),
1061            },
1062            CharToken {
1063                byte_offset: 13,
1064                byte_length: 10,
1065                char_offset: 13,
1066                char_length: 10,
1067                token: Token::Word(Word::Number(Number::Integer(1712640565))),
1068            },
1069            CharToken {
1070                byte_offset: 23,
1071                byte_length: 1,
1072                char_offset: 23,
1073                char_length: 1,
1074                token: Token::Special(Special::Punctuation(']')),
1075            },
1076            /*CharToken { byte_offset: 0, byte_length: 24, char_offset: 0, char_length: 24, token: Token::BBCode { left: vec![
1077            CharToken { byte_offset: 1, byte_length: 5, char_offset: 1, char_length: 5, token: Token::Word(Word::Word("Oxana".to_string())) },
1078            CharToken { byte_offset: 6, byte_length: 1, char_offset: 6, char_length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
1079            CharToken { byte_offset: 7, byte_length: 5, char_offset: 7, char_length: 5, token: Token::Word(Word::Word("Putan".to_string())) },
1080            ], right: vec![
1081            CharToken { byte_offset: 13, byte_length: 10, char_offset: 13, char_length: 10, token: Token::Word(Word::Number(Number::Integer(1712640565))) },
1082            ] } },*/
1083            CharToken {
1084                byte_offset: 24,
1085                byte_length: 1,
1086                char_offset: 24,
1087                char_length: 1,
1088                token: Token::Special(Special::Separator(Separator::Space)),
1089            },
1090            CharToken {
1091                byte_offset: 25,
1092                byte_length: 6,
1093                char_offset: 25,
1094                char_length: 6,
1095                token: Token::Word(Word::Word("shared".to_string())),
1096            },
1097            CharToken {
1098                byte_offset: 31,
1099                byte_length: 1,
1100                char_offset: 31,
1101                char_length: 1,
1102                token: Token::Special(Special::Separator(Separator::Space)),
1103            },
1104            CharToken {
1105                byte_offset: 32,
1106                byte_length: 3,
1107                char_offset: 32,
1108                char_length: 3,
1109                token: Token::Word(Word::Word("the".to_string())),
1110            },
1111            CharToken {
1112                byte_offset: 35,
1113                byte_length: 1,
1114                char_offset: 35,
1115                char_length: 1,
1116                token: Token::Special(Special::Separator(Separator::Space)),
1117            },
1118            CharToken {
1119                byte_offset: 36,
1120                byte_length: 5,
1121                char_offset: 36,
1122                char_length: 5,
1123                token: Token::Word(Word::Word("quick".to_string())),
1124            },
1125            CharToken {
1126                byte_offset: 41,
1127                byte_length: 1,
1128                char_offset: 41,
1129                char_length: 1,
1130                token: Token::Special(Special::Separator(Separator::Space)),
1131            },
1132            CharToken {
1133                byte_offset: 42,
1134                byte_length: 1,
1135                char_offset: 42,
1136                char_length: 1,
1137                token: Token::Special(Special::Punctuation('(')),
1138            },
1139            CharToken {
1140                byte_offset: 43,
1141                byte_length: 1,
1142                char_offset: 43,
1143                char_length: 1,
1144                token: Token::Special(Special::Punctuation('"')),
1145            },
1146            CharToken {
1147                byte_offset: 44,
1148                byte_length: 5,
1149                char_offset: 44,
1150                char_length: 5,
1151                token: Token::Word(Word::Word("brown".to_string())),
1152            },
1153            CharToken {
1154                byte_offset: 49,
1155                byte_length: 1,
1156                char_offset: 49,
1157                char_length: 1,
1158                token: Token::Special(Special::Punctuation('"')),
1159            },
1160            CharToken {
1161                byte_offset: 50,
1162                byte_length: 1,
1163                char_offset: 50,
1164                char_length: 1,
1165                token: Token::Special(Special::Punctuation(')')),
1166            },
1167            CharToken {
1168                byte_offset: 51,
1169                byte_length: 1,
1170                char_offset: 51,
1171                char_length: 1,
1172                token: Token::Special(Special::Separator(Separator::Space)),
1173            },
1174            CharToken {
1175                byte_offset: 52,
1176                byte_length: 3,
1177                char_offset: 52,
1178                char_length: 3,
1179                token: Token::Word(Word::Word("fox".to_string())),
1180            },
1181            CharToken {
1182                byte_offset: 55,
1183                byte_length: 1,
1184                char_offset: 55,
1185                char_length: 1,
1186                token: Token::Special(Special::Separator(Separator::Space)),
1187            },
1188            CharToken {
1189                byte_offset: 56,
1190                byte_length: 5,
1191                char_offset: 56,
1192                char_length: 5,
1193                token: Token::Word(Word::Word("can\'t".to_string())),
1194            },
1195            CharToken {
1196                byte_offset: 61,
1197                byte_length: 1,
1198                char_offset: 61,
1199                char_length: 1,
1200                token: Token::Special(Special::Separator(Separator::Space)),
1201            },
1202            CharToken {
1203                byte_offset: 62,
1204                byte_length: 4,
1205                char_offset: 62,
1206                char_length: 4,
1207                token: Token::Word(Word::Word("jump".to_string())),
1208            },
1209            CharToken {
1210                byte_offset: 66,
1211                byte_length: 1,
1212                char_offset: 66,
1213                char_length: 1,
1214                token: Token::Special(Special::Separator(Separator::Space)),
1215            },
1216            CharToken {
1217                byte_offset: 67,
1218                byte_length: 4,
1219                char_offset: 67,
1220                char_length: 4,
1221                token: Token::Word(Word::Number(Number::Float(32.3))),
1222            },
1223            CharToken {
1224                byte_offset: 71,
1225                byte_length: 1,
1226                char_offset: 71,
1227                char_length: 1,
1228                token: Token::Special(Special::Separator(Separator::Space)),
1229            },
1230            CharToken {
1231                byte_offset: 72,
1232                byte_length: 4,
1233                char_offset: 72,
1234                char_length: 4,
1235                token: Token::Word(Word::Word("feet".to_string())),
1236            },
1237            CharToken {
1238                byte_offset: 76,
1239                byte_length: 1,
1240                char_offset: 76,
1241                char_length: 1,
1242                token: Token::Special(Special::Punctuation(',')),
1243            },
1244            CharToken {
1245                byte_offset: 77,
1246                byte_length: 1,
1247                char_offset: 77,
1248                char_length: 1,
1249                token: Token::Special(Special::Separator(Separator::Space)),
1250            },
1251            CharToken {
1252                byte_offset: 78,
1253                byte_length: 5,
1254                char_offset: 78,
1255                char_length: 5,
1256                token: Token::Word(Word::Word("right".to_string())),
1257            },
1258            CharToken {
1259                byte_offset: 83,
1260                byte_length: 1,
1261                char_offset: 83,
1262                char_length: 1,
1263                token: Token::Special(Special::Punctuation('?')),
1264            },
1265            CharToken {
1266                byte_offset: 84,
1267                byte_length: 1,
1268                char_offset: 84,
1269                char_length: 1,
1270                token: Token::Special(Special::Separator(Separator::Space)),
1271            },
1272            CharToken {
1273                byte_offset: 85,
1274                byte_length: 4,
1275                char_offset: 85,
1276                char_length: 4,
1277                token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
1278            },
1279            CharToken {
1280                byte_offset: 89,
1281                byte_length: 1,
1282                char_offset: 89,
1283                char_length: 1,
1284                token: Token::Special(Special::Separator(Separator::Space)),
1285            },
1286            CharToken {
1287                byte_offset: 90,
1288                byte_length: 3,
1289                char_offset: 90,
1290                char_length: 3,
1291                token: Token::Word(Word::Word("etc".to_string())),
1292            },
1293            CharToken {
1294                byte_offset: 93,
1295                byte_length: 1,
1296                char_offset: 93,
1297                char_length: 1,
1298                token: Token::Special(Special::Punctuation('.')),
1299            },
1300            CharToken {
1301                byte_offset: 94,
1302                byte_length: 1,
1303                char_offset: 94,
1304                char_length: 1,
1305                token: Token::Special(Special::Separator(Separator::Space)),
1306            },
1307            CharToken {
1308                byte_offset: 95,
1309                byte_length: 3,
1310                char_offset: 95,
1311                char_length: 3,
1312                token: Token::Word(Word::Word("qeq".to_string())),
1313            },
1314            CharToken {
1315                byte_offset: 98,
1316                byte_length: 1,
1317                char_offset: 98,
1318                char_length: 1,
1319                token: Token::Special(Special::Separator(Separator::Space)),
1320            },
1321            CharToken {
1322                byte_offset: 99,
1323                byte_length: 5,
1324                char_offset: 99,
1325                char_length: 5,
1326                token: Token::Word(Word::Word("U.S.A".to_string())),
1327            },
1328            CharToken {
1329                byte_offset: 104,
1330                byte_length: 2,
1331                char_offset: 104,
1332                char_length: 2,
1333                token: Token::Special(Special::Separator(Separator::Space)),
1334            },
1335            CharToken {
1336                byte_offset: 106,
1337                byte_length: 3,
1338                char_offset: 106,
1339                char_length: 3,
1340                token: Token::Word(Word::Word("asd".to_string())),
1341            },
1342            CharToken {
1343                byte_offset: 109,
1344                byte_length: 3,
1345                char_offset: 109,
1346                char_length: 3,
1347                token: Token::Special(Special::Separator(Separator::Newline)),
1348            },
1349            CharToken {
1350                byte_offset: 112,
1351                byte_length: 3,
1352                char_offset: 112,
1353                char_length: 3,
1354                token: Token::Word(Word::Word("Brr".to_string())),
1355            },
1356            CharToken {
1357                byte_offset: 115,
1358                byte_length: 1,
1359                char_offset: 115,
1360                char_length: 1,
1361                token: Token::Special(Special::Punctuation(',')),
1362            },
1363            CharToken {
1364                byte_offset: 116,
1365                byte_length: 1,
1366                char_offset: 116,
1367                char_length: 1,
1368                token: Token::Special(Special::Separator(Separator::Space)),
1369            },
1370            CharToken {
1371                byte_offset: 117,
1372                byte_length: 4,
1373                char_offset: 117,
1374                char_length: 4,
1375                token: Token::Word(Word::Word("it\'s".to_string())),
1376            },
1377            CharToken {
1378                byte_offset: 121,
1379                byte_length: 1,
1380                char_offset: 121,
1381                char_length: 1,
1382                token: Token::Special(Special::Separator(Separator::Space)),
1383            },
1384            CharToken {
1385                byte_offset: 122,
1386                byte_length: 4,
1387                char_offset: 122,
1388                char_length: 4,
1389                token: Token::Word(Word::Number(Number::Float(29.3))),
1390            },
1391            CharToken {
1392                byte_offset: 126,
1393                byte_length: 2,
1394                char_offset: 126,
1395                char_length: 1,
1396                token: Token::Special(Special::Symbol('°')),
1397            },
1398            CharToken {
1399                byte_offset: 128,
1400                byte_length: 1,
1401                char_offset: 127,
1402                char_length: 1,
1403                token: Token::Word(Word::Word("F".to_string())),
1404            },
1405            CharToken {
1406                byte_offset: 129,
1407                byte_length: 1,
1408                char_offset: 128,
1409                char_length: 1,
1410                token: Token::Special(Special::Punctuation('!')),
1411            },
1412            CharToken {
1413                byte_offset: 130,
1414                byte_length: 1,
1415                char_offset: 129,
1416                char_length: 1,
1417                token: Token::Special(Special::Separator(Separator::Newline)),
1418            },
1419            CharToken {
1420                byte_offset: 131,
1421                byte_length: 1,
1422                char_offset: 130,
1423                char_length: 1,
1424                token: Token::Special(Special::Separator(Separator::Space)),
1425            },
1426            CharToken {
1427                byte_offset: 132,
1428                byte_length: 14,
1429                char_offset: 131,
1430                char_length: 7,
1431                token: Token::Word(Word::Word("Русское".to_string())),
1432            },
1433            CharToken {
1434                byte_offset: 146,
1435                byte_length: 1,
1436                char_offset: 138,
1437                char_length: 1,
1438                token: Token::Special(Special::Separator(Separator::Space)),
1439            },
1440            CharToken {
1441                byte_offset: 147,
1442                byte_length: 22,
1443                char_offset: 139,
1444                char_length: 11,
1445                token: Token::Word(Word::Word("предложение".to_string())),
1446            },
1447            CharToken {
1448                byte_offset: 169,
1449                byte_length: 1,
1450                char_offset: 150,
1451                char_length: 1,
1452                token: Token::Special(Special::Separator(Separator::Space)),
1453            },
1454            CharToken {
1455                byte_offset: 170,
1456                byte_length: 5,
1457                char_offset: 151,
1458                char_length: 5,
1459                token: Token::Struct(Struct::Hashtag("36.6".to_string())),
1460            },
1461            CharToken {
1462                byte_offset: 175,
1463                byte_length: 1,
1464                char_offset: 156,
1465                char_length: 1,
1466                token: Token::Special(Special::Separator(Separator::Space)),
1467            },
1468            CharToken {
1469                byte_offset: 176,
1470                byte_length: 6,
1471                char_offset: 157,
1472                char_length: 3,
1473                token: Token::Word(Word::Word("для".to_string())),
1474            },
1475            CharToken {
1476                byte_offset: 182,
1477                byte_length: 1,
1478                char_offset: 160,
1479                char_length: 1,
1480                token: Token::Special(Special::Separator(Separator::Space)),
1481            },
1482            CharToken {
1483                byte_offset: 183,
1484                byte_length: 24,
1485                char_offset: 161,
1486                char_length: 12,
1487                token: Token::Word(Word::Word("тестирования".to_string())),
1488            },
1489            CharToken {
1490                byte_offset: 207,
1491                byte_length: 1,
1492                char_offset: 173,
1493                char_length: 1,
1494                token: Token::Special(Special::Separator(Separator::Space)),
1495            },
1496            CharToken {
1497                byte_offset: 208,
1498                byte_length: 14,
1499                char_offset: 174,
1500                char_length: 7,
1501                token: Token::Word(Word::Word("деления".to_string())),
1502            },
1503            CharToken {
1504                byte_offset: 222,
1505                byte_length: 1,
1506                char_offset: 181,
1507                char_length: 1,
1508                token: Token::Special(Special::Separator(Separator::Space)),
1509            },
1510            CharToken {
1511                byte_offset: 223,
1512                byte_length: 4,
1513                char_offset: 182,
1514                char_length: 2,
1515                token: Token::Word(Word::Word("по".to_string())),
1516            },
1517            CharToken {
1518                byte_offset: 227,
1519                byte_length: 1,
1520                char_offset: 184,
1521                char_length: 1,
1522                token: Token::Special(Special::Separator(Separator::Space)),
1523            },
1524            CharToken {
1525                byte_offset: 228,
1526                byte_length: 12,
1527                char_offset: 185,
1528                char_length: 6,
1529                token: Token::Word(Word::Word("юникод".to_string())),
1530            },
1531            CharToken {
1532                byte_offset: 240,
1533                byte_length: 1,
1534                char_offset: 191,
1535                char_length: 1,
1536                token: Token::Special(Special::Punctuation('-')),
1537            },
1538            CharToken {
1539                byte_offset: 241,
1540                byte_length: 12,
1541                char_offset: 192,
1542                char_length: 6,
1543                token: Token::Word(Word::Word("словам".to_string())),
1544            },
1545            CharToken {
1546                byte_offset: 253,
1547                byte_length: 3,
1548                char_offset: 198,
1549                char_length: 3,
1550                token: Token::Special(Special::Punctuation('.')),
1551            },
1552            CharToken {
1553                byte_offset: 256,
1554                byte_length: 1,
1555                char_offset: 201,
1556                char_length: 1,
1557                token: Token::Special(Special::Separator(Separator::Newline)),
1558            },
1559            CharToken {
1560                byte_offset: 257,
1561                byte_length: 8,
1562                char_offset: 202,
1563                char_length: 2,
1564                token: Token::Word(Word::Emoji("russia")),
1565            },
1566            CharToken {
1567                byte_offset: 265,
1568                byte_length: 1,
1569                char_offset: 204,
1570                char_length: 1,
1571                token: Token::Special(Special::Separator(Separator::Space)),
1572            },
1573            CharToken {
1574                byte_offset: 266,
1575                byte_length: 8,
1576                char_offset: 205,
1577                char_length: 2,
1578                token: Token::Word(Word::Emoji("sao_tome_and_principe")),
1579            },
1580            CharToken {
1581                byte_offset: 274,
1582                byte_length: 1,
1583                char_offset: 207,
1584                char_length: 1,
1585                token: Token::Special(Special::Separator(Separator::Newline)),
1586            },
1587            CharToken {
1588                byte_offset: 275,
1589                byte_length: 8,
1590                char_offset: 208,
1591                char_length: 2,
1592                token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
1593            },
1594            CharToken {
1595                byte_offset: 283,
1596                byte_length: 8,
1597                char_offset: 210,
1598                char_length: 2,
1599                token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
1600            },
1601            CharToken {
1602                byte_offset: 291,
1603                byte_length: 8,
1604                char_offset: 212,
1605                char_length: 2,
1606                token: Token::Word(Word::Emoji("man_medium_skin_tone")),
1607            },
1608            CharToken {
1609                byte_offset: 299,
1610                byte_length: 1,
1611                char_offset: 214,
1612                char_length: 1,
1613                token: Token::Special(Special::Separator(Separator::Newline)),
1614            },
1615            CharToken {
1616                byte_offset: 300,
1617                byte_length: 1,
1618                char_offset: 215,
1619                char_length: 1,
1620                token: Token::Special(Special::Punctuation('+')),
1621            },
1622            CharToken {
1623                byte_offset: 301,
1624                byte_length: 4,
1625                char_offset: 216,
1626                char_length: 4,
1627                token: Token::Word(Word::Word("Done".to_string())),
1628            },
1629            CharToken {
1630                byte_offset: 305,
1631                byte_length: 1,
1632                char_offset: 220,
1633                char_length: 1,
1634                token: Token::Special(Special::Punctuation('!')),
1635            },
1636            CharToken {
1637                byte_offset: 306,
1638                byte_length: 1,
1639                char_offset: 221,
1640                char_length: 1,
1641                token: Token::Special(Special::Separator(Separator::Space)),
1642            },
1643            CharToken {
1644                byte_offset: 307,
1645                byte_length: 12,
1646                char_offset: 222,
1647                char_length: 6,
1648                token: Token::Word(Word::Word("Готово".to_string())),
1649            },
1650        ];
1651
1652        let lib_res = uws
1653            .into_tokenizer(TokenizerParams::complex())
1654            .collect::<Vec<_>>();
1655
1656        //print_cresult(); panic!();
1657        check_cresults(&result, &lib_res, uws);
1658    }
1659
1660    #[test]
1661    fn general_default() {
1662        let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A  asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
1663        let result = vec![
1664            PositionalToken {
1665                source: uws,
1666                offset: 0,
1667                length: 3,
1668                token: Token::Word(Word::Word("The".to_string())),
1669            },
1670            PositionalToken {
1671                source: uws,
1672                offset: 3,
1673                length: 1,
1674                token: Token::Special(Special::Separator(Separator::Space)),
1675            },
1676            PositionalToken {
1677                source: uws,
1678                offset: 4,
1679                length: 5,
1680                token: Token::Word(Word::Word("quick".to_string())),
1681            },
1682            PositionalToken {
1683                source: uws,
1684                offset: 9,
1685                length: 1,
1686                token: Token::Special(Special::Separator(Separator::Space)),
1687            },
1688            PositionalToken {
1689                source: uws,
1690                offset: 10,
1691                length: 1,
1692                token: Token::Special(Special::Punctuation('(')),
1693            },
1694            PositionalToken {
1695                source: uws,
1696                offset: 11,
1697                length: 1,
1698                token: Token::Special(Special::Punctuation('"')),
1699            },
1700            PositionalToken {
1701                source: uws,
1702                offset: 12,
1703                length: 5,
1704                token: Token::Word(Word::Word("brown".to_string())),
1705            },
1706            PositionalToken {
1707                source: uws,
1708                offset: 17,
1709                length: 1,
1710                token: Token::Special(Special::Punctuation('"')),
1711            },
1712            PositionalToken {
1713                source: uws,
1714                offset: 18,
1715                length: 1,
1716                token: Token::Special(Special::Punctuation(')')),
1717            },
1718            PositionalToken {
1719                source: uws,
1720                offset: 19,
1721                length: 1,
1722                token: Token::Special(Special::Separator(Separator::Space)),
1723            },
1724            PositionalToken {
1725                source: uws,
1726                offset: 20,
1727                length: 3,
1728                token: Token::Word(Word::Word("fox".to_string())),
1729            },
1730            PositionalToken {
1731                source: uws,
1732                offset: 23,
1733                length: 1,
1734                token: Token::Special(Special::Separator(Separator::Space)),
1735            },
1736            PositionalToken {
1737                source: uws,
1738                offset: 24,
1739                length: 5,
1740                token: Token::Word(Word::Word("can\'t".to_string())),
1741            },
1742            PositionalToken {
1743                source: uws,
1744                offset: 29,
1745                length: 1,
1746                token: Token::Special(Special::Separator(Separator::Space)),
1747            },
1748            PositionalToken {
1749                source: uws,
1750                offset: 30,
1751                length: 4,
1752                token: Token::Word(Word::Word("jump".to_string())),
1753            },
1754            PositionalToken {
1755                source: uws,
1756                offset: 34,
1757                length: 1,
1758                token: Token::Special(Special::Separator(Separator::Space)),
1759            },
1760            PositionalToken {
1761                source: uws,
1762                offset: 35,
1763                length: 4,
1764                token: Token::Word(Word::Number(Number::Float(32.3))),
1765            },
1766            PositionalToken {
1767                source: uws,
1768                offset: 39,
1769                length: 1,
1770                token: Token::Special(Special::Separator(Separator::Space)),
1771            },
1772            PositionalToken {
1773                source: uws,
1774                offset: 40,
1775                length: 4,
1776                token: Token::Word(Word::Word("feet".to_string())),
1777            },
1778            PositionalToken {
1779                source: uws,
1780                offset: 44,
1781                length: 1,
1782                token: Token::Special(Special::Punctuation(',')),
1783            },
1784            PositionalToken {
1785                source: uws,
1786                offset: 45,
1787                length: 1,
1788                token: Token::Special(Special::Separator(Separator::Space)),
1789            },
1790            PositionalToken {
1791                source: uws,
1792                offset: 46,
1793                length: 5,
1794                token: Token::Word(Word::Word("right".to_string())),
1795            },
1796            PositionalToken {
1797                source: uws,
1798                offset: 51,
1799                length: 1,
1800                token: Token::Special(Special::Punctuation('?')),
1801            },
1802            PositionalToken {
1803                source: uws,
1804                offset: 52,
1805                length: 1,
1806                token: Token::Special(Special::Separator(Separator::Space)),
1807            },
1808            PositionalToken {
1809                source: uws,
1810                offset: 53,
1811                length: 4,
1812                token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
1813            }, // TODO
1814            PositionalToken {
1815                source: uws,
1816                offset: 57,
1817                length: 1,
1818                token: Token::Special(Special::Separator(Separator::Space)),
1819            },
1820            PositionalToken {
1821                source: uws,
1822                offset: 58,
1823                length: 3,
1824                token: Token::Word(Word::Word("etc".to_string())),
1825            },
1826            PositionalToken {
1827                source: uws,
1828                offset: 61,
1829                length: 1,
1830                token: Token::Special(Special::Punctuation('.')),
1831            },
1832            PositionalToken {
1833                source: uws,
1834                offset: 62,
1835                length: 1,
1836                token: Token::Special(Special::Separator(Separator::Space)),
1837            },
1838            PositionalToken {
1839                source: uws,
1840                offset: 63,
1841                length: 3,
1842                token: Token::Word(Word::Word("qeq".to_string())),
1843            },
1844            PositionalToken {
1845                source: uws,
1846                offset: 66,
1847                length: 1,
1848                token: Token::Special(Special::Separator(Separator::Space)),
1849            },
1850            PositionalToken {
1851                source: uws,
1852                offset: 67,
1853                length: 1,
1854                token: Token::Word(Word::Word("U".to_string())),
1855            },
1856            PositionalToken {
1857                source: uws,
1858                offset: 68,
1859                length: 1,
1860                token: Token::Special(Special::Punctuation('.')),
1861            },
1862            PositionalToken {
1863                source: uws,
1864                offset: 69,
1865                length: 1,
1866                token: Token::Word(Word::Word("S".to_string())),
1867            },
1868            PositionalToken {
1869                source: uws,
1870                offset: 70,
1871                length: 1,
1872                token: Token::Special(Special::Punctuation('.')),
1873            },
1874            PositionalToken {
1875                source: uws,
1876                offset: 71,
1877                length: 1,
1878                token: Token::Word(Word::Word("A".to_string())),
1879            },
1880            PositionalToken {
1881                source: uws,
1882                offset: 72,
1883                length: 2,
1884                token: Token::Special(Special::Separator(Separator::Space)),
1885            },
1886            PositionalToken {
1887                source: uws,
1888                offset: 74,
1889                length: 3,
1890                token: Token::Word(Word::Word("asd".to_string())),
1891            },
1892            PositionalToken {
1893                source: uws,
1894                offset: 77,
1895                length: 3,
1896                token: Token::Special(Special::Separator(Separator::Newline)),
1897            },
1898            PositionalToken {
1899                source: uws,
1900                offset: 80,
1901                length: 3,
1902                token: Token::Word(Word::Word("Brr".to_string())),
1903            },
1904            PositionalToken {
1905                source: uws,
1906                offset: 83,
1907                length: 1,
1908                token: Token::Special(Special::Punctuation(',')),
1909            },
1910            PositionalToken {
1911                source: uws,
1912                offset: 84,
1913                length: 1,
1914                token: Token::Special(Special::Separator(Separator::Space)),
1915            },
1916            PositionalToken {
1917                source: uws,
1918                offset: 85,
1919                length: 4,
1920                token: Token::Word(Word::Word("it\'s".to_string())),
1921            },
1922            PositionalToken {
1923                source: uws,
1924                offset: 89,
1925                length: 1,
1926                token: Token::Special(Special::Separator(Separator::Space)),
1927            },
1928            PositionalToken {
1929                source: uws,
1930                offset: 90,
1931                length: 4,
1932                token: Token::Word(Word::Number(Number::Float(29.3))),
1933            },
1934            PositionalToken {
1935                source: uws,
1936                offset: 94,
1937                length: 2,
1938                token: Token::Special(Special::Symbol('°')),
1939            },
1940            PositionalToken {
1941                source: uws,
1942                offset: 96,
1943                length: 1,
1944                token: Token::Word(Word::Word("F".to_string())),
1945            },
1946            PositionalToken {
1947                source: uws,
1948                offset: 97,
1949                length: 1,
1950                token: Token::Special(Special::Punctuation('!')),
1951            },
1952            PositionalToken {
1953                source: uws,
1954                offset: 98,
1955                length: 1,
1956                token: Token::Special(Special::Separator(Separator::Newline)),
1957            },
1958            PositionalToken {
1959                source: uws,
1960                offset: 99,
1961                length: 1,
1962                token: Token::Special(Special::Separator(Separator::Space)),
1963            },
1964            PositionalToken {
1965                source: uws,
1966                offset: 100,
1967                length: 14,
1968                token: Token::Word(Word::Word("Русское".to_string())),
1969            },
1970            PositionalToken {
1971                source: uws,
1972                offset: 114,
1973                length: 1,
1974                token: Token::Special(Special::Separator(Separator::Space)),
1975            },
1976            PositionalToken {
1977                source: uws,
1978                offset: 115,
1979                length: 22,
1980                token: Token::Word(Word::Word("предложение".to_string())),
1981            },
1982            PositionalToken {
1983                source: uws,
1984                offset: 137,
1985                length: 1,
1986                token: Token::Special(Special::Separator(Separator::Space)),
1987            },
1988            PositionalToken {
1989                source: uws,
1990                offset: 138,
1991                length: 1,
1992                token: Token::Special(Special::Punctuation('#')),
1993            },
1994            PositionalToken {
1995                source: uws,
1996                offset: 139,
1997                length: 4,
1998                token: Token::Word(Word::Number(Number::Float(36.6))),
1999            },
2000            PositionalToken {
2001                source: uws,
2002                offset: 143,
2003                length: 1,
2004                token: Token::Special(Special::Separator(Separator::Space)),
2005            },
2006            PositionalToken {
2007                source: uws,
2008                offset: 144,
2009                length: 6,
2010                token: Token::Word(Word::Word("для".to_string())),
2011            },
2012            PositionalToken {
2013                source: uws,
2014                offset: 150,
2015                length: 1,
2016                token: Token::Special(Special::Separator(Separator::Space)),
2017            },
2018            PositionalToken {
2019                source: uws,
2020                offset: 151,
2021                length: 24,
2022                token: Token::Word(Word::Word("тестирования".to_string())),
2023            },
2024            PositionalToken {
2025                source: uws,
2026                offset: 175,
2027                length: 1,
2028                token: Token::Special(Special::Separator(Separator::Space)),
2029            },
2030            PositionalToken {
2031                source: uws,
2032                offset: 176,
2033                length: 14,
2034                token: Token::Word(Word::Word("деления".to_string())),
2035            },
2036            PositionalToken {
2037                source: uws,
2038                offset: 190,
2039                length: 1,
2040                token: Token::Special(Special::Separator(Separator::Space)),
2041            },
2042            PositionalToken {
2043                source: uws,
2044                offset: 191,
2045                length: 4,
2046                token: Token::Word(Word::Word("по".to_string())),
2047            },
2048            PositionalToken {
2049                source: uws,
2050                offset: 195,
2051                length: 1,
2052                token: Token::Special(Special::Separator(Separator::Space)),
2053            },
2054            PositionalToken {
2055                source: uws,
2056                offset: 196,
2057                length: 12,
2058                token: Token::Word(Word::Word("юникод".to_string())),
2059            },
2060            PositionalToken {
2061                source: uws,
2062                offset: 208,
2063                length: 1,
2064                token: Token::Special(Special::Punctuation('-')),
2065            },
2066            PositionalToken {
2067                source: uws,
2068                offset: 209,
2069                length: 12,
2070                token: Token::Word(Word::Word("словам".to_string())),
2071            },
2072            PositionalToken {
2073                source: uws,
2074                offset: 221,
2075                length: 3,
2076                token: Token::Special(Special::Punctuation('.')),
2077            },
2078            PositionalToken {
2079                source: uws,
2080                offset: 224,
2081                length: 1,
2082                token: Token::Special(Special::Separator(Separator::Newline)),
2083            },
2084        ];
2085        let lib_res = uws
2086            .into_tokenizer(TokenizerParams::v1())
2087            .collect::<Vec<_>>();
2088        check_results(&result, &lib_res, uws);
2089    }
2090
2091    #[test]
2092    fn general_no_split() {
2093        let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A  asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
2094        let result = vec![
2095            PositionalToken {
2096                source: uws,
2097                offset: 0,
2098                length: 3,
2099                token: Token::Word(Word::Word("The".to_string())),
2100            },
2101            PositionalToken {
2102                source: uws,
2103                offset: 3,
2104                length: 1,
2105                token: Token::Special(Special::Separator(Separator::Space)),
2106            },
2107            PositionalToken {
2108                source: uws,
2109                offset: 4,
2110                length: 5,
2111                token: Token::Word(Word::Word("quick".to_string())),
2112            },
2113            PositionalToken {
2114                source: uws,
2115                offset: 9,
2116                length: 1,
2117                token: Token::Special(Special::Separator(Separator::Space)),
2118            },
2119            PositionalToken {
2120                source: uws,
2121                offset: 10,
2122                length: 1,
2123                token: Token::Special(Special::Punctuation('(')),
2124            },
2125            PositionalToken {
2126                source: uws,
2127                offset: 11,
2128                length: 1,
2129                token: Token::Special(Special::Punctuation('"')),
2130            },
2131            PositionalToken {
2132                source: uws,
2133                offset: 12,
2134                length: 5,
2135                token: Token::Word(Word::Word("brown".to_string())),
2136            },
2137            PositionalToken {
2138                source: uws,
2139                offset: 17,
2140                length: 1,
2141                token: Token::Special(Special::Punctuation('"')),
2142            },
2143            PositionalToken {
2144                source: uws,
2145                offset: 18,
2146                length: 1,
2147                token: Token::Special(Special::Punctuation(')')),
2148            },
2149            PositionalToken {
2150                source: uws,
2151                offset: 19,
2152                length: 1,
2153                token: Token::Special(Special::Separator(Separator::Space)),
2154            },
2155            PositionalToken {
2156                source: uws,
2157                offset: 20,
2158                length: 3,
2159                token: Token::Word(Word::Word("fox".to_string())),
2160            },
2161            PositionalToken {
2162                source: uws,
2163                offset: 23,
2164                length: 1,
2165                token: Token::Special(Special::Separator(Separator::Space)),
2166            },
2167            PositionalToken {
2168                source: uws,
2169                offset: 24,
2170                length: 5,
2171                token: Token::Word(Word::Word("can\'t".to_string())),
2172            },
2173            PositionalToken {
2174                source: uws,
2175                offset: 29,
2176                length: 1,
2177                token: Token::Special(Special::Separator(Separator::Space)),
2178            },
2179            PositionalToken {
2180                source: uws,
2181                offset: 30,
2182                length: 4,
2183                token: Token::Word(Word::Word("jump".to_string())),
2184            },
2185            PositionalToken {
2186                source: uws,
2187                offset: 34,
2188                length: 1,
2189                token: Token::Special(Special::Separator(Separator::Space)),
2190            },
2191            PositionalToken {
2192                source: uws,
2193                offset: 35,
2194                length: 4,
2195                token: Token::Word(Word::Number(Number::Float(32.3))),
2196            },
2197            PositionalToken {
2198                source: uws,
2199                offset: 39,
2200                length: 1,
2201                token: Token::Special(Special::Separator(Separator::Space)),
2202            },
2203            PositionalToken {
2204                source: uws,
2205                offset: 40,
2206                length: 4,
2207                token: Token::Word(Word::Word("feet".to_string())),
2208            },
2209            PositionalToken {
2210                source: uws,
2211                offset: 44,
2212                length: 1,
2213                token: Token::Special(Special::Punctuation(',')),
2214            },
2215            PositionalToken {
2216                source: uws,
2217                offset: 45,
2218                length: 1,
2219                token: Token::Special(Special::Separator(Separator::Space)),
2220            },
2221            PositionalToken {
2222                source: uws,
2223                offset: 46,
2224                length: 5,
2225                token: Token::Word(Word::Word("right".to_string())),
2226            },
2227            PositionalToken {
2228                source: uws,
2229                offset: 51,
2230                length: 1,
2231                token: Token::Special(Special::Punctuation('?')),
2232            },
2233            PositionalToken {
2234                source: uws,
2235                offset: 52,
2236                length: 1,
2237                token: Token::Special(Special::Separator(Separator::Space)),
2238            },
2239            PositionalToken {
2240                source: uws,
2241                offset: 53,
2242                length: 4,
2243                token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
2244            }, // TODO
2245            PositionalToken {
2246                source: uws,
2247                offset: 57,
2248                length: 1,
2249                token: Token::Special(Special::Separator(Separator::Space)),
2250            },
2251            PositionalToken {
2252                source: uws,
2253                offset: 58,
2254                length: 3,
2255                token: Token::Word(Word::Word("etc".to_string())),
2256            },
2257            PositionalToken {
2258                source: uws,
2259                offset: 61,
2260                length: 1,
2261                token: Token::Special(Special::Punctuation('.')),
2262            },
2263            PositionalToken {
2264                source: uws,
2265                offset: 62,
2266                length: 1,
2267                token: Token::Special(Special::Separator(Separator::Space)),
2268            },
2269            PositionalToken {
2270                source: uws,
2271                offset: 63,
2272                length: 3,
2273                token: Token::Word(Word::Word("qeq".to_string())),
2274            },
2275            PositionalToken {
2276                source: uws,
2277                offset: 66,
2278                length: 1,
2279                token: Token::Special(Special::Separator(Separator::Space)),
2280            },
2281            PositionalToken {
2282                source: uws,
2283                offset: 67,
2284                length: 5,
2285                token: Token::Word(Word::Word("U.S.A".to_string())),
2286            },
2287            PositionalToken {
2288                source: uws,
2289                offset: 72,
2290                length: 1,
2291                token: Token::Special(Special::Separator(Separator::Space)),
2292            },
2293            PositionalToken {
2294                source: uws,
2295                offset: 73,
2296                length: 1,
2297                token: Token::Special(Special::Separator(Separator::Space)),
2298            },
2299            PositionalToken {
2300                source: uws,
2301                offset: 74,
2302                length: 3,
2303                token: Token::Word(Word::Word("asd".to_string())),
2304            },
2305            PositionalToken {
2306                source: uws,
2307                offset: 77,
2308                length: 1,
2309                token: Token::Special(Special::Separator(Separator::Newline)),
2310            },
2311            PositionalToken {
2312                source: uws,
2313                offset: 78,
2314                length: 1,
2315                token: Token::Special(Special::Separator(Separator::Newline)),
2316            },
2317            PositionalToken {
2318                source: uws,
2319                offset: 79,
2320                length: 1,
2321                token: Token::Special(Special::Separator(Separator::Newline)),
2322            },
2323            PositionalToken {
2324                source: uws,
2325                offset: 80,
2326                length: 3,
2327                token: Token::Word(Word::Word("Brr".to_string())),
2328            },
2329            PositionalToken {
2330                source: uws,
2331                offset: 83,
2332                length: 1,
2333                token: Token::Special(Special::Punctuation(',')),
2334            },
2335            PositionalToken {
2336                source: uws,
2337                offset: 84,
2338                length: 1,
2339                token: Token::Special(Special::Separator(Separator::Space)),
2340            },
2341            PositionalToken {
2342                source: uws,
2343                offset: 85,
2344                length: 4,
2345                token: Token::Word(Word::Word("it\'s".to_string())),
2346            },
2347            PositionalToken {
2348                source: uws,
2349                offset: 89,
2350                length: 1,
2351                token: Token::Special(Special::Separator(Separator::Space)),
2352            },
2353            PositionalToken {
2354                source: uws,
2355                offset: 90,
2356                length: 4,
2357                token: Token::Word(Word::Number(Number::Float(29.3))),
2358            },
2359            PositionalToken {
2360                source: uws,
2361                offset: 94,
2362                length: 2,
2363                token: Token::Special(Special::Symbol('°')),
2364            },
2365            PositionalToken {
2366                source: uws,
2367                offset: 96,
2368                length: 1,
2369                token: Token::Word(Word::Word("F".to_string())),
2370            },
2371            PositionalToken {
2372                source: uws,
2373                offset: 97,
2374                length: 1,
2375                token: Token::Special(Special::Punctuation('!')),
2376            },
2377            PositionalToken {
2378                source: uws,
2379                offset: 98,
2380                length: 1,
2381                token: Token::Special(Special::Separator(Separator::Newline)),
2382            },
2383            PositionalToken {
2384                source: uws,
2385                offset: 99,
2386                length: 1,
2387                token: Token::Special(Special::Separator(Separator::Space)),
2388            },
2389            PositionalToken {
2390                source: uws,
2391                offset: 100,
2392                length: 14,
2393                token: Token::Word(Word::Word("Русское".to_string())),
2394            },
2395            PositionalToken {
2396                source: uws,
2397                offset: 114,
2398                length: 1,
2399                token: Token::Special(Special::Separator(Separator::Space)),
2400            },
2401            PositionalToken {
2402                source: uws,
2403                offset: 115,
2404                length: 22,
2405                token: Token::Word(Word::Word("предложение".to_string())),
2406            },
2407            PositionalToken {
2408                source: uws,
2409                offset: 137,
2410                length: 1,
2411                token: Token::Special(Special::Separator(Separator::Space)),
2412            },
2413            PositionalToken {
2414                source: uws,
2415                offset: 138,
2416                length: 1,
2417                token: Token::Special(Special::Punctuation('#')),
2418            },
2419            PositionalToken {
2420                source: uws,
2421                offset: 139,
2422                length: 4,
2423                token: Token::Word(Word::Number(Number::Float(36.6))),
2424            },
2425            PositionalToken {
2426                source: uws,
2427                offset: 143,
2428                length: 1,
2429                token: Token::Special(Special::Separator(Separator::Space)),
2430            },
2431            PositionalToken {
2432                source: uws,
2433                offset: 144,
2434                length: 6,
2435                token: Token::Word(Word::Word("для".to_string())),
2436            },
2437            PositionalToken {
2438                source: uws,
2439                offset: 150,
2440                length: 1,
2441                token: Token::Special(Special::Separator(Separator::Space)),
2442            },
2443            PositionalToken {
2444                source: uws,
2445                offset: 151,
2446                length: 24,
2447                token: Token::Word(Word::Word("тестирования".to_string())),
2448            },
2449            PositionalToken {
2450                source: uws,
2451                offset: 175,
2452                length: 1,
2453                token: Token::Special(Special::Separator(Separator::Space)),
2454            },
2455            PositionalToken {
2456                source: uws,
2457                offset: 176,
2458                length: 14,
2459                token: Token::Word(Word::Word("деления".to_string())),
2460            },
2461            PositionalToken {
2462                source: uws,
2463                offset: 190,
2464                length: 1,
2465                token: Token::Special(Special::Separator(Separator::Space)),
2466            },
2467            PositionalToken {
2468                source: uws,
2469                offset: 191,
2470                length: 4,
2471                token: Token::Word(Word::Word("по".to_string())),
2472            },
2473            PositionalToken {
2474                source: uws,
2475                offset: 195,
2476                length: 1,
2477                token: Token::Special(Special::Separator(Separator::Space)),
2478            },
2479            PositionalToken {
2480                source: uws,
2481                offset: 196,
2482                length: 12,
2483                token: Token::Word(Word::Word("юникод".to_string())),
2484            },
2485            PositionalToken {
2486                source: uws,
2487                offset: 208,
2488                length: 1,
2489                token: Token::Special(Special::Punctuation('-')),
2490            },
2491            PositionalToken {
2492                source: uws,
2493                offset: 209,
2494                length: 12,
2495                token: Token::Word(Word::Word("словам".to_string())),
2496            },
2497            PositionalToken {
2498                source: uws,
2499                offset: 221,
2500                length: 1,
2501                token: Token::Special(Special::Punctuation('.')),
2502            },
2503            PositionalToken {
2504                source: uws,
2505                offset: 222,
2506                length: 1,
2507                token: Token::Special(Special::Punctuation('.')),
2508            },
2509            PositionalToken {
2510                source: uws,
2511                offset: 223,
2512                length: 1,
2513                token: Token::Special(Special::Punctuation('.')),
2514            },
2515            PositionalToken {
2516                source: uws,
2517                offset: 224,
2518                length: 1,
2519                token: Token::Special(Special::Separator(Separator::Newline)),
2520            },
2521        ];
2522        let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
2523        check_results(&result, &lib_res, uws);
2524    }
2525
2526    #[test]
2527    fn general_complex() {
2528        let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A  asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
2529        let result = vec![
2530            PositionalToken {
2531                source: uws,
2532                offset: 0,
2533                length: 3,
2534                token: Token::Word(Word::Word("The".to_string())),
2535            },
2536            PositionalToken {
2537                source: uws,
2538                offset: 3,
2539                length: 1,
2540                token: Token::Special(Special::Separator(Separator::Space)),
2541            },
2542            PositionalToken {
2543                source: uws,
2544                offset: 4,
2545                length: 5,
2546                token: Token::Word(Word::Word("quick".to_string())),
2547            },
2548            PositionalToken {
2549                source: uws,
2550                offset: 9,
2551                length: 1,
2552                token: Token::Special(Special::Separator(Separator::Space)),
2553            },
2554            PositionalToken {
2555                source: uws,
2556                offset: 10,
2557                length: 1,
2558                token: Token::Special(Special::Punctuation('(')),
2559            },
2560            PositionalToken {
2561                source: uws,
2562                offset: 11,
2563                length: 1,
2564                token: Token::Special(Special::Punctuation('"')),
2565            },
2566            PositionalToken {
2567                source: uws,
2568                offset: 12,
2569                length: 5,
2570                token: Token::Word(Word::Word("brown".to_string())),
2571            },
2572            PositionalToken {
2573                source: uws,
2574                offset: 17,
2575                length: 1,
2576                token: Token::Special(Special::Punctuation('"')),
2577            },
2578            PositionalToken {
2579                source: uws,
2580                offset: 18,
2581                length: 1,
2582                token: Token::Special(Special::Punctuation(')')),
2583            },
2584            PositionalToken {
2585                source: uws,
2586                offset: 19,
2587                length: 1,
2588                token: Token::Special(Special::Separator(Separator::Space)),
2589            },
2590            PositionalToken {
2591                source: uws,
2592                offset: 20,
2593                length: 3,
2594                token: Token::Word(Word::Word("fox".to_string())),
2595            },
2596            PositionalToken {
2597                source: uws,
2598                offset: 23,
2599                length: 1,
2600                token: Token::Special(Special::Separator(Separator::Space)),
2601            },
2602            PositionalToken {
2603                source: uws,
2604                offset: 24,
2605                length: 5,
2606                token: Token::Word(Word::Word("can\'t".to_string())),
2607            },
2608            PositionalToken {
2609                source: uws,
2610                offset: 29,
2611                length: 1,
2612                token: Token::Special(Special::Separator(Separator::Space)),
2613            },
2614            PositionalToken {
2615                source: uws,
2616                offset: 30,
2617                length: 4,
2618                token: Token::Word(Word::Word("jump".to_string())),
2619            },
2620            PositionalToken {
2621                source: uws,
2622                offset: 34,
2623                length: 1,
2624                token: Token::Special(Special::Separator(Separator::Space)),
2625            },
2626            PositionalToken {
2627                source: uws,
2628                offset: 35,
2629                length: 4,
2630                token: Token::Word(Word::Number(Number::Float(32.3))),
2631            },
2632            PositionalToken {
2633                source: uws,
2634                offset: 39,
2635                length: 1,
2636                token: Token::Special(Special::Separator(Separator::Space)),
2637            },
2638            PositionalToken {
2639                source: uws,
2640                offset: 40,
2641                length: 4,
2642                token: Token::Word(Word::Word("feet".to_string())),
2643            },
2644            PositionalToken {
2645                source: uws,
2646                offset: 44,
2647                length: 1,
2648                token: Token::Special(Special::Punctuation(',')),
2649            },
2650            PositionalToken {
2651                source: uws,
2652                offset: 45,
2653                length: 1,
2654                token: Token::Special(Special::Separator(Separator::Space)),
2655            },
2656            PositionalToken {
2657                source: uws,
2658                offset: 46,
2659                length: 5,
2660                token: Token::Word(Word::Word("right".to_string())),
2661            },
2662            PositionalToken {
2663                source: uws,
2664                offset: 51,
2665                length: 1,
2666                token: Token::Special(Special::Punctuation('?')),
2667            },
2668            PositionalToken {
2669                source: uws,
2670                offset: 52,
2671                length: 1,
2672                token: Token::Special(Special::Separator(Separator::Space)),
2673            },
2674            PositionalToken {
2675                source: uws,
2676                offset: 53,
2677                length: 4,
2678                token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
2679            }, // TODO
2680            PositionalToken {
2681                source: uws,
2682                offset: 57,
2683                length: 1,
2684                token: Token::Special(Special::Separator(Separator::Space)),
2685            },
2686            PositionalToken {
2687                source: uws,
2688                offset: 58,
2689                length: 3,
2690                token: Token::Word(Word::Word("etc".to_string())),
2691            },
2692            PositionalToken {
2693                source: uws,
2694                offset: 61,
2695                length: 1,
2696                token: Token::Special(Special::Punctuation('.')),
2697            },
2698            PositionalToken {
2699                source: uws,
2700                offset: 62,
2701                length: 1,
2702                token: Token::Special(Special::Separator(Separator::Space)),
2703            },
2704            PositionalToken {
2705                source: uws,
2706                offset: 63,
2707                length: 3,
2708                token: Token::Word(Word::Word("qeq".to_string())),
2709            },
2710            PositionalToken {
2711                source: uws,
2712                offset: 66,
2713                length: 1,
2714                token: Token::Special(Special::Separator(Separator::Space)),
2715            },
2716            PositionalToken {
2717                source: uws,
2718                offset: 67,
2719                length: 5,
2720                token: Token::Word(Word::Word("U.S.A".to_string())),
2721            },
2722            PositionalToken {
2723                source: uws,
2724                offset: 72,
2725                length: 2,
2726                token: Token::Special(Special::Separator(Separator::Space)),
2727            },
2728            PositionalToken {
2729                source: uws,
2730                offset: 74,
2731                length: 3,
2732                token: Token::Word(Word::Word("asd".to_string())),
2733            },
2734            PositionalToken {
2735                source: uws,
2736                offset: 77,
2737                length: 3,
2738                token: Token::Special(Special::Separator(Separator::Newline)),
2739            },
2740            PositionalToken {
2741                source: uws,
2742                offset: 80,
2743                length: 3,
2744                token: Token::Word(Word::Word("Brr".to_string())),
2745            },
2746            PositionalToken {
2747                source: uws,
2748                offset: 83,
2749                length: 1,
2750                token: Token::Special(Special::Punctuation(',')),
2751            },
2752            PositionalToken {
2753                source: uws,
2754                offset: 84,
2755                length: 1,
2756                token: Token::Special(Special::Separator(Separator::Space)),
2757            },
2758            PositionalToken {
2759                source: uws,
2760                offset: 85,
2761                length: 4,
2762                token: Token::Word(Word::Word("it\'s".to_string())),
2763            },
2764            PositionalToken {
2765                source: uws,
2766                offset: 89,
2767                length: 1,
2768                token: Token::Special(Special::Separator(Separator::Space)),
2769            },
2770            PositionalToken {
2771                source: uws,
2772                offset: 90,
2773                length: 4,
2774                token: Token::Word(Word::Number(Number::Float(29.3))),
2775            },
2776            PositionalToken {
2777                source: uws,
2778                offset: 94,
2779                length: 2,
2780                token: Token::Special(Special::Symbol('°')),
2781            },
2782            PositionalToken {
2783                source: uws,
2784                offset: 96,
2785                length: 1,
2786                token: Token::Word(Word::Word("F".to_string())),
2787            },
2788            PositionalToken {
2789                source: uws,
2790                offset: 97,
2791                length: 1,
2792                token: Token::Special(Special::Punctuation('!')),
2793            },
2794            PositionalToken {
2795                source: uws,
2796                offset: 98,
2797                length: 1,
2798                token: Token::Special(Special::Separator(Separator::Newline)),
2799            },
2800            PositionalToken {
2801                source: uws,
2802                offset: 99,
2803                length: 1,
2804                token: Token::Special(Special::Separator(Separator::Space)),
2805            },
2806            PositionalToken {
2807                source: uws,
2808                offset: 100,
2809                length: 14,
2810                token: Token::Word(Word::Word("Русское".to_string())),
2811            },
2812            PositionalToken {
2813                source: uws,
2814                offset: 114,
2815                length: 1,
2816                token: Token::Special(Special::Separator(Separator::Space)),
2817            },
2818            PositionalToken {
2819                source: uws,
2820                offset: 115,
2821                length: 22,
2822                token: Token::Word(Word::Word("предложение".to_string())),
2823            },
2824            PositionalToken {
2825                source: uws,
2826                offset: 137,
2827                length: 1,
2828                token: Token::Special(Special::Separator(Separator::Space)),
2829            },
2830            PositionalToken {
2831                source: uws,
2832                offset: 138,
2833                length: 5,
2834                token: Token::Struct(Struct::Hashtag("36.6".to_string())),
2835            },
2836            PositionalToken {
2837                source: uws,
2838                offset: 143,
2839                length: 1,
2840                token: Token::Special(Special::Separator(Separator::Space)),
2841            },
2842            PositionalToken {
2843                source: uws,
2844                offset: 144,
2845                length: 6,
2846                token: Token::Word(Word::Word("для".to_string())),
2847            },
2848            PositionalToken {
2849                source: uws,
2850                offset: 150,
2851                length: 1,
2852                token: Token::Special(Special::Separator(Separator::Space)),
2853            },
2854            PositionalToken {
2855                source: uws,
2856                offset: 151,
2857                length: 24,
2858                token: Token::Word(Word::Word("тестирования".to_string())),
2859            },
2860            PositionalToken {
2861                source: uws,
2862                offset: 175,
2863                length: 1,
2864                token: Token::Special(Special::Separator(Separator::Space)),
2865            },
2866            PositionalToken {
2867                source: uws,
2868                offset: 176,
2869                length: 14,
2870                token: Token::Word(Word::Word("деления".to_string())),
2871            },
2872            PositionalToken {
2873                source: uws,
2874                offset: 190,
2875                length: 1,
2876                token: Token::Special(Special::Separator(Separator::Space)),
2877            },
2878            PositionalToken {
2879                source: uws,
2880                offset: 191,
2881                length: 4,
2882                token: Token::Word(Word::Word("по".to_string())),
2883            },
2884            PositionalToken {
2885                source: uws,
2886                offset: 195,
2887                length: 1,
2888                token: Token::Special(Special::Separator(Separator::Space)),
2889            },
2890            PositionalToken {
2891                source: uws,
2892                offset: 196,
2893                length: 12,
2894                token: Token::Word(Word::Word("юникод".to_string())),
2895            },
2896            PositionalToken {
2897                source: uws,
2898                offset: 208,
2899                length: 1,
2900                token: Token::Special(Special::Punctuation('-')),
2901            },
2902            PositionalToken {
2903                source: uws,
2904                offset: 209,
2905                length: 12,
2906                token: Token::Word(Word::Word("словам".to_string())),
2907            },
2908            PositionalToken {
2909                source: uws,
2910                offset: 221,
2911                length: 3,
2912                token: Token::Special(Special::Punctuation('.')),
2913            },
2914            PositionalToken {
2915                source: uws,
2916                offset: 224,
2917                length: 1,
2918                token: Token::Special(Special::Separator(Separator::Newline)),
2919            },
2920        ];
2921        let lib_res = uws
2922            .into_tokenizer(TokenizerParams::complex())
2923            .collect::<Vec<_>>();
2924        check_results(&result, &lib_res, uws);
2925    }
2926
2927    #[test]
2928    fn plus_minus() {
2929        let uws = "+23 -4.5 -34 +25.7 - 2 + 5.6";
2930        let result = vec![
2931            PositionalToken {
2932                source: uws,
2933                offset: 0,
2934                length: 3,
2935                token: Token::Word(Word::Number(Number::Integer(23))),
2936            },
2937            PositionalToken {
2938                source: uws,
2939                offset: 3,
2940                length: 1,
2941                token: Token::Special(Special::Separator(Separator::Space)),
2942            },
2943            PositionalToken {
2944                source: uws,
2945                offset: 4,
2946                length: 4,
2947                token: Token::Word(Word::Number(Number::Float(-4.5))),
2948            },
2949            PositionalToken {
2950                source: uws,
2951                offset: 8,
2952                length: 1,
2953                token: Token::Special(Special::Separator(Separator::Space)),
2954            },
2955            PositionalToken {
2956                source: uws,
2957                offset: 9,
2958                length: 3,
2959                token: Token::Word(Word::Number(Number::Integer(-34))),
2960            },
2961            PositionalToken {
2962                source: uws,
2963                offset: 12,
2964                length: 1,
2965                token: Token::Special(Special::Separator(Separator::Space)),
2966            },
2967            PositionalToken {
2968                source: uws,
2969                offset: 13,
2970                length: 5,
2971                token: Token::Word(Word::Number(Number::Float(25.7))),
2972            },
2973            PositionalToken {
2974                source: uws,
2975                offset: 18,
2976                length: 1,
2977                token: Token::Special(Special::Separator(Separator::Space)),
2978            },
2979            PositionalToken {
2980                source: uws,
2981                offset: 19,
2982                length: 1,
2983                token: Token::Special(Special::Punctuation('-')),
2984            },
2985            PositionalToken {
2986                source: uws,
2987                offset: 20,
2988                length: 1,
2989                token: Token::Special(Special::Separator(Separator::Space)),
2990            },
2991            PositionalToken {
2992                source: uws,
2993                offset: 21,
2994                length: 1,
2995                token: Token::Word(Word::Number(Number::Integer(2))),
2996            },
2997            PositionalToken {
2998                source: uws,
2999                offset: 22,
3000                length: 1,
3001                token: Token::Special(Special::Separator(Separator::Space)),
3002            },
3003            PositionalToken {
3004                source: uws,
3005                offset: 23,
3006                length: 1,
3007                token: Token::Special(Special::Punctuation('+')),
3008            },
3009            PositionalToken {
3010                source: uws,
3011                offset: 24,
3012                length: 1,
3013                token: Token::Special(Special::Separator(Separator::Space)),
3014            },
3015            PositionalToken {
3016                source: uws,
3017                offset: 25,
3018                length: 3,
3019                token: Token::Word(Word::Number(Number::Float(5.6))),
3020            },
3021        ];
3022        let lib_res = uws
3023            .into_tokenizer(TokenizerParams::v1())
3024            .collect::<Vec<_>>();
3025        check(&result, &lib_res, uws);
3026        //print_result(&lib_res); panic!("")
3027    }
3028
3029    #[test]
3030    #[ignore]
3031    fn woman_bouncing_ball() {
3032        let uws = "\u{26f9}\u{200d}\u{2640}";
3033        let result = vec![PositionalToken {
3034            source: uws,
3035            offset: 0,
3036            length: 9,
3037            token: Token::Word(Word::Emoji("woman_bouncing_ball")),
3038        }];
3039        let lib_res = uws
3040            .into_tokenizer(TokenizerParams::v1())
3041            .collect::<Vec<_>>();
3042        check_results(&result, &lib_res, uws);
3043        //print_result(&lib_res); panic!("")
3044    }
3045
3046    #[test]
3047    fn emoji_and_rusabbr_default() {
3048        let uws = "🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n👱\nС.С.С.Р.\n👨‍👩‍👦‍👦\n🧠\n";
3049        let result = vec![
3050            PositionalToken {
3051                source: uws,
3052                offset: 0,
3053                length: 8,
3054                token: Token::Word(Word::Emoji("russia")),
3055            },
3056            PositionalToken {
3057                source: uws,
3058                offset: 8,
3059                length: 1,
3060                token: Token::Special(Special::Separator(Separator::Space)),
3061            },
3062            PositionalToken {
3063                source: uws,
3064                offset: 9,
3065                length: 8,
3066                token: Token::Word(Word::Emoji("sao_tome_and_principe")),
3067            },
3068            PositionalToken {
3069                source: uws,
3070                offset: 17,
3071                length: 1,
3072                token: Token::Special(Special::Separator(Separator::Newline)),
3073            },
3074            PositionalToken {
3075                source: uws,
3076                offset: 18,
3077                length: 8,
3078                token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
3079            },
3080            PositionalToken {
3081                source: uws,
3082                offset: 26,
3083                length: 8,
3084                token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
3085            },
3086            PositionalToken {
3087                source: uws,
3088                offset: 34,
3089                length: 8,
3090                token: Token::Word(Word::Emoji("man_medium_skin_tone")),
3091            },
3092            PositionalToken {
3093                source: uws,
3094                offset: 42,
3095                length: 1,
3096                token: Token::Special(Special::Separator(Separator::Newline)),
3097            },
3098            PositionalToken {
3099                source: uws,
3100                offset: 43,
3101                length: 4,
3102                token: Token::Word(Word::Emoji("blond_haired_person")),
3103            },
3104            PositionalToken {
3105                source: uws,
3106                offset: 47,
3107                length: 1,
3108                token: Token::Special(Special::Separator(Separator::Newline)),
3109            },
3110            PositionalToken {
3111                source: uws,
3112                offset: 48,
3113                length: 2,
3114                token: Token::Word(Word::Word("С".to_string())),
3115            },
3116            PositionalToken {
3117                source: uws,
3118                offset: 50,
3119                length: 1,
3120                token: Token::Special(Special::Punctuation('.')),
3121            },
3122            PositionalToken {
3123                source: uws,
3124                offset: 51,
3125                length: 2,
3126                token: Token::Word(Word::Word("С".to_string())),
3127            },
3128            PositionalToken {
3129                source: uws,
3130                offset: 53,
3131                length: 1,
3132                token: Token::Special(Special::Punctuation('.')),
3133            },
3134            PositionalToken {
3135                source: uws,
3136                offset: 54,
3137                length: 2,
3138                token: Token::Word(Word::Word("С".to_string())),
3139            },
3140            PositionalToken {
3141                source: uws,
3142                offset: 56,
3143                length: 1,
3144                token: Token::Special(Special::Punctuation('.')),
3145            },
3146            PositionalToken {
3147                source: uws,
3148                offset: 57,
3149                length: 2,
3150                token: Token::Word(Word::Word("Р".to_string())),
3151            },
3152            PositionalToken {
3153                source: uws,
3154                offset: 59,
3155                length: 1,
3156                token: Token::Special(Special::Punctuation('.')),
3157            },
3158            PositionalToken {
3159                source: uws,
3160                offset: 60,
3161                length: 1,
3162                token: Token::Special(Special::Separator(Separator::Newline)),
3163            },
3164            PositionalToken {
3165                source: uws,
3166                offset: 61,
3167                length: 25,
3168                token: Token::Word(Word::Emoji("family_man_woman_boy_boy")),
3169            },
3170            PositionalToken {
3171                source: uws,
3172                offset: 86,
3173                length: 1,
3174                token: Token::Special(Special::Separator(Separator::Newline)),
3175            },
3176            PositionalToken {
3177                source: uws,
3178                offset: 87,
3179                length: 4,
3180                token: Token::Word(Word::Emoji("brain")),
3181            },
3182            PositionalToken {
3183                source: uws,
3184                offset: 91,
3185                length: 1,
3186                token: Token::Special(Special::Separator(Separator::Newline)),
3187            },
3188        ];
3189
3190        let lib_res = uws
3191            .into_tokenizer(TokenizerParams::v1())
3192            .collect::<Vec<_>>();
3193        check_results(&result, &lib_res, uws);
3194        //print_result(&lib_res); panic!();
3195    }
3196
3197    #[test]
3198    fn emoji_and_rusabbr_no_split() {
3199        let uws = "🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n👱\nС.С.С.Р.\n👨‍👩‍👦‍👦\n🧠\n";
3200        let result = vec![
3201            PositionalToken {
3202                source: uws,
3203                offset: 0,
3204                length: 8,
3205                token: Token::Word(Word::Emoji("russia")),
3206            },
3207            PositionalToken {
3208                source: uws,
3209                offset: 8,
3210                length: 1,
3211                token: Token::Special(Special::Separator(Separator::Space)),
3212            },
3213            PositionalToken {
3214                source: uws,
3215                offset: 9,
3216                length: 8,
3217                token: Token::Word(Word::Emoji("sao_tome_and_principe")),
3218            },
3219            PositionalToken {
3220                source: uws,
3221                offset: 17,
3222                length: 1,
3223                token: Token::Special(Special::Separator(Separator::Newline)),
3224            },
3225            PositionalToken {
3226                source: uws,
3227                offset: 18,
3228                length: 8,
3229                token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
3230            },
3231            PositionalToken {
3232                source: uws,
3233                offset: 26,
3234                length: 8,
3235                token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
3236            },
3237            PositionalToken {
3238                source: uws,
3239                offset: 34,
3240                length: 8,
3241                token: Token::Word(Word::Emoji("man_medium_skin_tone")),
3242            },
3243            PositionalToken {
3244                source: uws,
3245                offset: 42,
3246                length: 1,
3247                token: Token::Special(Special::Separator(Separator::Newline)),
3248            },
3249            PositionalToken {
3250                source: uws,
3251                offset: 43,
3252                length: 4,
3253                token: Token::Word(Word::Emoji("blond_haired_person")),
3254            },
3255            PositionalToken {
3256                source: uws,
3257                offset: 47,
3258                length: 1,
3259                token: Token::Special(Special::Separator(Separator::Newline)),
3260            },
3261            PositionalToken {
3262                source: uws,
3263                offset: 48,
3264                length: 11,
3265                token: Token::Word(Word::Word("С.С.С.Р".to_string())),
3266            },
3267            PositionalToken {
3268                source: uws,
3269                offset: 59,
3270                length: 1,
3271                token: Token::Special(Special::Punctuation('.')),
3272            },
3273            PositionalToken {
3274                source: uws,
3275                offset: 60,
3276                length: 1,
3277                token: Token::Special(Special::Separator(Separator::Newline)),
3278            },
3279            PositionalToken {
3280                source: uws,
3281                offset: 61,
3282                length: 25,
3283                token: Token::Word(Word::Emoji("family_man_woman_boy_boy")),
3284            },
3285            PositionalToken {
3286                source: uws,
3287                offset: 86,
3288                length: 1,
3289                token: Token::Special(Special::Separator(Separator::Newline)),
3290            },
3291            PositionalToken {
3292                source: uws,
3293                offset: 87,
3294                length: 4,
3295                token: Token::Word(Word::Emoji("brain")),
3296            },
3297            PositionalToken {
3298                source: uws,
3299                offset: 91,
3300                length: 1,
3301                token: Token::Special(Special::Separator(Separator::Newline)),
3302            },
3303        ];
3304
3305        let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
3306        check_results(&result, &lib_res, uws);
3307        //print_result(&lib_res); panic!();
3308    }
3309
3310    /*#[test]
3311    fn hashtags_mentions_urls() {
3312        let uws = "\nSome ##text with #hashtags and @other components\nadfa wdsfdf asdf asd http://asdfasdfsd.com/fasdfd/sadfsadf/sdfas/12312_12414/asdf?fascvx=fsfwer&dsdfasdf=fasdf#fasdf asdfa sdfa sdf\nasdfas df asd who@bla-bla.com asdfas df asdfsd\n";
3313        let result = vec![
3314            PositionalToken { source: uws, offset: 0, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
3315            PositionalToken { source: uws, offset: 1, length: 4, token: Token::Word(Word::Word("Some".to_string())) },
3316            PositionalToken { source: uws, offset: 5, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3317            PositionalToken { source: uws, offset: 6, length: 2, token: Token::Special(Special::Punctuation("##".to_string())) },
3318            PositionalToken { source: uws, offset: 8, length: 4, token: Token::Word(Word::Word("text".to_string())) },
3319            PositionalToken { source: uws, offset: 12, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3320            PositionalToken { source: uws, offset: 13, length: 4, token: Token::Word(Word::Word("with".to_string())) },
3321            PositionalToken { source: uws, offset: 17, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3322            PositionalToken { source: uws, offset: 18, length: 9, token: Token::Struct(Struct::Hashtag("hashtags".to_string())) },
3323            PositionalToken { source: uws, offset: 27, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3324            PositionalToken { source: uws, offset: 28, length: 3, token: Token::Word(Word::Word("and".to_string())) },
3325            PositionalToken { source: uws, offset: 31, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3326            PositionalToken { source: uws, offset: 32, length: 6, token: Token::Struct(Struct::Mention("other".to_string())) },
3327            PositionalToken { source: uws, offset: 38, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3328            PositionalToken { source: uws, offset: 39, length: 10, token: Token::Word(Word::Word("components".to_string())) },
3329            PositionalToken { source: uws, offset: 49, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
3330            PositionalToken { source: uws, offset: 50, length: 4, token: Token::Word(Word::Word("adfa".to_string())) },
3331            PositionalToken { source: uws, offset: 54, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3332            PositionalToken { source: uws, offset: 55, length: 6, token: Token::Word(Word::Word("wdsfdf".to_string())) },
3333            PositionalToken { source: uws, offset: 61, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3334            PositionalToken { source: uws, offset: 62, length: 4, token: Token::Word(Word::Word("asdf".to_string())) },
3335            PositionalToken { source: uws, offset: 66, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3336            PositionalToken { source: uws, offset: 67, length: 3, token: Token::Word(Word::Word("asd".to_string())) },
3337            PositionalToken { source: uws, offset: 70, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3338            PositionalToken { source: uws, offset: 71, length: 95, token: Token::Struct(Struct::Url("http://asdfasdfsd.com/fasdfd/sadfsadf/sdfas/12312_12414/asdf?fascvx=fsfwer&dsdfasdf=fasdf#fasdf".to_string())) },
3339            PositionalToken { source: uws, offset: 166, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3340            PositionalToken { source: uws, offset: 167, length: 5, token: Token::Word(Word::Word("asdfa".to_string())) },
3341            PositionalToken { source: uws, offset: 172, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3342            PositionalToken { source: uws, offset: 173, length: 4, token: Token::Word(Word::Word("sdfa".to_string())) },
3343            PositionalToken { source: uws, offset: 177, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3344            PositionalToken { source: uws, offset: 178, length: 3, token: Token::Word(Word::Word("sdf".to_string())) },
3345            PositionalToken { source: uws, offset: 181, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
3346            PositionalToken { source: uws, offset: 182, length: 6, token: Token::Word(Word::Word("asdfas".to_string())) },
3347            PositionalToken { source: uws, offset: 188, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3348            PositionalToken { source: uws, offset: 189, length: 2, token: Token::Word(Word::Word("df".to_string())) },
3349            PositionalToken { source: uws, offset: 191, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3350            PositionalToken { source: uws, offset: 192, length: 3, token: Token::Word(Word::Word("asd".to_string())) },
3351            PositionalToken { source: uws, offset: 195, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3352            PositionalToken { source: uws, offset: 196, length: 3, token: Token::Word(Word::Word("who".to_string())) },
3353            PositionalToken { source: uws, offset: 199, length: 4, token: Token::Struct(Struct::Mention("bla".to_string())) },
3354            PositionalToken { source: uws, offset: 203, length: 1, token: Token::Special(Special::Punctuation('-')) },
3355            PositionalToken { source: uws, offset: 204, length: 7, token: Token::Word(Word::Word("bla.com".to_string())) },
3356            PositionalToken { source: uws, offset: 211, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3357            PositionalToken { source: uws, offset: 212, length: 6, token: Token::Word(Word::Word("asdfas".to_string())) },
3358            PositionalToken { source: uws, offset: 218, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3359            PositionalToken { source: uws, offset: 219, length: 2, token: Token::Word(Word::Word("df".to_string())) },
3360            PositionalToken { source: uws, offset: 221, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3361            PositionalToken { source: uws, offset: 222, length: 6, token: Token::Word(Word::Word("asdfsd".to_string())) },
3362            PositionalToken { source: uws, offset: 228, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
3363            ];
3364        let lib_res = uws.into_tokenizer(TokenizerParams::complex()).collect::<Vec<_>>();
3365        check_results(&result,&lib_res,uws);
3366        //print_result(&lib_res); panic!("")
3367    }*/
3368
3369    /*#[test]
3370    fn bb_code() {
3371        let uws = "[Oxana Putan|1712640565] shared a [post|100001150683379_1873048549410150]. \nAndrew\n[link|https://www.facebook.com/100001150683379/posts/1873048549410150]\nДрузья мои, издатели, редакторы, просветители, культуртрегеры, субъекты мирового рынка и ту хум ит ещё мей консёрн.\nНа текущий момент я лишен былой подвижности, хоть и ковыляю по больничных коридорам по разным нуждам и за кипятком.\nВрачи обещают мне заживление отверстых ран моих в течение полугода и на этот период можно предполагать с уверенностью преимущественно домашний образ жизни.\n[|]";
3372        let result = vec![
3373            PositionalToken { offset: 0, length: 24, token: Token::BBCode { left: vec![
3374                PositionalToken { offset: 1, length: 5, token: Token::Word(Word::Word("Oxana".to_string())) },
3375                PositionalToken { offset: 6, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3376                PositionalToken { offset: 7, length: 5, token: Token::Word(Word::Word("Putan".to_string())) },
3377                ], right: vec![
3378                PositionalToken { offset: 13, length: 10, token: Token::Word(Word::Number(Number::Integer(1712640565))) },
3379                ] } },
3380            PositionalToken { offset: 24, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3381            PositionalToken { offset: 25, length: 6, token: Token::Word(Word::Word("shared".to_string())) },
3382            PositionalToken { offset: 31, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3383            PositionalToken { offset: 32, length: 1, token: Token::Word(Word::Word("a".to_string())) },
3384            PositionalToken { offset: 33, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3385            PositionalToken { offset: 34, length: 39, token: Token::BBCode { left: vec![
3386                PositionalToken { offset: 35, length: 4, token: Token::Word(Word::Word("post".to_string())) },
3387                ], right: vec![
3388                PositionalToken { offset: 40, length: 32, token: Token::Word(Word::Numerical(Numerical::Alphanumeric("100001150683379_1873048549410150".to_string()))) },
3389                ] } },
3390            PositionalToken { offset: 73, length: 1, token: Token::Special(Special::Punctuation('.')) },
3391            PositionalToken { offset: 74, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3392            PositionalToken { offset: 75, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
3393            PositionalToken { offset: 76, length: 6, token: Token::Word(Word::Word("Andrew".to_string())) },
3394            PositionalToken { offset: 82, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
3395            PositionalToken { offset: 83, length: 70, token: Token::BBCode { left: vec![
3396                PositionalToken { offset: 84, length: 4, token: Token::Word(Word::Word("link".to_string())) },
3397                ], right: vec![
3398                PositionalToken { offset: 89, length: 63, token: Token::Struct(Struct::Url("https://www.facebook.com/100001150683379/posts/1873048549410150".to_string())) },
3399                ] } },
3400            PositionalToken { offset: 153, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
3401            PositionalToken { offset: 154, length: 12, token: Token::Word(Word::Word("Друзья".to_string())) },
3402            PositionalToken { offset: 166, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3403            PositionalToken { offset: 167, length: 6, token: Token::Word(Word::Word("мои".to_string())) },
3404            PositionalToken { offset: 173, length: 1, token: Token::Special(Special::Punctuation(',')) },
3405            PositionalToken { offset: 174, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3406            PositionalToken { offset: 175, length: 16, token: Token::Word(Word::Word("издатели".to_string())) },
3407            PositionalToken { offset: 191, length: 1, token: Token::Special(Special::Punctuation(',')) },
3408            PositionalToken { offset: 192, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3409            PositionalToken { offset: 193, length: 18, token: Token::Word(Word::Word("редакторы".to_string())) },
3410            PositionalToken { offset: 211, length: 1, token: Token::Special(Special::Punctuation(',')) },
3411            PositionalToken { offset: 212, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3412            PositionalToken { offset: 213, length: 24, token: Token::Word(Word::Word("просветители".to_string())) },
3413            PositionalToken { offset: 237, length: 1, token: Token::Special(Special::Punctuation(',')) },
3414            PositionalToken { offset: 238, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3415            PositionalToken { offset: 239, length: 28, token: Token::Word(Word::Word("культуртрегеры".to_string())) },
3416            PositionalToken { offset: 267, length: 1, token: Token::Special(Special::Punctuation(',')) },
3417            PositionalToken { offset: 268, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3418            PositionalToken { offset: 269, length: 16, token: Token::Word(Word::Word("субъекты".to_string())) },
3419            PositionalToken { offset: 285, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3420            PositionalToken { offset: 286, length: 16, token: Token::Word(Word::Word("мирового".to_string())) },
3421            PositionalToken { offset: 302, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3422            PositionalToken { offset: 303, length: 10, token: Token::Word(Word::Word("рынка".to_string())) },
3423            PositionalToken { offset: 313, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3424            PositionalToken { offset: 314, length: 2, token: Token::Word(Word::Word("и".to_string())) },
3425            PositionalToken { offset: 316, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3426            PositionalToken { offset: 317, length: 4, token: Token::Word(Word::Word("ту".to_string())) },
3427            PositionalToken { offset: 321, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3428            PositionalToken { offset: 322, length: 6, token: Token::Word(Word::Word("хум".to_string())) },
3429            PositionalToken { offset: 328, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3430            PositionalToken { offset: 329, length: 4, token: Token::Word(Word::Word("ит".to_string())) },
3431            PositionalToken { offset: 333, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3432            PositionalToken { offset: 334, length: 6, token: Token::Word(Word::Word("ещё".to_string())) },
3433            PositionalToken { offset: 340, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3434            PositionalToken { offset: 341, length: 6, token: Token::Word(Word::Word("мей".to_string())) },
3435            PositionalToken { offset: 347, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3436            PositionalToken { offset: 348, length: 14, token: Token::Word(Word::Word("консёрн".to_string())) },
3437            PositionalToken { offset: 362, length: 1, token: Token::Special(Special::Punctuation('.')) },
3438            PositionalToken { offset: 363, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
3439            PositionalToken { offset: 364, length: 4, token: Token::Word(Word::Word("На".to_string())) },
3440            PositionalToken { offset: 368, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3441            PositionalToken { offset: 369, length: 14, token: Token::Word(Word::Word("текущий".to_string())) },
3442            PositionalToken { offset: 383, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3443            PositionalToken { offset: 384, length: 12, token: Token::Word(Word::Word("момент".to_string())) },
3444            PositionalToken { offset: 396, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3445            PositionalToken { offset: 397, length: 2, token: Token::Word(Word::Word("я".to_string())) },
3446            PositionalToken { offset: 399, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3447            PositionalToken { offset: 400, length: 10, token: Token::Word(Word::Word("лишен".to_string())) },
3448            PositionalToken { offset: 410, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3449            PositionalToken { offset: 411, length: 10, token: Token::Word(Word::Word("былой".to_string())) },
3450            PositionalToken { offset: 421, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3451            PositionalToken { offset: 422, length: 22, token: Token::Word(Word::Word("подвижности".to_string())) },
3452            PositionalToken { offset: 444, length: 1, token: Token::Special(Special::Punctuation(',')) },
3453            PositionalToken { offset: 445, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3454            PositionalToken { offset: 446, length: 8, token: Token::Word(Word::Word("хоть".to_string())) },
3455            PositionalToken { offset: 454, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3456            PositionalToken { offset: 455, length: 2, token: Token::Word(Word::Word("и".to_string())) },
3457            PositionalToken { offset: 457, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3458            PositionalToken { offset: 458, length: 14, token: Token::Word(Word::Word("ковыляю".to_string())) },
3459            PositionalToken { offset: 472, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3460            PositionalToken { offset: 473, length: 4, token: Token::Word(Word::Word("по".to_string())) },
3461            PositionalToken { offset: 477, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3462            PositionalToken { offset: 478, length: 20, token: Token::Word(Word::Word("больничных".to_string())) },
3463            PositionalToken { offset: 498, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3464            PositionalToken { offset: 499, length: 18, token: Token::Word(Word::Word("коридорам".to_string())) },
3465            PositionalToken { offset: 517, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3466            PositionalToken { offset: 518, length: 4, token: Token::Word(Word::Word("по".to_string())) },
3467            PositionalToken { offset: 522, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3468            PositionalToken { offset: 523, length: 12, token: Token::Word(Word::Word("разным".to_string())) },
3469            PositionalToken { offset: 535, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3470            PositionalToken { offset: 536, length: 12, token: Token::Word(Word::Word("нуждам".to_string())) },
3471            PositionalToken { offset: 548, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3472            PositionalToken { offset: 549, length: 2, token: Token::Word(Word::Word("и".to_string())) },
3473            PositionalToken { offset: 551, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3474            PositionalToken { offset: 552, length: 4, token: Token::Word(Word::Word("за".to_string())) },
3475            PositionalToken { offset: 556, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3476            PositionalToken { offset: 557, length: 16, token: Token::Word(Word::Word("кипятком".to_string())) },
3477            PositionalToken { offset: 573, length: 1, token: Token::Special(Special::Punctuation('.')) },
3478            PositionalToken { offset: 574, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
3479            PositionalToken { offset: 575, length: 10, token: Token::Word(Word::Word("Врачи".to_string())) },
3480            PositionalToken { offset: 585, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3481            PositionalToken { offset: 586, length: 14, token: Token::Word(Word::Word("обещают".to_string())) },
3482            PositionalToken { offset: 600, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3483            PositionalToken { offset: 601, length: 6, token: Token::Word(Word::Word("мне".to_string())) },
3484            PositionalToken { offset: 607, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3485            PositionalToken { offset: 608, length: 20, token: Token::Word(Word::Word("заживление".to_string())) },
3486            PositionalToken { offset: 628, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3487            PositionalToken { offset: 629, length: 18, token: Token::Word(Word::Word("отверстых".to_string())) },
3488            PositionalToken { offset: 647, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3489            PositionalToken { offset: 648, length: 6, token: Token::Word(Word::Word("ран".to_string())) },
3490            PositionalToken { offset: 654, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3491            PositionalToken { offset: 655, length: 8, token: Token::Word(Word::Word("моих".to_string())) },
3492            PositionalToken { offset: 663, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3493            PositionalToken { offset: 664, length: 2, token: Token::Word(Word::Word("в".to_string())) },
3494            PositionalToken { offset: 666, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3495            PositionalToken { offset: 667, length: 14, token: Token::Word(Word::Word("течение".to_string())) },
3496            PositionalToken { offset: 681, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3497            PositionalToken { offset: 682, length: 16, token: Token::Word(Word::Word("полугода".to_string())) },
3498            PositionalToken { offset: 698, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3499            PositionalToken { offset: 699, length: 2, token: Token::Word(Word::Word("и".to_string())) },
3500            PositionalToken { offset: 701, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3501            PositionalToken { offset: 702, length: 4, token: Token::Word(Word::Word("на".to_string())) },
3502            PositionalToken { offset: 706, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3503            PositionalToken { offset: 707, length: 8, token: Token::Word(Word::Word("этот".to_string())) },
3504            PositionalToken { offset: 715, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3505            PositionalToken { offset: 716, length: 12, token: Token::Word(Word::Word("период".to_string())) },
3506            PositionalToken { offset: 728, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3507            PositionalToken { offset: 729, length: 10, token: Token::Word(Word::Word("можно".to_string())) },
3508            PositionalToken { offset: 739, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3509            PositionalToken { offset: 740, length: 24, token: Token::Word(Word::Word("предполагать".to_string())) },
3510            PositionalToken { offset: 764, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3511            PositionalToken { offset: 765, length: 2, token: Token::Word(Word::Word("с".to_string())) },
3512            PositionalToken { offset: 767, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3513            PositionalToken { offset: 768, length: 24, token: Token::Word(Word::Word("уверенностью".to_string())) },
3514            PositionalToken { offset: 792, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3515            PositionalToken { offset: 793, length: 30, token: Token::Word(Word::Word("преимущественно".to_string())) },
3516            PositionalToken { offset: 823, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3517            PositionalToken { offset: 824, length: 16, token: Token::Word(Word::Word("домашний".to_string())) },
3518            PositionalToken { offset: 840, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3519            PositionalToken { offset: 841, length: 10, token: Token::Word(Word::Word("образ".to_string())) },
3520            PositionalToken { offset: 851, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3521            PositionalToken { offset: 852, length: 10, token: Token::Word(Word::Word("жизни".to_string())) },
3522            PositionalToken { offset: 862, length: 1, token: Token::Special(Special::Punctuation('.')) },
3523            PositionalToken { offset: 863, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
3524            PositionalToken { offset: 864, length: 3, token: Token::BBCode { left: vec![
3525                ], right: vec![
3526                ] } },
3527            ];
3528        let lib_res = uws.into_tokenizer(TokenizerParams::complex()).collect::<Vec<_>>();
3529        //print_result(&lib_res); panic!("");
3530        check_results(&result,&lib_res,uws);
3531    }*/
3532
3533    #[test]
3534    fn html() {
3535        let uws = "<div class=\"article article_view \" id=\"article_view_-113039156_9551\" data-article-url=\"/@chaibuket-o-chem-ne-zabyt-25-noyabrya\" data-audio-context=\"article:-113039156_9551\"><h1  class=\"article_decoration_first article_decoration_last\" >День Мамы </h1><p  class=\"article_decoration_first article_decoration_last\" >День, когда поздравляют мам, бабушек, сестер и жён — это всемирный праздник, называемый «День Мамы». В настоящее время его отмечают почти в каждой стране, просто везде разные даты и способы празднования. </p><h3  class=\"article_decoration_first article_decoration_last\" ><span class='article_anchor_title'>\n  <span class='article_anchor_button' id='pochemu-my-ego-prazdnuem'></span>\n  <span class='article_anchor_fsymbol'>П</span>\n</span>ПОЧЕМУ МЫ ЕГО ПРАЗДНУЕМ</h3><p  class=\"article_decoration_first article_decoration_last article_decoration_before\" >В 1987 году комитет госдумы по делам женщин, семьи и молодежи выступил с предложением учредить «День мамы», а сам приказ был подписан уже 30 января 1988 года Борисом Ельциным. Было решено, что ежегодно в России празднество дня мамы будет выпадать на последнее воскресенье ноября. 
</p><figure data-type=\"101\" data-mode=\"\"  class=\"article_decoration_first article_decoration_last\" >\n  <div class=\"article_figure_content\" style=\"width: 1125px\">\n    <div class=\"article_figure_sizer_content\"><div class=\"article_object_sizer_wrap\" data-sizes=\"[{&quot;s&quot;:[&quot;https://pp.userapi.com/c849128/v849128704/c0ffd/pcNJaBH3NDo.jpg&quot;,75,50],&quot;m&quot;:[&quot;https://pp.userapi.com/c849128/v849128704/c0ffe/ozCLs2kHtRY.jpg&quot;,130,87],&quot;x&quot;:[&quot;https://pp.userapi.com/c849128/v849128704/c0fff/E4KtTNDydzE.jpg&quot;,604,403],&quot;y&quot;:[&quot;https://pp.userapi.com/c849128/v849128704/c1000/1nLxpYKavzU.jpg&quot;,807,538],&quot;z&quot;:[&quot;https://pp.userapi.com/c849128/v849128704/c1001/IgEODe90yEk.jpg&quot;,1125,750],&quot;o&quot;:[&quot;https://pp.userapi.com/c849128/v849128704/c1002/01faNwVZ2_E.jpg&quot;,130,87],&quot;p&quot;:[&quot;https://pp.userapi.com/c849128/v849128704/c1003/baDFzbdRP2s.jpg&quot;,200,133],&quot;q&quot;:[&quot;https://pp.userapi.com/c849128/v849128704/c1004/CY4khI6KJKA.jpg&quot;,320,213],&quot;r&quot;:[&quot;https://pp.userapi.com/c849128/v849128704/c1005/NOvAJ6-VltY.jpg&quot;,510,340]}]\">\n  <img class=\"article_object_sizer_inner article_object_photo__image_blur\" src=\"https://pp.userapi.com/c849128/v849128704/c0ffd/pcNJaBH3NDo.jpg\" data-baseurl=\"\"/>\n  \n</div></div>\n    <div class=\"article_figure_sizer\" style=\"padding-bottom: 66.666666666667%\"></div>";
3536        let result = vec![
3537            PositionalToken {
3538                source: uws,
3539                offset: 236,
3540                length: 8,
3541                token: Token::Word(Word::Word("День".to_string())),
3542            },
3543            PositionalToken {
3544                source: uws,
3545                offset: 244,
3546                length: 1,
3547                token: Token::Special(Special::Separator(Separator::Space)),
3548            },
3549            PositionalToken {
3550                source: uws,
3551                offset: 245,
3552                length: 8,
3553                token: Token::Word(Word::Word("Мамы".to_string())),
3554            },
3555            PositionalToken {
3556                source: uws,
3557                offset: 253,
3558                length: 1,
3559                token: Token::Special(Special::Separator(Separator::Space)),
3560            },
3561            PositionalToken {
3562                source: uws,
3563                offset: 321,
3564                length: 8,
3565                token: Token::Word(Word::Word("День".to_string())),
3566            },
3567            PositionalToken {
3568                source: uws,
3569                offset: 329,
3570                length: 1,
3571                token: Token::Special(Special::Punctuation(',')),
3572            },
3573            PositionalToken {
3574                source: uws,
3575                offset: 330,
3576                length: 1,
3577                token: Token::Special(Special::Separator(Separator::Space)),
3578            },
3579            PositionalToken {
3580                source: uws,
3581                offset: 331,
3582                length: 10,
3583                token: Token::Word(Word::Word("когда".to_string())),
3584            },
3585            PositionalToken {
3586                source: uws,
3587                offset: 341,
3588                length: 1,
3589                token: Token::Special(Special::Separator(Separator::Space)),
3590            },
3591            PositionalToken {
3592                source: uws,
3593                offset: 342,
3594                length: 22,
3595                token: Token::Word(Word::Word("поздравляют".to_string())),
3596            },
3597            PositionalToken {
3598                source: uws,
3599                offset: 364,
3600                length: 1,
3601                token: Token::Special(Special::Separator(Separator::Space)),
3602            },
3603            PositionalToken {
3604                source: uws,
3605                offset: 365,
3606                length: 6,
3607                token: Token::Word(Word::Word("мам".to_string())),
3608            },
3609            PositionalToken {
3610                source: uws,
3611                offset: 371,
3612                length: 1,
3613                token: Token::Special(Special::Punctuation(',')),
3614            },
3615            PositionalToken {
3616                source: uws,
3617                offset: 372,
3618                length: 1,
3619                token: Token::Special(Special::Separator(Separator::Space)),
3620            },
3621            PositionalToken {
3622                source: uws,
3623                offset: 373,
3624                length: 14,
3625                token: Token::Word(Word::Word("бабушек".to_string())),
3626            },
3627            PositionalToken {
3628                source: uws,
3629                offset: 387,
3630                length: 1,
3631                token: Token::Special(Special::Punctuation(',')),
3632            },
3633            PositionalToken {
3634                source: uws,
3635                offset: 388,
3636                length: 1,
3637                token: Token::Special(Special::Separator(Separator::Space)),
3638            },
3639            PositionalToken {
3640                source: uws,
3641                offset: 389,
3642                length: 12,
3643                token: Token::Word(Word::Word("сестер".to_string())),
3644            },
3645            PositionalToken {
3646                source: uws,
3647                offset: 401,
3648                length: 1,
3649                token: Token::Special(Special::Separator(Separator::Space)),
3650            },
3651            PositionalToken {
3652                source: uws,
3653                offset: 402,
3654                length: 2,
3655                token: Token::Word(Word::Word("и".to_string())),
3656            },
3657            PositionalToken {
3658                source: uws,
3659                offset: 404,
3660                length: 1,
3661                token: Token::Special(Special::Separator(Separator::Space)),
3662            },
3663            PositionalToken {
3664                source: uws,
3665                offset: 405,
3666                length: 6,
3667                token: Token::Word(Word::Word("жён".to_string())),
3668            },
3669            PositionalToken {
3670                source: uws,
3671                offset: 411,
3672                length: 1,
3673                token: Token::Special(Special::Separator(Separator::Space)),
3674            },
3675            PositionalToken {
3676                source: uws,
3677                offset: 412,
3678                length: 3,
3679                token: Token::Special(Special::Punctuation('—')),
3680            },
3681            PositionalToken {
3682                source: uws,
3683                offset: 415,
3684                length: 1,
3685                token: Token::Special(Special::Separator(Separator::Space)),
3686            },
3687            PositionalToken {
3688                source: uws,
3689                offset: 416,
3690                length: 6,
3691                token: Token::Word(Word::Word("это".to_string())),
3692            },
3693            PositionalToken {
3694                source: uws,
3695                offset: 422,
3696                length: 1,
3697                token: Token::Special(Special::Separator(Separator::Space)),
3698            },
3699            PositionalToken {
3700                source: uws,
3701                offset: 423,
3702                length: 18,
3703                token: Token::Word(Word::Word("всемирный".to_string())),
3704            },
3705            PositionalToken {
3706                source: uws,
3707                offset: 441,
3708                length: 1,
3709                token: Token::Special(Special::Separator(Separator::Space)),
3710            },
3711            PositionalToken {
3712                source: uws,
3713                offset: 442,
3714                length: 16,
3715                token: Token::Word(Word::Word("праздник".to_string())),
3716            },
3717            PositionalToken {
3718                source: uws,
3719                offset: 458,
3720                length: 1,
3721                token: Token::Special(Special::Punctuation(',')),
3722            },
3723            PositionalToken {
3724                source: uws,
3725                offset: 459,
3726                length: 1,
3727                token: Token::Special(Special::Separator(Separator::Space)),
3728            },
3729            PositionalToken {
3730                source: uws,
3731                offset: 460,
3732                length: 20,
3733                token: Token::Word(Word::Word("называемый".to_string())),
3734            },
3735            PositionalToken {
3736                source: uws,
3737                offset: 480,
3738                length: 1,
3739                token: Token::Special(Special::Separator(Separator::Space)),
3740            },
3741            PositionalToken {
3742                source: uws,
3743                offset: 481,
3744                length: 2,
3745                token: Token::Special(Special::Punctuation('«')),
3746            },
3747            PositionalToken {
3748                source: uws,
3749                offset: 483,
3750                length: 8,
3751                token: Token::Word(Word::Word("День".to_string())),
3752            },
3753            PositionalToken {
3754                source: uws,
3755                offset: 491,
3756                length: 1,
3757                token: Token::Special(Special::Separator(Separator::Space)),
3758            },
3759            PositionalToken {
3760                source: uws,
3761                offset: 492,
3762                length: 8,
3763                token: Token::Word(Word::Word("Мамы".to_string())),
3764            },
3765            PositionalToken {
3766                source: uws,
3767                offset: 500,
3768                length: 2,
3769                token: Token::Special(Special::Punctuation('»')),
3770            },
3771            PositionalToken {
3772                source: uws,
3773                offset: 502,
3774                length: 1,
3775                token: Token::Special(Special::Punctuation('.')),
3776            },
3777            PositionalToken {
3778                source: uws,
3779                offset: 503,
3780                length: 1,
3781                token: Token::Special(Special::Separator(Separator::Space)),
3782            },
3783            PositionalToken {
3784                source: uws,
3785                offset: 504,
3786                length: 2,
3787                token: Token::Word(Word::Word("В".to_string())),
3788            },
3789            PositionalToken {
3790                source: uws,
3791                offset: 506,
3792                length: 1,
3793                token: Token::Special(Special::Separator(Separator::Space)),
3794            },
3795            PositionalToken {
3796                source: uws,
3797                offset: 507,
3798                length: 18,
3799                token: Token::Word(Word::Word("настоящее".to_string())),
3800            },
3801            PositionalToken {
3802                source: uws,
3803                offset: 525,
3804                length: 1,
3805                token: Token::Special(Special::Separator(Separator::Space)),
3806            },
3807            PositionalToken {
3808                source: uws,
3809                offset: 526,
3810                length: 10,
3811                token: Token::Word(Word::Word("время".to_string())),
3812            },
3813            PositionalToken {
3814                source: uws,
3815                offset: 536,
3816                length: 1,
3817                token: Token::Special(Special::Separator(Separator::Space)),
3818            },
3819            PositionalToken {
3820                source: uws,
3821                offset: 537,
3822                length: 6,
3823                token: Token::Word(Word::Word("его".to_string())),
3824            },
3825            PositionalToken {
3826                source: uws,
3827                offset: 543,
3828                length: 1,
3829                token: Token::Special(Special::Separator(Separator::Space)),
3830            },
3831            PositionalToken {
3832                source: uws,
3833                offset: 544,
3834                length: 16,
3835                token: Token::Word(Word::Word("отмечают".to_string())),
3836            },
3837            PositionalToken {
3838                source: uws,
3839                offset: 560,
3840                length: 1,
3841                token: Token::Special(Special::Separator(Separator::Space)),
3842            },
3843            PositionalToken {
3844                source: uws,
3845                offset: 561,
3846                length: 10,
3847                token: Token::Word(Word::Word("почти".to_string())),
3848            },
3849            PositionalToken {
3850                source: uws,
3851                offset: 571,
3852                length: 1,
3853                token: Token::Special(Special::Separator(Separator::Space)),
3854            },
3855            PositionalToken {
3856                source: uws,
3857                offset: 572,
3858                length: 2,
3859                token: Token::Word(Word::Word("в".to_string())),
3860            },
3861            PositionalToken {
3862                source: uws,
3863                offset: 574,
3864                length: 1,
3865                token: Token::Special(Special::Separator(Separator::Space)),
3866            },
3867            PositionalToken {
3868                source: uws,
3869                offset: 575,
3870                length: 12,
3871                token: Token::Word(Word::Word("каждой".to_string())),
3872            },
3873            PositionalToken {
3874                source: uws,
3875                offset: 587,
3876                length: 1,
3877                token: Token::Special(Special::Separator(Separator::Space)),
3878            },
3879            PositionalToken {
3880                source: uws,
3881                offset: 588,
3882                length: 12,
3883                token: Token::Word(Word::Word("стране".to_string())),
3884            },
3885            PositionalToken {
3886                source: uws,
3887                offset: 600,
3888                length: 1,
3889                token: Token::Special(Special::Punctuation(',')),
3890            },
3891            PositionalToken {
3892                source: uws,
3893                offset: 601,
3894                length: 1,
3895                token: Token::Special(Special::Separator(Separator::Space)),
3896            },
3897            PositionalToken {
3898                source: uws,
3899                offset: 602,
3900                length: 12,
3901                token: Token::Word(Word::Word("просто".to_string())),
3902            },
3903            PositionalToken {
3904                source: uws,
3905                offset: 614,
3906                length: 1,
3907                token: Token::Special(Special::Separator(Separator::Space)),
3908            },
3909            PositionalToken {
3910                source: uws,
3911                offset: 615,
3912                length: 10,
3913                token: Token::Word(Word::Word("везде".to_string())),
3914            },
3915            PositionalToken {
3916                source: uws,
3917                offset: 625,
3918                length: 1,
3919                token: Token::Special(Special::Separator(Separator::Space)),
3920            },
3921            PositionalToken {
3922                source: uws,
3923                offset: 626,
3924                length: 12,
3925                token: Token::Word(Word::Word("разные".to_string())),
3926            },
3927            PositionalToken {
3928                source: uws,
3929                offset: 638,
3930                length: 1,
3931                token: Token::Special(Special::Separator(Separator::Space)),
3932            },
3933            PositionalToken {
3934                source: uws,
3935                offset: 639,
3936                length: 8,
3937                token: Token::Word(Word::Word("даты".to_string())),
3938            },
3939            PositionalToken {
3940                source: uws,
3941                offset: 647,
3942                length: 1,
3943                token: Token::Special(Special::Separator(Separator::Space)),
3944            },
3945            PositionalToken {
3946                source: uws,
3947                offset: 648,
3948                length: 2,
3949                token: Token::Word(Word::Word("и".to_string())),
3950            },
3951            PositionalToken {
3952                source: uws,
3953                offset: 650,
3954                length: 1,
3955                token: Token::Special(Special::Separator(Separator::Space)),
3956            },
3957            PositionalToken {
3958                source: uws,
3959                offset: 651,
3960                length: 14,
3961                token: Token::Word(Word::Word("способы".to_string())),
3962            },
3963            PositionalToken {
3964                source: uws,
3965                offset: 665,
3966                length: 1,
3967                token: Token::Special(Special::Separator(Separator::Space)),
3968            },
3969            PositionalToken {
3970                source: uws,
3971                offset: 666,
3972                length: 24,
3973                token: Token::Word(Word::Word("празднования".to_string())),
3974            },
3975            PositionalToken {
3976                source: uws,
3977                offset: 690,
3978                length: 1,
3979                token: Token::Special(Special::Punctuation('.')),
3980            },
3981            PositionalToken {
3982                source: uws,
3983                offset: 691,
3984                length: 1,
3985                token: Token::Special(Special::Separator(Separator::Space)),
3986            },
3987            PositionalToken {
3988                source: uws,
3989                offset: 794,
3990                length: 1,
3991                token: Token::Special(Special::Separator(Separator::Newline)),
3992            },
3993            PositionalToken {
3994                source: uws,
3995                offset: 795,
3996                length: 2,
3997                token: Token::Special(Special::Separator(Separator::Space)),
3998            },
3999            PositionalToken {
4000                source: uws,
4001                offset: 870,
4002                length: 1,
4003                token: Token::Special(Special::Separator(Separator::Newline)),
4004            },
4005            PositionalToken {
4006                source: uws,
4007                offset: 871,
4008                length: 2,
4009                token: Token::Special(Special::Separator(Separator::Space)),
4010            },
4011            PositionalToken {
4012                source: uws,
4013                offset: 910,
4014                length: 2,
4015                token: Token::Word(Word::Word("П".to_string())),
4016            },
4017            PositionalToken {
4018                source: uws,
4019                offset: 919,
4020                length: 1,
4021                token: Token::Special(Special::Separator(Separator::Newline)),
4022            },
4023            PositionalToken {
4024                source: uws,
4025                offset: 927,
4026                length: 12,
4027                token: Token::Word(Word::Word("ПОЧЕМУ".to_string())),
4028            },
4029            PositionalToken {
4030                source: uws,
4031                offset: 939,
4032                length: 1,
4033                token: Token::Special(Special::Separator(Separator::Space)),
4034            },
4035            PositionalToken {
4036                source: uws,
4037                offset: 940,
4038                length: 4,
4039                token: Token::Word(Word::Word("МЫ".to_string())),
4040            },
4041            PositionalToken {
4042                source: uws,
4043                offset: 944,
4044                length: 1,
4045                token: Token::Special(Special::Separator(Separator::Space)),
4046            },
4047            PositionalToken {
4048                source: uws,
4049                offset: 945,
4050                length: 6,
4051                token: Token::Word(Word::Word("ЕГО".to_string())),
4052            },
4053            PositionalToken {
4054                source: uws,
4055                offset: 951,
4056                length: 1,
4057                token: Token::Special(Special::Separator(Separator::Space)),
4058            },
4059            PositionalToken {
4060                source: uws,
4061                offset: 952,
4062                length: 18,
4063                token: Token::Word(Word::Word("ПРАЗДНУЕМ".to_string())),
4064            },
4065            PositionalToken {
4066                source: uws,
4067                offset: 1063,
4068                length: 2,
4069                token: Token::Word(Word::Word("В".to_string())),
4070            },
4071            PositionalToken {
4072                source: uws,
4073                offset: 1065,
4074                length: 1,
4075                token: Token::Special(Special::Separator(Separator::Space)),
4076            },
4077            PositionalToken {
4078                source: uws,
4079                offset: 1066,
4080                length: 4,
4081                token: Token::Word(Word::Number(Number::Integer(1987))),
4082            },
4083            PositionalToken {
4084                source: uws,
4085                offset: 1070,
4086                length: 1,
4087                token: Token::Special(Special::Separator(Separator::Space)),
4088            },
4089            PositionalToken {
4090                source: uws,
4091                offset: 1071,
4092                length: 8,
4093                token: Token::Word(Word::Word("году".to_string())),
4094            },
4095            PositionalToken {
4096                source: uws,
4097                offset: 1079,
4098                length: 1,
4099                token: Token::Special(Special::Separator(Separator::Space)),
4100            },
4101            PositionalToken {
4102                source: uws,
4103                offset: 1080,
4104                length: 14,
4105                token: Token::Word(Word::Word("комитет".to_string())),
4106            },
4107            PositionalToken {
4108                source: uws,
4109                offset: 1094,
4110                length: 1,
4111                token: Token::Special(Special::Separator(Separator::Space)),
4112            },
4113            PositionalToken {
4114                source: uws,
4115                offset: 1095,
4116                length: 14,
4117                token: Token::Word(Word::Word("госдумы".to_string())),
4118            },
4119            PositionalToken {
4120                source: uws,
4121                offset: 1109,
4122                length: 1,
4123                token: Token::Special(Special::Separator(Separator::Space)),
4124            },
4125            PositionalToken {
4126                source: uws,
4127                offset: 1110,
4128                length: 4,
4129                token: Token::Word(Word::Word("по".to_string())),
4130            },
4131            PositionalToken {
4132                source: uws,
4133                offset: 1114,
4134                length: 1,
4135                token: Token::Special(Special::Separator(Separator::Space)),
4136            },
4137            PositionalToken {
4138                source: uws,
4139                offset: 1115,
4140                length: 10,
4141                token: Token::Word(Word::Word("делам".to_string())),
4142            },
4143            PositionalToken {
4144                source: uws,
4145                offset: 1125,
4146                length: 1,
4147                token: Token::Special(Special::Separator(Separator::Space)),
4148            },
4149            PositionalToken {
4150                source: uws,
4151                offset: 1126,
4152                length: 12,
4153                token: Token::Word(Word::Word("женщин".to_string())),
4154            },
4155            PositionalToken {
4156                source: uws,
4157                offset: 1138,
4158                length: 1,
4159                token: Token::Special(Special::Punctuation(',')),
4160            },
4161            PositionalToken {
4162                source: uws,
4163                offset: 1139,
4164                length: 1,
4165                token: Token::Special(Special::Separator(Separator::Space)),
4166            },
4167            PositionalToken {
4168                source: uws,
4169                offset: 1140,
4170                length: 10,
4171                token: Token::Word(Word::Word("семьи".to_string())),
4172            },
4173            PositionalToken {
4174                source: uws,
4175                offset: 1150,
4176                length: 1,
4177                token: Token::Special(Special::Separator(Separator::Space)),
4178            },
4179            PositionalToken {
4180                source: uws,
4181                offset: 1151,
4182                length: 2,
4183                token: Token::Word(Word::Word("и".to_string())),
4184            },
4185            PositionalToken {
4186                source: uws,
4187                offset: 1153,
4188                length: 1,
4189                token: Token::Special(Special::Separator(Separator::Space)),
4190            },
4191            PositionalToken {
4192                source: uws,
4193                offset: 1154,
4194                length: 16,
4195                token: Token::Word(Word::Word("молодежи".to_string())),
4196            },
4197            PositionalToken {
4198                source: uws,
4199                offset: 1170,
4200                length: 1,
4201                token: Token::Special(Special::Separator(Separator::Space)),
4202            },
4203            PositionalToken {
4204                source: uws,
4205                offset: 1171,
4206                length: 16,
4207                token: Token::Word(Word::Word("выступил".to_string())),
4208            },
4209            PositionalToken {
4210                source: uws,
4211                offset: 1187,
4212                length: 1,
4213                token: Token::Special(Special::Separator(Separator::Space)),
4214            },
4215            PositionalToken {
4216                source: uws,
4217                offset: 1188,
4218                length: 2,
4219                token: Token::Word(Word::Word("с".to_string())),
4220            },
4221            PositionalToken {
4222                source: uws,
4223                offset: 1190,
4224                length: 1,
4225                token: Token::Special(Special::Separator(Separator::Space)),
4226            },
4227            PositionalToken {
4228                source: uws,
4229                offset: 1191,
4230                length: 24,
4231                token: Token::Word(Word::Word("предложением".to_string())),
4232            },
4233            PositionalToken {
4234                source: uws,
4235                offset: 1215,
4236                length: 1,
4237                token: Token::Special(Special::Separator(Separator::Space)),
4238            },
4239            PositionalToken {
4240                source: uws,
4241                offset: 1216,
4242                length: 16,
4243                token: Token::Word(Word::Word("учредить".to_string())),
4244            },
4245            PositionalToken {
4246                source: uws,
4247                offset: 1232,
4248                length: 1,
4249                token: Token::Special(Special::Separator(Separator::Space)),
4250            },
4251            PositionalToken {
4252                source: uws,
4253                offset: 1233,
4254                length: 2,
4255                token: Token::Special(Special::Punctuation('«')),
4256            },
4257            PositionalToken {
4258                source: uws,
4259                offset: 1235,
4260                length: 8,
4261                token: Token::Word(Word::Word("День".to_string())),
4262            },
4263            PositionalToken {
4264                source: uws,
4265                offset: 1243,
4266                length: 1,
4267                token: Token::Special(Special::Separator(Separator::Space)),
4268            },
4269            PositionalToken {
4270                source: uws,
4271                offset: 1244,
4272                length: 8,
4273                token: Token::Word(Word::Word("мамы".to_string())),
4274            },
4275            PositionalToken {
4276                source: uws,
4277                offset: 1252,
4278                length: 2,
4279                token: Token::Special(Special::Punctuation('»')),
4280            },
4281            PositionalToken {
4282                source: uws,
4283                offset: 1254,
4284                length: 1,
4285                token: Token::Special(Special::Punctuation(',')),
4286            },
4287            PositionalToken {
4288                source: uws,
4289                offset: 1255,
4290                length: 1,
4291                token: Token::Special(Special::Separator(Separator::Space)),
4292            },
4293            PositionalToken {
4294                source: uws,
4295                offset: 1256,
4296                length: 2,
4297                token: Token::Word(Word::Word("а".to_string())),
4298            },
4299            PositionalToken {
4300                source: uws,
4301                offset: 1258,
4302                length: 1,
4303                token: Token::Special(Special::Separator(Separator::Space)),
4304            },
4305            PositionalToken {
4306                source: uws,
4307                offset: 1259,
4308                length: 6,
4309                token: Token::Word(Word::Word("сам".to_string())),
4310            },
4311            PositionalToken {
4312                source: uws,
4313                offset: 1265,
4314                length: 1,
4315                token: Token::Special(Special::Separator(Separator::Space)),
4316            },
4317            PositionalToken {
4318                source: uws,
4319                offset: 1266,
4320                length: 12,
4321                token: Token::Word(Word::Word("приказ".to_string())),
4322            },
4323            PositionalToken {
4324                source: uws,
4325                offset: 1278,
4326                length: 1,
4327                token: Token::Special(Special::Separator(Separator::Space)),
4328            },
4329            PositionalToken {
4330                source: uws,
4331                offset: 1279,
4332                length: 6,
4333                token: Token::Word(Word::Word("был".to_string())),
4334            },
4335            PositionalToken {
4336                source: uws,
4337                offset: 1285,
4338                length: 1,
4339                token: Token::Special(Special::Separator(Separator::Space)),
4340            },
4341            PositionalToken {
4342                source: uws,
4343                offset: 1286,
4344                length: 16,
4345                token: Token::Word(Word::Word("подписан".to_string())),
4346            },
4347            PositionalToken {
4348                source: uws,
4349                offset: 1302,
4350                length: 1,
4351                token: Token::Special(Special::Separator(Separator::Space)),
4352            },
4353            PositionalToken {
4354                source: uws,
4355                offset: 1303,
4356                length: 6,
4357                token: Token::Word(Word::Word("уже".to_string())),
4358            },
4359            PositionalToken {
4360                source: uws,
4361                offset: 1309,
4362                length: 1,
4363                token: Token::Special(Special::Separator(Separator::Space)),
4364            },
4365            PositionalToken {
4366                source: uws,
4367                offset: 1310,
4368                length: 2,
4369                token: Token::Word(Word::Number(Number::Integer(30))),
4370            },
4371            PositionalToken {
4372                source: uws,
4373                offset: 1312,
4374                length: 1,
4375                token: Token::Special(Special::Separator(Separator::Space)),
4376            },
4377            PositionalToken {
4378                source: uws,
4379                offset: 1313,
4380                length: 12,
4381                token: Token::Word(Word::Word("января".to_string())),
4382            },
4383            PositionalToken {
4384                source: uws,
4385                offset: 1325,
4386                length: 1,
4387                token: Token::Special(Special::Separator(Separator::Space)),
4388            },
4389            PositionalToken {
4390                source: uws,
4391                offset: 1326,
4392                length: 4,
4393                token: Token::Word(Word::Number(Number::Integer(1988))),
4394            },
4395            PositionalToken {
4396                source: uws,
4397                offset: 1330,
4398                length: 1,
4399                token: Token::Special(Special::Separator(Separator::Space)),
4400            },
4401            PositionalToken {
4402                source: uws,
4403                offset: 1331,
4404                length: 8,
4405                token: Token::Word(Word::Word("года".to_string())),
4406            },
4407            PositionalToken {
4408                source: uws,
4409                offset: 1339,
4410                length: 1,
4411                token: Token::Special(Special::Separator(Separator::Space)),
4412            },
4413            PositionalToken {
4414                source: uws,
4415                offset: 1340,
4416                length: 14,
4417                token: Token::Word(Word::Word("Борисом".to_string())),
4418            },
4419            PositionalToken {
4420                source: uws,
4421                offset: 1354,
4422                length: 1,
4423                token: Token::Special(Special::Separator(Separator::Space)),
4424            },
4425            PositionalToken {
4426                source: uws,
4427                offset: 1355,
4428                length: 16,
4429                token: Token::Word(Word::Word("Ельциным".to_string())),
4430            },
4431            PositionalToken {
4432                source: uws,
4433                offset: 1371,
4434                length: 1,
4435                token: Token::Special(Special::Punctuation('.')),
4436            },
4437            PositionalToken {
4438                source: uws,
4439                offset: 1372,
4440                length: 1,
4441                token: Token::Special(Special::Separator(Separator::Space)),
4442            },
4443            PositionalToken {
4444                source: uws,
4445                offset: 1373,
4446                length: 8,
4447                token: Token::Word(Word::Word("Было".to_string())),
4448            },
4449            PositionalToken {
4450                source: uws,
4451                offset: 1381,
4452                length: 1,
4453                token: Token::Special(Special::Separator(Separator::Space)),
4454            },
4455            PositionalToken {
4456                source: uws,
4457                offset: 1382,
4458                length: 12,
4459                token: Token::Word(Word::Word("решено".to_string())),
4460            },
4461            PositionalToken {
4462                source: uws,
4463                offset: 1394,
4464                length: 1,
4465                token: Token::Special(Special::Punctuation(',')),
4466            },
4467            PositionalToken {
4468                source: uws,
4469                offset: 1395,
4470                length: 1,
4471                token: Token::Special(Special::Separator(Separator::Space)),
4472            },
4473            PositionalToken {
4474                source: uws,
4475                offset: 1396,
4476                length: 6,
4477                token: Token::Word(Word::Word("что".to_string())),
4478            },
4479            PositionalToken {
4480                source: uws,
4481                offset: 1402,
4482                length: 1,
4483                token: Token::Special(Special::Separator(Separator::Space)),
4484            },
4485            PositionalToken {
4486                source: uws,
4487                offset: 1403,
4488                length: 16,
4489                token: Token::Word(Word::Word("ежегодно".to_string())),
4490            },
4491            PositionalToken {
4492                source: uws,
4493                offset: 1419,
4494                length: 1,
4495                token: Token::Special(Special::Separator(Separator::Space)),
4496            },
4497            PositionalToken {
4498                source: uws,
4499                offset: 1420,
4500                length: 2,
4501                token: Token::Word(Word::Word("в".to_string())),
4502            },
4503            PositionalToken {
4504                source: uws,
4505                offset: 1422,
4506                length: 1,
4507                token: Token::Special(Special::Separator(Separator::Space)),
4508            },
4509            PositionalToken {
4510                source: uws,
4511                offset: 1423,
4512                length: 12,
4513                token: Token::Word(Word::Word("России".to_string())),
4514            },
4515            PositionalToken {
4516                source: uws,
4517                offset: 1435,
4518                length: 1,
4519                token: Token::Special(Special::Separator(Separator::Space)),
4520            },
4521            PositionalToken {
4522                source: uws,
4523                offset: 1436,
4524                length: 22,
4525                token: Token::Word(Word::Word("празднество".to_string())),
4526            },
4527            PositionalToken {
4528                source: uws,
4529                offset: 1458,
4530                length: 1,
4531                token: Token::Special(Special::Separator(Separator::Space)),
4532            },
4533            PositionalToken {
4534                source: uws,
4535                offset: 1459,
4536                length: 6,
4537                token: Token::Word(Word::Word("дня".to_string())),
4538            },
4539            PositionalToken {
4540                source: uws,
4541                offset: 1465,
4542                length: 1,
4543                token: Token::Special(Special::Separator(Separator::Space)),
4544            },
4545            PositionalToken {
4546                source: uws,
4547                offset: 1466,
4548                length: 8,
4549                token: Token::Word(Word::Word("мамы".to_string())),
4550            },
4551            PositionalToken {
4552                source: uws,
4553                offset: 1474,
4554                length: 1,
4555                token: Token::Special(Special::Separator(Separator::Space)),
4556            },
4557            PositionalToken {
4558                source: uws,
4559                offset: 1475,
4560                length: 10,
4561                token: Token::Word(Word::Word("будет".to_string())),
4562            },
4563            PositionalToken {
4564                source: uws,
4565                offset: 1485,
4566                length: 1,
4567                token: Token::Special(Special::Separator(Separator::Space)),
4568            },
4569            PositionalToken {
4570                source: uws,
4571                offset: 1486,
4572                length: 16,
4573                token: Token::Word(Word::Word("выпадать".to_string())),
4574            },
4575            PositionalToken {
4576                source: uws,
4577                offset: 1502,
4578                length: 1,
4579                token: Token::Special(Special::Separator(Separator::Space)),
4580            },
4581            PositionalToken {
4582                source: uws,
4583                offset: 1503,
4584                length: 4,
4585                token: Token::Word(Word::Word("на".to_string())),
4586            },
4587            PositionalToken {
4588                source: uws,
4589                offset: 1507,
4590                length: 1,
4591                token: Token::Special(Special::Separator(Separator::Space)),
4592            },
4593            PositionalToken {
4594                source: uws,
4595                offset: 1508,
4596                length: 18,
4597                token: Token::Word(Word::Word("последнее".to_string())),
4598            },
4599            PositionalToken {
4600                source: uws,
4601                offset: 1526,
4602                length: 1,
4603                token: Token::Special(Special::Separator(Separator::Space)),
4604            },
4605            PositionalToken {
4606                source: uws,
4607                offset: 1527,
4608                length: 22,
4609                token: Token::Word(Word::Word("воскресенье".to_string())),
4610            },
4611            PositionalToken {
4612                source: uws,
4613                offset: 1549,
4614                length: 1,
4615                token: Token::Special(Special::Separator(Separator::Space)),
4616            },
4617            PositionalToken {
4618                source: uws,
4619                offset: 1550,
4620                length: 12,
4621                token: Token::Word(Word::Word("ноября".to_string())),
4622            },
4623            PositionalToken {
4624                source: uws,
4625                offset: 1562,
4626                length: 1,
4627                token: Token::Special(Special::Punctuation('.')),
4628            },
4629            PositionalToken {
4630                source: uws,
4631                offset: 1563,
4632                length: 1,
4633                token: Token::Special(Special::Separator(Separator::Space)),
4634            },
4635            PositionalToken {
4636                source: uws,
4637                offset: 1664,
4638                length: 1,
4639                token: Token::Special(Special::Separator(Separator::Newline)),
4640            },
4641            PositionalToken {
4642                source: uws,
4643                offset: 1665,
4644                length: 2,
4645                token: Token::Special(Special::Separator(Separator::Space)),
4646            },
4647            PositionalToken {
4648                source: uws,
4649                offset: 1725,
4650                length: 1,
4651                token: Token::Special(Special::Separator(Separator::Newline)),
4652            },
4653            PositionalToken {
4654                source: uws,
4655                offset: 1726,
4656                length: 4,
4657                token: Token::Special(Special::Separator(Separator::Space)),
4658            },
4659            PositionalToken {
4660                source: uws,
4661                offset: 2725,
4662                length: 1,
4663                token: Token::Special(Special::Separator(Separator::Newline)),
4664            },
4665            PositionalToken {
4666                source: uws,
4667                offset: 2726,
4668                length: 2,
4669                token: Token::Special(Special::Separator(Separator::Space)),
4670            },
4671            PositionalToken {
4672                source: uws,
4673                offset: 2888,
4674                length: 1,
4675                token: Token::Special(Special::Separator(Separator::Newline)),
4676            },
4677            PositionalToken {
4678                source: uws,
4679                offset: 2889,
4680                length: 2,
4681                token: Token::Special(Special::Separator(Separator::Space)),
4682            },
4683            PositionalToken {
4684                source: uws,
4685                offset: 2891,
4686                length: 1,
4687                token: Token::Special(Special::Separator(Separator::Newline)),
4688            },
4689            PositionalToken {
4690                source: uws,
4691                offset: 2904,
4692                length: 1,
4693                token: Token::Special(Special::Separator(Separator::Newline)),
4694            },
4695            PositionalToken {
4696                source: uws,
4697                offset: 2905,
4698                length: 4,
4699                token: Token::Special(Special::Separator(Separator::Space)),
4700            },
4701        ];
4702
4703        let text = Text::new({
4704            uws.into_source()
4705                .pipe(tagger::Builder::new().create().into_breaker())
4706                .pipe(entities::Builder::new().create().into_piped())
4707                .into_separator()
4708        })
4709        .unwrap();
4710
4711        let lib_res = text
4712            .into_tokenizer(TokenizerParams::v1())
4713            .filter_map(|tt| tt.into_original_token_1())
4714            .collect::<Vec<_>>();
4715
4716        check_results(&result, &lib_res, uws);
4717    }
4718
4719    /*#[test]
4720    fn vk_bbcode() {
4721        let uws = "[club113623432|💜💜💜 - для девушек] \n[club113623432|💛💛💛 - для сохраненок]";
4722        let result = vec![
4723            PositionalToken { offset: 0, length: 52, token: Token::BBCode { left: vec![
4724                PositionalToken { offset: 1, length: 13, token: Token::Word(Word::Numerical(Numerical::Alphanumeric("club113623432".to_string()))) },
4725                ], right: vec![
4726                PositionalToken { offset: 15, length: 4, token: Token::Word(Word::Emoji("purple_heart")) },
4727                PositionalToken { offset: 19, length: 4, token: Token::Word(Word::Emoji("purple_heart")) },
4728                PositionalToken { offset: 23, length: 4, token: Token::Word(Word::Emoji("purple_heart")) },
4729                PositionalToken { offset: 27, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
4730                PositionalToken { offset: 28, length: 1, token: Token::Special(Special::Punctuation('-')) },
4731                PositionalToken { offset: 29, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
4732                PositionalToken { offset: 30, length: 6, token: Token::Word(Word::Word("для".to_string())) },
4733                PositionalToken { offset: 36, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
4734                PositionalToken { offset: 37, length: 14, token: Token::Word(Word::Word("девушек".to_string())) },
4735                ] } },
4736            PositionalToken { offset: 52, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
4737            PositionalToken { offset: 53, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
4738            PositionalToken { offset: 54, length: 58, token: Token::BBCode { left: vec![
4739                PositionalToken { offset: 55, length: 13, token: Token::Word(Word::Numerical(Numerical::Alphanumeric("club113623432".to_string()))) },
4740                ], right: vec![
4741                PositionalToken { offset: 69, length: 4, token: Token::Word(Word::Emoji("yellow_heart")) },
4742                PositionalToken { offset: 73, length: 4, token: Token::Word(Word::Emoji("yellow_heart")) },
4743                PositionalToken { offset: 77, length: 4, token: Token::Word(Word::Emoji("yellow_heart")) },
4744                PositionalToken { offset: 81, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
4745                PositionalToken { offset: 82, length: 1, token: Token::Special(Special::Punctuation('-')) },
4746                PositionalToken { offset: 83, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
4747                PositionalToken { offset: 84, length: 6, token: Token::Word(Word::Word("для".to_string())) },
4748                PositionalToken { offset: 90, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
4749                PositionalToken { offset: 91, length: 20, token: Token::Word(Word::Word("сохраненок".to_string())) },
4750                ] } },
4751            ];
4752        let lib_res = uws.into_tokenizer(TokenizerParams::complex()).collect::<Vec<_>>();
4753        //print_result(&lib_res); panic!("");
4754        check_results(&result,&lib_res,uws);
4755    }*/
4756
4757    /*#[test]
4758    fn text_href_and_html () {
4759        let uws = "https://youtu.be/dQErLQZw3qA</a></p><figure data-type=\"102\" data-mode=\"\"  class=\"article_decoration_first article_decoration_last\" >\n";
4760        let result =  vec![
4761            PositionalToken { offset: 0, length: 28, token: Token::Struct(Struct::Url("https://youtu.be/dQErLQZw3qA".to_string())) },
4762            PositionalToken { offset: 132, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
4763            ];
4764        let lib_res = uws.into_tokenizer(TokenizerParams::v1()).unwrap().collect::<Vec<_>>();
4765        check_results(&result,&lib_res,uws);
4766        //print_result(&lib_res); panic!("")
4767    }*/
4768
    // Verifies tokenization with `Default::default()` options: numeric runs
    // joined by '.' or '_' are kept as single Numerical tokens (DotSeparated /
    // Measures / Alphanumeric) instead of being split at the punctuation.
    // Note: `offset` and `length` are byte positions into `uws` — Cyrillic
    // characters are 2 bytes each (e.g. "1кг" spans 5 bytes for 3 chars).
    #[test]
    fn numerical_no_split() {
        let uws = "12.02.18 31.28.34 23.11.2018 123.568.365.234.578 127.0.0.1 1st 1кг 123123афываыв 12321фвафыов234выалфо 12_123_343.4234_4234";
        let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
        //print_result(&lib_res); panic!("");
        let result = vec![
            // date-like runs stay whole: "12.02.18", "31.28.34", "23.11.2018"
            PositionalToken {
                source: uws,
                offset: 0,
                length: 8,
                token: Token::Word(Word::Numerical(Numerical::DotSeparated(
                    "12.02.18".to_string(),
                ))),
            },
            PositionalToken {
                source: uws,
                offset: 8,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 9,
                length: 8,
                token: Token::Word(Word::Numerical(Numerical::DotSeparated(
                    "31.28.34".to_string(),
                ))),
            },
            PositionalToken {
                source: uws,
                offset: 17,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 18,
                length: 10,
                token: Token::Word(Word::Numerical(Numerical::DotSeparated(
                    "23.11.2018".to_string(),
                ))),
            },
            PositionalToken {
                source: uws,
                offset: 28,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // longer dot-separated sequences (thousands-style, IP address)
            PositionalToken {
                source: uws,
                offset: 29,
                length: 19,
                token: Token::Word(Word::Numerical(Numerical::DotSeparated(
                    "123.568.365.234.578".to_string(),
                ))),
            },
            PositionalToken {
                source: uws,
                offset: 48,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 49,
                length: 9,
                token: Token::Word(Word::Numerical(Numerical::DotSeparated(
                    "127.0.0.1".to_string(),
                ))),
            },
            PositionalToken {
                source: uws,
                offset: 58,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // digits + letter suffix classify as Measures ("1st", "1кг", ...)
            PositionalToken {
                source: uws,
                offset: 59,
                length: 3,
                token: Token::Word(Word::Numerical(Numerical::Measures("1st".to_string()))),
            },
            PositionalToken {
                source: uws,
                offset: 62,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 63,
                length: 5,
                token: Token::Word(Word::Numerical(Numerical::Measures("1кг".to_string()))),
            },
            PositionalToken {
                source: uws,
                offset: 68,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 69,
                length: 20,
                token: Token::Word(Word::Numerical(Numerical::Measures(
                    "123123афываыв".to_string(),
                ))),
            },
            PositionalToken {
                source: uws,
                offset: 89,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // digits interleaved with letters -> Alphanumeric
            PositionalToken {
                source: uws,
                offset: 90,
                length: 34,
                token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
                    "12321фвафыов234выалфо".to_string(),
                ))),
            },
            PositionalToken {
                source: uws,
                offset: 124,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // underscore-joined digits also stay one Alphanumeric token
            PositionalToken {
                source: uws,
                offset: 125,
                length: 20,
                token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
                    "12_123_343.4234_4234".to_string(),
                ))),
            },
        ];
        check_results(&result, &lib_res, uws);
    }
4908
    // Same input string as `numerical_no_split`, but tokenized with
    // `TokenizerParams::v1()`: pure digit runs are now split at '.'/'_'
    // into individual Number tokens with Punctuation between them, while
    // Measures/Alphanumeric tokens (digits mixed with letters) stay whole.
    // `offset`/`length` are byte positions (Cyrillic chars are 2 bytes).
    #[test]
    fn numerical_default() {
        let uws = "12.02.18 31.28.34 23.11.2018 123.568.365.234.578 127.0.0.1 1st 1кг 123123афываыв 12321фвафыов234выалфо 12_123_343.4234_4234";
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        //print_result(&lib_res); panic!("");
        let result = vec![
            // "12.02.18" -> 12 '.' 2 '.' 18
            PositionalToken {
                source: uws,
                offset: 0,
                length: 2,
                token: Token::Word(Word::Number(Number::Integer(12))),
            },
            PositionalToken {
                source: uws,
                offset: 2,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            // NOTE(review): "02" is expected as Integer(2), not ZeroInteger,
            // despite the leading zero — confirm this is intended.
            PositionalToken {
                source: uws,
                offset: 3,
                length: 2,
                token: Token::Word(Word::Number(Number::Integer(2))),
            },
            PositionalToken {
                source: uws,
                offset: 5,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 6,
                length: 2,
                token: Token::Word(Word::Number(Number::Integer(18))),
            },
            PositionalToken {
                source: uws,
                offset: 8,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // "31.28.34" -> 31 '.' 28 '.' 34
            PositionalToken {
                source: uws,
                offset: 9,
                length: 2,
                token: Token::Word(Word::Number(Number::Integer(31))),
            },
            PositionalToken {
                source: uws,
                offset: 11,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 12,
                length: 2,
                token: Token::Word(Word::Number(Number::Integer(28))),
            },
            PositionalToken {
                source: uws,
                offset: 14,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 15,
                length: 2,
                token: Token::Word(Word::Number(Number::Integer(34))),
            },
            PositionalToken {
                source: uws,
                offset: 17,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // "23.11.2018" -> 23 '.' 11 '.' 2018
            PositionalToken {
                source: uws,
                offset: 18,
                length: 2,
                token: Token::Word(Word::Number(Number::Integer(23))),
            },
            PositionalToken {
                source: uws,
                offset: 20,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 21,
                length: 2,
                token: Token::Word(Word::Number(Number::Integer(11))),
            },
            PositionalToken {
                source: uws,
                offset: 23,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 24,
                length: 4,
                token: Token::Word(Word::Number(Number::Integer(2018))),
            },
            PositionalToken {
                source: uws,
                offset: 28,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // "123.568.365.234.578" split into five integers
            PositionalToken {
                source: uws,
                offset: 29,
                length: 3,
                token: Token::Word(Word::Number(Number::Integer(123))),
            },
            PositionalToken {
                source: uws,
                offset: 32,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 33,
                length: 3,
                token: Token::Word(Word::Number(Number::Integer(568))),
            },
            PositionalToken {
                source: uws,
                offset: 36,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 37,
                length: 3,
                token: Token::Word(Word::Number(Number::Integer(365))),
            },
            PositionalToken {
                source: uws,
                offset: 40,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 41,
                length: 3,
                token: Token::Word(Word::Number(Number::Integer(234))),
            },
            PositionalToken {
                source: uws,
                offset: 44,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 45,
                length: 3,
                token: Token::Word(Word::Number(Number::Integer(578))),
            },
            PositionalToken {
                source: uws,
                offset: 48,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // "127.0.0.1" split into four integers
            PositionalToken {
                source: uws,
                offset: 49,
                length: 3,
                token: Token::Word(Word::Number(Number::Integer(127))),
            },
            PositionalToken {
                source: uws,
                offset: 52,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 53,
                length: 1,
                token: Token::Word(Word::Number(Number::Integer(0))),
            },
            PositionalToken {
                source: uws,
                offset: 54,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 55,
                length: 1,
                token: Token::Word(Word::Number(Number::Integer(0))),
            },
            PositionalToken {
                source: uws,
                offset: 56,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 57,
                length: 1,
                token: Token::Word(Word::Number(Number::Integer(1))),
            },
            PositionalToken {
                source: uws,
                offset: 58,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // mixed digit/letter tokens are NOT split (same as no_split test)
            PositionalToken {
                source: uws,
                offset: 59,
                length: 3,
                token: Token::Word(Word::Numerical(Numerical::Measures("1st".to_string()))),
            },
            PositionalToken {
                source: uws,
                offset: 62,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 63,
                length: 5,
                token: Token::Word(Word::Numerical(Numerical::Measures("1кг".to_string()))),
            },
            PositionalToken {
                source: uws,
                offset: 68,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 69,
                length: 20,
                token: Token::Word(Word::Numerical(Numerical::Measures(
                    "123123афываыв".to_string(),
                ))),
            },
            PositionalToken {
                source: uws,
                offset: 89,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 90,
                length: 34,
                token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
                    "12321фвафыов234выалфо".to_string(),
                ))),
            },
            PositionalToken {
                source: uws,
                offset: 124,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // "12_123_343.4234_4234" -> split at both '_' and '.'
            PositionalToken {
                source: uws,
                offset: 125,
                length: 2,
                token: Token::Word(Word::Number(Number::Integer(12))),
            },
            PositionalToken {
                source: uws,
                offset: 127,
                length: 1,
                token: Token::Special(Special::Punctuation('_')),
            },
            PositionalToken {
                source: uws,
                offset: 128,
                length: 3,
                token: Token::Word(Word::Number(Number::Integer(123))),
            },
            PositionalToken {
                source: uws,
                offset: 131,
                length: 1,
                token: Token::Special(Special::Punctuation('_')),
            },
            PositionalToken {
                source: uws,
                offset: 132,
                length: 3,
                token: Token::Word(Word::Number(Number::Integer(343))),
            },
            PositionalToken {
                source: uws,
                offset: 135,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 136,
                length: 4,
                token: Token::Word(Word::Number(Number::Integer(4234))),
            },
            PositionalToken {
                source: uws,
                offset: 140,
                length: 1,
                token: Token::Special(Special::Punctuation('_')),
            },
            PositionalToken {
                source: uws,
                offset: 141,
                length: 4,
                token: Token::Word(Word::Number(Number::Integer(4234))),
            },
        ];
        check_results(&result, &lib_res, uws);
    }
5242
5243    /*#[test]
5244        fn new_test() {
5245            let uws = "";
5246            let lib_res = uws.into_tokenizer(TokenizerParams::v1()).unwrap().collect::<Vec<_>>();
5247            print_result(&lib_res); panic!("");
5248            let result = vec![];
5249            check_results(&result,&lib_res,uws);
5250
5251    }*/
5252
5253    /* Language tests */
5254
    // Fixture selector for the per-language tokenizer tests below
    // (variant names follow ISO 639-3 language codes).
    enum Lang {
        Zho, // Chinese fixture
        Jpn, // Japanese fixture
        Kor, // Korean fixture
        Ara, // Arabic-script fixture
        Ell, // Greek fixture
    }
5262
5263    #[test]
5264    fn test_lang_zho() {
5265        let (uws, result) = get_lang_test(Lang::Zho);
5266        let lib_res = uws
5267            .into_tokenizer(TokenizerParams::v1())
5268            .collect::<Vec<_>>();
5269        check_results(&result, &lib_res, &uws);
5270    }
5271
5272    #[test]
5273    fn test_lang_jpn() {
5274        let (uws, result) = get_lang_test(Lang::Jpn);
5275        let lib_res = uws
5276            .into_tokenizer(TokenizerParams::v1())
5277            .collect::<Vec<_>>();
5278        check_results(&result, &lib_res, &uws);
5279    }
5280
5281    #[test]
5282    fn test_lang_kor() {
5283        let (uws, result) = get_lang_test(Lang::Kor);
5284        let lib_res = uws
5285            .into_tokenizer(TokenizerParams::v1())
5286            .collect::<Vec<_>>();
5287        check_results(&result, &lib_res, &uws);
5288    }
5289
5290    #[test]
5291    fn test_lang_ara() {
5292        let (uws, result) = get_lang_test(Lang::Ara);
5293        let lib_res = uws
5294            .into_tokenizer(TokenizerParams::v1())
5295            .collect::<Vec<_>>();
5296        check_results(&result, &lib_res, &uws);
5297    }
5298
5299    #[test]
5300    fn test_lang_ell() {
5301        let (uws, result) = get_lang_test(Lang::Ell);
5302        let lib_res = uws
5303            .into_tokenizer(TokenizerParams::v1())
5304            .collect::<Vec<_>>();
5305        check_results(&result, &lib_res, &uws);
5306    }
5307
5308    fn get_lang_test(lng: Lang) -> (String, Vec<PositionalToken>) {
5309        let uws = match lng {
5310            Lang::Zho => "美国电视连续剧《超人前传》的第一集《试播集》于2001年10月16日在電視網首播,剧集主创人阿尔弗雷德·高夫和迈尔斯·米勒編劇,大卫·努特尔执导。这一试播首次向观众引荐了克拉克·肯特一角,他是位拥有超能力的外星孤儿,与家人和朋友一起在堪薩斯州虚构小镇斯莫维尔生活。在这一集里,肯特首度得知自己的来历,同时还需要阻止一位学生试图杀死镇上高中多名学生的报复之举。本集节目里引入了多个之后将贯穿全季甚至整部剧集的主题元素,例如几位主要角色之间的三角恋情。电视剧在加拿大溫哥華取景,旨在选用其“美国中产阶级”景观,主创人花了5个月的时间专门用于为主角物色合适的演员。试播集在所有演员选好4天后正式开拍。由于时间上的限制,剧组无法搭建好实体外景,因此只能使用计算机绘图技术将数字化的外景插入到镜头中。节目一经上映就打破了电视网的多项收视纪录,并且获得了评论员的普遍好评和多个奖项提名,并在其中两项上胜出",
5311            Lang::Kor =>  "플레이스테이션 은 소니 컴퓨터 엔터테인먼트가 개발한 세 번째 가정용 게임기이다. 마이크로소프트의 엑스박스 360, 닌텐도의 Wii와 경쟁하고 있다. 이전 제품에서 온라인 플레이 기능을 비디오 게임 개발사에 전적으로 의존하던 것과 달리 통합 온라인 게임 서비스인 플레이스테이션 네트워크 서비스를 발매와 함께 시작해 제공하고 있으며, 탄탄한 멀티미디어 재생 기능, 플레이스테이션 포터블과의 연결, 고화질 광학 디스크 포맷인 블루레이 디스크 재생 기능 등의 기능을 갖추고 있다. 2006년 11월 11일에 일본에서 처음으로 출시했으며, 11월 17일에는 북미 지역, 2007년 3월 23일에는 유럽과 오세아니아 지역에서, 대한민국의 경우 6월 5일부터 일주일간 예약판매를 실시해, 매일 준비한 수량이 동이 나는 등 많은 관심을 받았으며 6월 16일에 정식 출시 행사를 열었다",
5312            Lang::Jpn => "熊野三山本願所は、15世紀末以降における熊野三山(熊野本宮、熊野新宮、熊野那智)の造営・修造のための勧進を担った組織の総称。 熊野三山を含めて、日本における古代から中世前半にかけての寺社の造営は、寺社領経営のような恒常的財源、幕府や朝廷などからの一時的な造営料所の寄進、あるいは公権力からの臨時の保護によって行われていた。しかしながら、熊野三山では、これらの財源はすべて15世紀半ばまでに実効性を失った",
5313            Lang::Ara => "لشکرکشی‌های روس‌های وارنگی به دریای خزر مجموعه‌ای از حملات نظامی در بین سال‌های ۸۶۴ تا ۱۰۴۱ میلادی به سواحل دریای خزر بوده‌است. روس‌های وارنگی ابتدا در قرن نهم میلادی به عنوان بازرگانان پوست، عسل و برده در سرزمین‌های اسلامی(سرکلند) ظاهر شدند. این بازرگانان در مسیر تجاری ولگا به خرید و فروش می‌پرداختند. نخستین حملهٔ آنان در فاصله سال‌های ۸۶۴ تا ۸۸۴ میلادی در مقیاسی کوچک علیه علویان طبرستان رخ داد. نخستین یورش بزرگ روس‌ها در سال ۹۱۳ رخ داد و آنان با ۵۰۰ فروند درازکشتی شهر گرگان و اطراف آن را غارت کردند. آن‌ها در این حمله مقداری کالا و برده را به تاراج بردند و در راه بازگشتن به سمت شمال، در دلتای ولگا، مورد حملهٔ خزرهای مسلمان قرار گرفتند و بعضی از آنان موفق به فرار شدند، ولی در میانهٔ ولگا به قتل رسیدند. دومین هجوم بزرگ روس‌ها به دریای خزر در سال ۹۴۳ به وقوع پیوست. در این دوره ایگور یکم، حاکم روس کیف، رهبری روس‌ها را در دست داشت. روس‌ها پس از توافق با دولت خزرها برای عبور امن از منطقه، تا رود کورا و اعماق قفقاز پیش رفتند و در سال ۹۴۳ موفق شدند بندر بردعه، پایتخت اران (جمهوری آذربایجان کنونی)، را تصرف کنند. روس‌ها در آنجا به مدت چند ماه ماندند و بسیاری از ساکنان شهر را کشتند و از راه غارت‌گری اموالی را به تاراج بردند. تنها دلیل بازگشت آنان ",
5314            Lang::Ell => "Το Πρόγραμμα υλοποιείται εξ ολοκλήρου από απόσταση και μπορεί να συμμετέχει κάθε εμπλεκόμενος στη ή/και ενδιαφερόμενος για τη διδασκαλία της Ελληνικής ως δεύτερης/ξένης γλώσσας στην Ελλάδα και στο εξωτερικό, αρκεί να είναι απόφοιτος ελληνικής φιλολογίας, ξένων φιλολογιών, παιδαγωγικών τμημάτων, θεολογικών σχολών ή άλλων πανεπιστημιακών τμημάτων ελληνικών ή ισότιμων ξένων πανεπιστημίων. Υπό όρους γίνονται δεκτοί υποψήφιοι που δεν έχουν ολοκληρώσει σπουδές τριτοβάθμιας εκπαίδευσης.",
5315        };
5316        let tokens = match lng {
5317            Lang::Zho => vec![
5318                PositionalToken {
5319                    source: uws,
5320                    offset: 0,
5321                    length: 3,
5322                    token: Token::Word(Word::Word("美".to_string())),
5323                },
5324                PositionalToken {
5325                    source: uws,
5326                    offset: 3,
5327                    length: 3,
5328                    token: Token::Word(Word::Word("国".to_string())),
5329                },
5330                PositionalToken {
5331                    source: uws,
5332                    offset: 6,
5333                    length: 3,
5334                    token: Token::Word(Word::Word("电".to_string())),
5335                },
5336                PositionalToken {
5337                    source: uws,
5338                    offset: 9,
5339                    length: 3,
5340                    token: Token::Word(Word::Word("视".to_string())),
5341                },
5342                PositionalToken {
5343                    source: uws,
5344                    offset: 12,
5345                    length: 3,
5346                    token: Token::Word(Word::Word("连".to_string())),
5347                },
5348                PositionalToken {
5349                    source: uws,
5350                    offset: 15,
5351                    length: 3,
5352                    token: Token::Word(Word::Word("续".to_string())),
5353                },
5354                PositionalToken {
5355                    source: uws,
5356                    offset: 18,
5357                    length: 3,
5358                    token: Token::Word(Word::Word("剧".to_string())),
5359                },
5360                PositionalToken {
5361                    source: uws,
5362                    offset: 21,
5363                    length: 3,
5364                    token: Token::Special(Special::Punctuation('《')),
5365                },
5366                PositionalToken {
5367                    source: uws,
5368                    offset: 24,
5369                    length: 3,
5370                    token: Token::Word(Word::Word("超".to_string())),
5371                },
5372                PositionalToken {
5373                    source: uws,
5374                    offset: 27,
5375                    length: 3,
5376                    token: Token::Word(Word::Word("人".to_string())),
5377                },
5378                PositionalToken {
5379                    source: uws,
5380                    offset: 30,
5381                    length: 3,
5382                    token: Token::Word(Word::Word("前".to_string())),
5383                },
5384                PositionalToken {
5385                    source: uws,
5386                    offset: 33,
5387                    length: 3,
5388                    token: Token::Word(Word::Word("传".to_string())),
5389                },
5390                PositionalToken {
5391                    source: uws,
5392                    offset: 36,
5393                    length: 3,
5394                    token: Token::Special(Special::Punctuation('》')),
5395                },
5396                PositionalToken {
5397                    source: uws,
5398                    offset: 39,
5399                    length: 3,
5400                    token: Token::Word(Word::Word("的".to_string())),
5401                },
5402                PositionalToken {
5403                    source: uws,
5404                    offset: 42,
5405                    length: 3,
5406                    token: Token::Word(Word::Word("第".to_string())),
5407                },
5408                PositionalToken {
5409                    source: uws,
5410                    offset: 45,
5411                    length: 3,
5412                    token: Token::Word(Word::Word("一".to_string())),
5413                },
5414                PositionalToken {
5415                    source: uws,
5416                    offset: 48,
5417                    length: 3,
5418                    token: Token::Word(Word::Word("集".to_string())),
5419                },
5420                PositionalToken {
5421                    source: uws,
5422                    offset: 51,
5423                    length: 3,
5424                    token: Token::Special(Special::Punctuation('《')),
5425                },
5426                PositionalToken {
5427                    source: uws,
5428                    offset: 54,
5429                    length: 3,
5430                    token: Token::Word(Word::Word("试".to_string())),
5431                },
5432                PositionalToken {
5433                    source: uws,
5434                    offset: 57,
5435                    length: 3,
5436                    token: Token::Word(Word::Word("播".to_string())),
5437                },
5438                PositionalToken {
5439                    source: uws,
5440                    offset: 60,
5441                    length: 3,
5442                    token: Token::Word(Word::Word("集".to_string())),
5443                },
5444                PositionalToken {
5445                    source: uws,
5446                    offset: 63,
5447                    length: 3,
5448                    token: Token::Special(Special::Punctuation('》')),
5449                },
5450                PositionalToken {
5451                    source: uws,
5452                    offset: 66,
5453                    length: 3,
5454                    token: Token::Word(Word::Word("于".to_string())),
5455                },
5456                PositionalToken {
5457                    source: uws,
5458                    offset: 69,
5459                    length: 4,
5460                    token: Token::Word(Word::Number(Number::Integer(2001))),
5461                },
5462                PositionalToken {
5463                    source: uws,
5464                    offset: 73,
5465                    length: 3,
5466                    token: Token::Word(Word::Word("年".to_string())),
5467                },
5468                PositionalToken {
5469                    source: uws,
5470                    offset: 76,
5471                    length: 2,
5472                    token: Token::Word(Word::Number(Number::Integer(10))),
5473                },
5474                PositionalToken {
5475                    source: uws,
5476                    offset: 78,
5477                    length: 3,
5478                    token: Token::Word(Word::Word("月".to_string())),
5479                },
5480                PositionalToken {
5481                    source: uws,
5482                    offset: 81,
5483                    length: 2,
5484                    token: Token::Word(Word::Number(Number::Integer(16))),
5485                },
5486                PositionalToken {
5487                    source: uws,
5488                    offset: 83,
5489                    length: 3,
5490                    token: Token::Word(Word::Word("日".to_string())),
5491                },
5492                PositionalToken {
5493                    source: uws,
5494                    offset: 86,
5495                    length: 3,
5496                    token: Token::Word(Word::Word("在".to_string())),
5497                },
5498                PositionalToken {
5499                    source: uws,
5500                    offset: 89,
5501                    length: 3,
5502                    token: Token::Word(Word::Word("電".to_string())),
5503                },
5504                PositionalToken {
5505                    source: uws,
5506                    offset: 92,
5507                    length: 3,
5508                    token: Token::Word(Word::Word("視".to_string())),
5509                },
5510                PositionalToken {
5511                    source: uws,
5512                    offset: 95,
5513                    length: 3,
5514                    token: Token::Word(Word::Word("網".to_string())),
5515                },
5516                PositionalToken {
5517                    source: uws,
5518                    offset: 98,
5519                    length: 3,
5520                    token: Token::Word(Word::Word("首".to_string())),
5521                },
5522                PositionalToken {
5523                    source: uws,
5524                    offset: 101,
5525                    length: 3,
5526                    token: Token::Word(Word::Word("播".to_string())),
5527                },
5528                PositionalToken {
5529                    source: uws,
5530                    offset: 104,
5531                    length: 3,
5532                    token: Token::Special(Special::Punctuation(',')),
5533                },
5534                PositionalToken {
5535                    source: uws,
5536                    offset: 107,
5537                    length: 3,
5538                    token: Token::Word(Word::Word("剧".to_string())),
5539                },
5540                PositionalToken {
5541                    source: uws,
5542                    offset: 110,
5543                    length: 3,
5544                    token: Token::Word(Word::Word("集".to_string())),
5545                },
5546                PositionalToken {
5547                    source: uws,
5548                    offset: 113,
5549                    length: 3,
5550                    token: Token::Word(Word::Word("主".to_string())),
5551                },
5552                PositionalToken {
5553                    source: uws,
5554                    offset: 116,
5555                    length: 3,
5556                    token: Token::Word(Word::Word("创".to_string())),
5557                },
5558                PositionalToken {
5559                    source: uws,
5560                    offset: 119,
5561                    length: 3,
5562                    token: Token::Word(Word::Word("人".to_string())),
5563                },
5564                PositionalToken {
5565                    source: uws,
5566                    offset: 122,
5567                    length: 3,
5568                    token: Token::Word(Word::Word("阿".to_string())),
5569                },
5570                PositionalToken {
5571                    source: uws,
5572                    offset: 125,
5573                    length: 3,
5574                    token: Token::Word(Word::Word("尔".to_string())),
5575                },
5576                PositionalToken {
5577                    source: uws,
5578                    offset: 128,
5579                    length: 3,
5580                    token: Token::Word(Word::Word("弗".to_string())),
5581                },
5582                PositionalToken {
5583                    source: uws,
5584                    offset: 131,
5585                    length: 3,
5586                    token: Token::Word(Word::Word("雷".to_string())),
5587                },
5588                PositionalToken {
5589                    source: uws,
5590                    offset: 134,
5591                    length: 3,
5592                    token: Token::Word(Word::Word("德".to_string())),
5593                },
5594                PositionalToken {
5595                    source: uws,
5596                    offset: 137,
5597                    length: 2,
5598                    token: Token::Special(Special::Punctuation('·')),
5599                },
5600                PositionalToken {
5601                    source: uws,
5602                    offset: 139,
5603                    length: 3,
5604                    token: Token::Word(Word::Word("高".to_string())),
5605                },
5606                PositionalToken {
5607                    source: uws,
5608                    offset: 142,
5609                    length: 3,
5610                    token: Token::Word(Word::Word("夫".to_string())),
5611                },
5612                PositionalToken {
5613                    source: uws,
5614                    offset: 145,
5615                    length: 3,
5616                    token: Token::Word(Word::Word("和".to_string())),
5617                },
5618                PositionalToken {
5619                    source: uws,
5620                    offset: 148,
5621                    length: 3,
5622                    token: Token::Word(Word::Word("迈".to_string())),
5623                },
5624                PositionalToken {
5625                    source: uws,
5626                    offset: 151,
5627                    length: 3,
5628                    token: Token::Word(Word::Word("尔".to_string())),
5629                },
5630                PositionalToken {
5631                    source: uws,
5632                    offset: 154,
5633                    length: 3,
5634                    token: Token::Word(Word::Word("斯".to_string())),
5635                },
5636                PositionalToken {
5637                    source: uws,
5638                    offset: 157,
5639                    length: 2,
5640                    token: Token::Special(Special::Punctuation('·')),
5641                },
5642                PositionalToken {
5643                    source: uws,
5644                    offset: 159,
5645                    length: 3,
5646                    token: Token::Word(Word::Word("米".to_string())),
5647                },
5648                PositionalToken {
5649                    source: uws,
5650                    offset: 162,
5651                    length: 3,
5652                    token: Token::Word(Word::Word("勒".to_string())),
5653                },
5654                PositionalToken {
5655                    source: uws,
5656                    offset: 165,
5657                    length: 3,
5658                    token: Token::Word(Word::Word("編".to_string())),
5659                },
5660                PositionalToken {
5661                    source: uws,
5662                    offset: 168,
5663                    length: 3,
5664                    token: Token::Word(Word::Word("劇".to_string())),
5665                },
5666                PositionalToken {
5667                    source: uws,
5668                    offset: 171,
5669                    length: 3,
5670                    token: Token::Special(Special::Punctuation(',')),
5671                },
5672                PositionalToken {
5673                    source: uws,
5674                    offset: 174,
5675                    length: 3,
5676                    token: Token::Word(Word::Word("大".to_string())),
5677                },
5678                PositionalToken {
5679                    source: uws,
5680                    offset: 177,
5681                    length: 3,
5682                    token: Token::Word(Word::Word("卫".to_string())),
5683                },
5684                PositionalToken {
5685                    source: uws,
5686                    offset: 180,
5687                    length: 2,
5688                    token: Token::Special(Special::Punctuation('·')),
5689                },
5690                PositionalToken {
5691                    source: uws,
5692                    offset: 182,
5693                    length: 3,
5694                    token: Token::Word(Word::Word("努".to_string())),
5695                },
5696                PositionalToken {
5697                    source: uws,
5698                    offset: 185,
5699                    length: 3,
5700                    token: Token::Word(Word::Word("特".to_string())),
5701                },
5702                PositionalToken {
5703                    source: uws,
5704                    offset: 188,
5705                    length: 3,
5706                    token: Token::Word(Word::Word("尔".to_string())),
5707                },
5708                PositionalToken {
5709                    source: uws,
5710                    offset: 191,
5711                    length: 3,
5712                    token: Token::Word(Word::Word("执".to_string())),
5713                },
5714                PositionalToken {
5715                    source: uws,
5716                    offset: 194,
5717                    length: 3,
5718                    token: Token::Word(Word::Word("导".to_string())),
5719                },
5720                PositionalToken {
5721                    source: uws,
5722                    offset: 197,
5723                    length: 3,
5724                    token: Token::Special(Special::Punctuation('。')),
5725                },
5726                PositionalToken {
5727                    source: uws,
5728                    offset: 200,
5729                    length: 3,
5730                    token: Token::Word(Word::Word("这".to_string())),
5731                },
5732                PositionalToken {
5733                    source: uws,
5734                    offset: 203,
5735                    length: 3,
5736                    token: Token::Word(Word::Word("一".to_string())),
5737                },
5738                PositionalToken {
5739                    source: uws,
5740                    offset: 206,
5741                    length: 3,
5742                    token: Token::Word(Word::Word("试".to_string())),
5743                },
5744                PositionalToken {
5745                    source: uws,
5746                    offset: 209,
5747                    length: 3,
5748                    token: Token::Word(Word::Word("播".to_string())),
5749                },
5750                PositionalToken {
5751                    source: uws,
5752                    offset: 212,
5753                    length: 3,
5754                    token: Token::Word(Word::Word("首".to_string())),
5755                },
5756                PositionalToken {
5757                    source: uws,
5758                    offset: 215,
5759                    length: 3,
5760                    token: Token::Word(Word::Word("次".to_string())),
5761                },
5762                PositionalToken {
5763                    source: uws,
5764                    offset: 218,
5765                    length: 3,
5766                    token: Token::Word(Word::Word("向".to_string())),
5767                },
5768                PositionalToken {
5769                    source: uws,
5770                    offset: 221,
5771                    length: 3,
5772                    token: Token::Word(Word::Word("观".to_string())),
5773                },
5774                PositionalToken {
5775                    source: uws,
5776                    offset: 224,
5777                    length: 3,
5778                    token: Token::Word(Word::Word("众".to_string())),
5779                },
5780                PositionalToken {
5781                    source: uws,
5782                    offset: 227,
5783                    length: 3,
5784                    token: Token::Word(Word::Word("引".to_string())),
5785                },
5786                PositionalToken {
5787                    source: uws,
5788                    offset: 230,
5789                    length: 3,
5790                    token: Token::Word(Word::Word("荐".to_string())),
5791                },
5792                PositionalToken {
5793                    source: uws,
5794                    offset: 233,
5795                    length: 3,
5796                    token: Token::Word(Word::Word("了".to_string())),
5797                },
5798                PositionalToken {
5799                    source: uws,
5800                    offset: 236,
5801                    length: 3,
5802                    token: Token::Word(Word::Word("克".to_string())),
5803                },
5804                PositionalToken {
5805                    source: uws,
5806                    offset: 239,
5807                    length: 3,
5808                    token: Token::Word(Word::Word("拉".to_string())),
5809                },
5810                PositionalToken {
5811                    source: uws,
5812                    offset: 242,
5813                    length: 3,
5814                    token: Token::Word(Word::Word("克".to_string())),
5815                },
5816                PositionalToken {
5817                    source: uws,
5818                    offset: 245,
5819                    length: 2,
5820                    token: Token::Special(Special::Punctuation('·')),
5821                },
5822                PositionalToken {
5823                    source: uws,
5824                    offset: 247,
5825                    length: 3,
5826                    token: Token::Word(Word::Word("肯".to_string())),
5827                },
5828                PositionalToken {
5829                    source: uws,
5830                    offset: 250,
5831                    length: 3,
5832                    token: Token::Word(Word::Word("特".to_string())),
5833                },
5834                PositionalToken {
5835                    source: uws,
5836                    offset: 253,
5837                    length: 3,
5838                    token: Token::Word(Word::Word("一".to_string())),
5839                },
5840                PositionalToken {
5841                    source: uws,
5842                    offset: 256,
5843                    length: 3,
5844                    token: Token::Word(Word::Word("角".to_string())),
5845                },
5846                PositionalToken {
5847                    source: uws,
5848                    offset: 259,
5849                    length: 3,
5850                    token: Token::Special(Special::Punctuation(',')),
5851                },
5852                PositionalToken {
5853                    source: uws,
5854                    offset: 262,
5855                    length: 3,
5856                    token: Token::Word(Word::Word("他".to_string())),
5857                },
5858                PositionalToken {
5859                    source: uws,
5860                    offset: 265,
5861                    length: 3,
5862                    token: Token::Word(Word::Word("是".to_string())),
5863                },
5864                PositionalToken {
5865                    source: uws,
5866                    offset: 268,
5867                    length: 3,
5868                    token: Token::Word(Word::Word("位".to_string())),
5869                },
5870                PositionalToken {
5871                    source: uws,
5872                    offset: 271,
5873                    length: 3,
5874                    token: Token::Word(Word::Word("拥".to_string())),
5875                },
5876                PositionalToken {
5877                    source: uws,
5878                    offset: 274,
5879                    length: 3,
5880                    token: Token::Word(Word::Word("有".to_string())),
5881                },
5882                PositionalToken {
5883                    source: uws,
5884                    offset: 277,
5885                    length: 3,
5886                    token: Token::Word(Word::Word("超".to_string())),
5887                },
5888            ],
5889            Lang::Jpn => vec![
5890                PositionalToken {
5891                    source: uws,
5892                    offset: 0,
5893                    length: 3,
5894                    token: Token::Word(Word::Word("熊".to_string())),
5895                },
5896                PositionalToken {
5897                    source: uws,
5898                    offset: 3,
5899                    length: 3,
5900                    token: Token::Word(Word::Word("野".to_string())),
5901                },
5902                PositionalToken {
5903                    source: uws,
5904                    offset: 6,
5905                    length: 3,
5906                    token: Token::Word(Word::Word("三".to_string())),
5907                },
5908                PositionalToken {
5909                    source: uws,
5910                    offset: 9,
5911                    length: 3,
5912                    token: Token::Word(Word::Word("山".to_string())),
5913                },
5914                PositionalToken {
5915                    source: uws,
5916                    offset: 12,
5917                    length: 3,
5918                    token: Token::Word(Word::Word("本".to_string())),
5919                },
5920                PositionalToken {
5921                    source: uws,
5922                    offset: 15,
5923                    length: 3,
5924                    token: Token::Word(Word::Word("願".to_string())),
5925                },
5926                PositionalToken {
5927                    source: uws,
5928                    offset: 18,
5929                    length: 3,
5930                    token: Token::Word(Word::Word("所".to_string())),
5931                },
5932                PositionalToken {
5933                    source: uws,
5934                    offset: 21,
5935                    length: 3,
5936                    token: Token::Word(Word::Word("は".to_string())),
5937                },
5938                PositionalToken {
5939                    source: uws,
5940                    offset: 24,
5941                    length: 3,
5942                    token: Token::Special(Special::Punctuation('、')),
5943                },
5944                PositionalToken {
5945                    source: uws,
5946                    offset: 27,
5947                    length: 2,
5948                    token: Token::Word(Word::Number(Number::Integer(15))),
5949                },
5950                PositionalToken {
5951                    source: uws,
5952                    offset: 29,
5953                    length: 3,
5954                    token: Token::Word(Word::Word("世".to_string())),
5955                },
5956                PositionalToken {
5957                    source: uws,
5958                    offset: 32,
5959                    length: 3,
5960                    token: Token::Word(Word::Word("紀".to_string())),
5961                },
5962                PositionalToken {
5963                    source: uws,
5964                    offset: 35,
5965                    length: 3,
5966                    token: Token::Word(Word::Word("末".to_string())),
5967                },
5968                PositionalToken {
5969                    source: uws,
5970                    offset: 38,
5971                    length: 3,
5972                    token: Token::Word(Word::Word("以".to_string())),
5973                },
5974                PositionalToken {
5975                    source: uws,
5976                    offset: 41,
5977                    length: 3,
5978                    token: Token::Word(Word::Word("降".to_string())),
5979                },
5980                PositionalToken {
5981                    source: uws,
5982                    offset: 44,
5983                    length: 3,
5984                    token: Token::Word(Word::Word("に".to_string())),
5985                },
5986                PositionalToken {
5987                    source: uws,
5988                    offset: 47,
5989                    length: 3,
5990                    token: Token::Word(Word::Word("お".to_string())),
5991                },
5992                PositionalToken {
5993                    source: uws,
5994                    offset: 50,
5995                    length: 3,
5996                    token: Token::Word(Word::Word("け".to_string())),
5997                },
5998                PositionalToken {
5999                    source: uws,
6000                    offset: 53,
6001                    length: 3,
6002                    token: Token::Word(Word::Word("る".to_string())),
6003                },
6004                PositionalToken {
6005                    source: uws,
6006                    offset: 56,
6007                    length: 3,
6008                    token: Token::Word(Word::Word("熊".to_string())),
6009                },
6010                PositionalToken {
6011                    source: uws,
6012                    offset: 59,
6013                    length: 3,
6014                    token: Token::Word(Word::Word("野".to_string())),
6015                },
6016                PositionalToken {
6017                    source: uws,
6018                    offset: 62,
6019                    length: 3,
6020                    token: Token::Word(Word::Word("三".to_string())),
6021                },
6022                PositionalToken {
6023                    source: uws,
6024                    offset: 65,
6025                    length: 3,
6026                    token: Token::Word(Word::Word("山".to_string())),
6027                },
6028                PositionalToken {
6029                    source: uws,
6030                    offset: 68,
6031                    length: 3,
6032                    token: Token::Special(Special::Punctuation('(')),
6033                },
6034                PositionalToken {
6035                    source: uws,
6036                    offset: 71,
6037                    length: 3,
6038                    token: Token::Word(Word::Word("熊".to_string())),
6039                },
6040                PositionalToken {
6041                    source: uws,
6042                    offset: 74,
6043                    length: 3,
6044                    token: Token::Word(Word::Word("野".to_string())),
6045                },
6046                PositionalToken {
6047                    source: uws,
6048                    offset: 77,
6049                    length: 3,
6050                    token: Token::Word(Word::Word("本".to_string())),
6051                },
6052                PositionalToken {
6053                    source: uws,
6054                    offset: 80,
6055                    length: 3,
6056                    token: Token::Word(Word::Word("宮".to_string())),
6057                },
6058                PositionalToken {
6059                    source: uws,
6060                    offset: 83,
6061                    length: 3,
6062                    token: Token::Special(Special::Punctuation('、')),
6063                },
6064                PositionalToken {
6065                    source: uws,
6066                    offset: 86,
6067                    length: 3,
6068                    token: Token::Word(Word::Word("熊".to_string())),
6069                },
6070                PositionalToken {
6071                    source: uws,
6072                    offset: 89,
6073                    length: 3,
6074                    token: Token::Word(Word::Word("野".to_string())),
6075                },
6076                PositionalToken {
6077                    source: uws,
6078                    offset: 92,
6079                    length: 3,
6080                    token: Token::Word(Word::Word("新".to_string())),
6081                },
6082                PositionalToken {
6083                    source: uws,
6084                    offset: 95,
6085                    length: 3,
6086                    token: Token::Word(Word::Word("宮".to_string())),
6087                },
6088                PositionalToken {
6089                    source: uws,
6090                    offset: 98,
6091                    length: 3,
6092                    token: Token::Special(Special::Punctuation('、')),
6093                },
6094                PositionalToken {
6095                    source: uws,
6096                    offset: 101,
6097                    length: 3,
6098                    token: Token::Word(Word::Word("熊".to_string())),
6099                },
6100                PositionalToken {
6101                    source: uws,
6102                    offset: 104,
6103                    length: 3,
6104                    token: Token::Word(Word::Word("野".to_string())),
6105                },
6106                PositionalToken {
6107                    source: uws,
6108                    offset: 107,
6109                    length: 3,
6110                    token: Token::Word(Word::Word("那".to_string())),
6111                },
6112                PositionalToken {
6113                    source: uws,
6114                    offset: 110,
6115                    length: 3,
6116                    token: Token::Word(Word::Word("智".to_string())),
6117                },
6118                PositionalToken {
6119                    source: uws,
6120                    offset: 113,
6121                    length: 3,
6122                    token: Token::Special(Special::Punctuation(')')),
6123                },
6124                PositionalToken {
6125                    source: uws,
6126                    offset: 116,
6127                    length: 3,
6128                    token: Token::Word(Word::Word("の".to_string())),
6129                },
6130                PositionalToken {
6131                    source: uws,
6132                    offset: 119,
6133                    length: 3,
6134                    token: Token::Word(Word::Word("造".to_string())),
6135                },
6136                PositionalToken {
6137                    source: uws,
6138                    offset: 122,
6139                    length: 3,
6140                    token: Token::Word(Word::Word("営".to_string())),
6141                },
6142                PositionalToken {
6143                    source: uws,
6144                    offset: 125,
6145                    length: 3,
6146                    token: Token::Special(Special::Punctuation('・')),
6147                },
6148                PositionalToken {
6149                    source: uws,
6150                    offset: 128,
6151                    length: 3,
6152                    token: Token::Word(Word::Word("修".to_string())),
6153                },
6154                PositionalToken {
6155                    source: uws,
6156                    offset: 131,
6157                    length: 3,
6158                    token: Token::Word(Word::Word("造".to_string())),
6159                },
6160                PositionalToken {
6161                    source: uws,
6162                    offset: 134,
6163                    length: 3,
6164                    token: Token::Word(Word::Word("の".to_string())),
6165                },
6166                PositionalToken {
6167                    source: uws,
6168                    offset: 137,
6169                    length: 3,
6170                    token: Token::Word(Word::Word("た".to_string())),
6171                },
6172                PositionalToken {
6173                    source: uws,
6174                    offset: 140,
6175                    length: 3,
6176                    token: Token::Word(Word::Word("め".to_string())),
6177                },
6178                PositionalToken {
6179                    source: uws,
6180                    offset: 143,
6181                    length: 3,
6182                    token: Token::Word(Word::Word("の".to_string())),
6183                },
6184                PositionalToken {
6185                    source: uws,
6186                    offset: 146,
6187                    length: 3,
6188                    token: Token::Word(Word::Word("勧".to_string())),
6189                },
6190                PositionalToken {
6191                    source: uws,
6192                    offset: 149,
6193                    length: 3,
6194                    token: Token::Word(Word::Word("進".to_string())),
6195                },
6196                PositionalToken {
6197                    source: uws,
6198                    offset: 152,
6199                    length: 3,
6200                    token: Token::Word(Word::Word("を".to_string())),
6201                },
6202                PositionalToken {
6203                    source: uws,
6204                    offset: 155,
6205                    length: 3,
6206                    token: Token::Word(Word::Word("担".to_string())),
6207                },
6208                PositionalToken {
6209                    source: uws,
6210                    offset: 158,
6211                    length: 3,
6212                    token: Token::Word(Word::Word("っ".to_string())),
6213                },
6214                PositionalToken {
6215                    source: uws,
6216                    offset: 161,
6217                    length: 3,
6218                    token: Token::Word(Word::Word("た".to_string())),
6219                },
6220                PositionalToken {
6221                    source: uws,
6222                    offset: 164,
6223                    length: 3,
6224                    token: Token::Word(Word::Word("組".to_string())),
6225                },
6226                PositionalToken {
6227                    source: uws,
6228                    offset: 167,
6229                    length: 3,
6230                    token: Token::Word(Word::Word("織".to_string())),
6231                },
6232                PositionalToken {
6233                    source: uws,
6234                    offset: 170,
6235                    length: 3,
6236                    token: Token::Word(Word::Word("の".to_string())),
6237                },
6238                PositionalToken {
6239                    source: uws,
6240                    offset: 173,
6241                    length: 3,
6242                    token: Token::Word(Word::Word("総".to_string())),
6243                },
6244                PositionalToken {
6245                    source: uws,
6246                    offset: 176,
6247                    length: 3,
6248                    token: Token::Word(Word::Word("称".to_string())),
6249                },
6250                PositionalToken {
6251                    source: uws,
6252                    offset: 179,
6253                    length: 3,
6254                    token: Token::Special(Special::Punctuation('。')),
6255                },
6256                PositionalToken {
6257                    source: uws,
6258                    offset: 182,
6259                    length: 1,
6260                    token: Token::Special(Special::Separator(Separator::Space)),
6261                },
6262                PositionalToken {
6263                    source: uws,
6264                    offset: 183,
6265                    length: 3,
6266                    token: Token::Word(Word::Word("熊".to_string())),
6267                },
6268                PositionalToken {
6269                    source: uws,
6270                    offset: 186,
6271                    length: 3,
6272                    token: Token::Word(Word::Word("野".to_string())),
6273                },
6274                PositionalToken {
6275                    source: uws,
6276                    offset: 189,
6277                    length: 3,
6278                    token: Token::Word(Word::Word("三".to_string())),
6279                },
6280                PositionalToken {
6281                    source: uws,
6282                    offset: 192,
6283                    length: 3,
6284                    token: Token::Word(Word::Word("山".to_string())),
6285                },
6286                PositionalToken {
6287                    source: uws,
6288                    offset: 195,
6289                    length: 3,
6290                    token: Token::Word(Word::Word("を".to_string())),
6291                },
6292                PositionalToken {
6293                    source: uws,
6294                    offset: 198,
6295                    length: 3,
6296                    token: Token::Word(Word::Word("含".to_string())),
6297                },
6298                PositionalToken {
6299                    source: uws,
6300                    offset: 201,
6301                    length: 3,
6302                    token: Token::Word(Word::Word("め".to_string())),
6303                },
6304                PositionalToken {
6305                    source: uws,
6306                    offset: 204,
6307                    length: 3,
6308                    token: Token::Word(Word::Word("て".to_string())),
6309                },
6310                PositionalToken {
6311                    source: uws,
6312                    offset: 207,
6313                    length: 3,
6314                    token: Token::Special(Special::Punctuation('、')),
6315                },
6316                PositionalToken {
6317                    source: uws,
6318                    offset: 210,
6319                    length: 3,
6320                    token: Token::Word(Word::Word("日".to_string())),
6321                },
6322                PositionalToken {
6323                    source: uws,
6324                    offset: 213,
6325                    length: 3,
6326                    token: Token::Word(Word::Word("本".to_string())),
6327                },
6328                PositionalToken {
6329                    source: uws,
6330                    offset: 216,
6331                    length: 3,
6332                    token: Token::Word(Word::Word("に".to_string())),
6333                },
6334                PositionalToken {
6335                    source: uws,
6336                    offset: 219,
6337                    length: 3,
6338                    token: Token::Word(Word::Word("お".to_string())),
6339                },
6340                PositionalToken {
6341                    source: uws,
6342                    offset: 222,
6343                    length: 3,
6344                    token: Token::Word(Word::Word("け".to_string())),
6345                },
6346                PositionalToken {
6347                    source: uws,
6348                    offset: 225,
6349                    length: 3,
6350                    token: Token::Word(Word::Word("る".to_string())),
6351                },
6352                PositionalToken {
6353                    source: uws,
6354                    offset: 228,
6355                    length: 3,
6356                    token: Token::Word(Word::Word("古".to_string())),
6357                },
6358                PositionalToken {
6359                    source: uws,
6360                    offset: 231,
6361                    length: 3,
6362                    token: Token::Word(Word::Word("代".to_string())),
6363                },
6364                PositionalToken {
6365                    source: uws,
6366                    offset: 234,
6367                    length: 3,
6368                    token: Token::Word(Word::Word("か".to_string())),
6369                },
6370                PositionalToken {
6371                    source: uws,
6372                    offset: 237,
6373                    length: 3,
6374                    token: Token::Word(Word::Word("ら".to_string())),
6375                },
6376                PositionalToken {
6377                    source: uws,
6378                    offset: 240,
6379                    length: 3,
6380                    token: Token::Word(Word::Word("中".to_string())),
6381                },
6382                PositionalToken {
6383                    source: uws,
6384                    offset: 243,
6385                    length: 3,
6386                    token: Token::Word(Word::Word("世".to_string())),
6387                },
6388                PositionalToken {
6389                    source: uws,
6390                    offset: 246,
6391                    length: 3,
6392                    token: Token::Word(Word::Word("前".to_string())),
6393                },
6394                PositionalToken {
6395                    source: uws,
6396                    offset: 249,
6397                    length: 3,
6398                    token: Token::Word(Word::Word("半".to_string())),
6399                },
6400                PositionalToken {
6401                    source: uws,
6402                    offset: 252,
6403                    length: 3,
6404                    token: Token::Word(Word::Word("に".to_string())),
6405                },
6406                PositionalToken {
6407                    source: uws,
6408                    offset: 255,
6409                    length: 3,
6410                    token: Token::Word(Word::Word("か".to_string())),
6411                },
6412                PositionalToken {
6413                    source: uws,
6414                    offset: 258,
6415                    length: 3,
6416                    token: Token::Word(Word::Word("け".to_string())),
6417                },
6418                PositionalToken {
6419                    source: uws,
6420                    offset: 261,
6421                    length: 3,
6422                    token: Token::Word(Word::Word("て".to_string())),
6423                },
6424                PositionalToken {
6425                    source: uws,
6426                    offset: 264,
6427                    length: 3,
6428                    token: Token::Word(Word::Word("の".to_string())),
6429                },
6430                PositionalToken {
6431                    source: uws,
6432                    offset: 267,
6433                    length: 3,
6434                    token: Token::Word(Word::Word("寺".to_string())),
6435                },
6436                PositionalToken {
6437                    source: uws,
6438                    offset: 270,
6439                    length: 3,
6440                    token: Token::Word(Word::Word("社".to_string())),
6441                },
6442                PositionalToken {
6443                    source: uws,
6444                    offset: 273,
6445                    length: 3,
6446                    token: Token::Word(Word::Word("の".to_string())),
6447                },
6448                PositionalToken {
6449                    source: uws,
6450                    offset: 276,
6451                    length: 3,
6452                    token: Token::Word(Word::Word("造".to_string())),
6453                },
6454                PositionalToken {
6455                    source: uws,
6456                    offset: 279,
6457                    length: 3,
6458                    token: Token::Word(Word::Word("営".to_string())),
6459                },
6460                PositionalToken {
6461                    source: uws,
6462                    offset: 282,
6463                    length: 3,
6464                    token: Token::Word(Word::Word("は".to_string())),
6465                },
6466                PositionalToken {
6467                    source: uws,
6468                    offset: 285,
6469                    length: 3,
6470                    token: Token::Special(Special::Punctuation('、')),
6471                },
6472                PositionalToken {
6473                    source: uws,
6474                    offset: 288,
6475                    length: 3,
6476                    token: Token::Word(Word::Word("寺".to_string())),
6477                },
6478                PositionalToken {
6479                    source: uws,
6480                    offset: 291,
6481                    length: 3,
6482                    token: Token::Word(Word::Word("社".to_string())),
6483                },
6484            ],
6485            Lang::Kor => vec![
6486                PositionalToken {
6487                    source: uws,
6488                    offset: 0,
6489                    length: 21,
6490                    token: Token::Word(Word::Word("플레이스테이션".to_string())),
6491                },
6492                PositionalToken {
6493                    source: uws,
6494                    offset: 21,
6495                    length: 1,
6496                    token: Token::Special(Special::Separator(Separator::Space)),
6497                },
6498                PositionalToken {
6499                    source: uws,
6500                    offset: 22,
6501                    length: 3,
6502                    token: Token::Word(Word::Word("은".to_string())),
6503                },
6504                PositionalToken {
6505                    source: uws,
6506                    offset: 25,
6507                    length: 1,
6508                    token: Token::Special(Special::Separator(Separator::Space)),
6509                },
6510                PositionalToken {
6511                    source: uws,
6512                    offset: 26,
6513                    length: 6,
6514                    token: Token::Word(Word::Word("소니".to_string())),
6515                },
6516                PositionalToken {
6517                    source: uws,
6518                    offset: 32,
6519                    length: 1,
6520                    token: Token::Special(Special::Separator(Separator::Space)),
6521                },
6522                PositionalToken {
6523                    source: uws,
6524                    offset: 33,
6525                    length: 9,
6526                    token: Token::Word(Word::Word("컴퓨터".to_string())),
6527                },
6528                PositionalToken {
6529                    source: uws,
6530                    offset: 42,
6531                    length: 1,
6532                    token: Token::Special(Special::Separator(Separator::Space)),
6533                },
6534                PositionalToken {
6535                    source: uws,
6536                    offset: 43,
6537                    length: 21,
6538                    token: Token::Word(Word::Word("엔터테인먼트가".to_string())),
6539                },
6540                PositionalToken {
6541                    source: uws,
6542                    offset: 64,
6543                    length: 1,
6544                    token: Token::Special(Special::Separator(Separator::Space)),
6545                },
6546                PositionalToken {
6547                    source: uws,
6548                    offset: 65,
6549                    length: 9,
6550                    token: Token::Word(Word::Word("개발한".to_string())),
6551                },
6552                PositionalToken {
6553                    source: uws,
6554                    offset: 74,
6555                    length: 1,
6556                    token: Token::Special(Special::Separator(Separator::Space)),
6557                },
6558                PositionalToken {
6559                    source: uws,
6560                    offset: 75,
6561                    length: 3,
6562                    token: Token::Word(Word::Word("세".to_string())),
6563                },
6564                PositionalToken {
6565                    source: uws,
6566                    offset: 78,
6567                    length: 1,
6568                    token: Token::Special(Special::Separator(Separator::Space)),
6569                },
6570                PositionalToken {
6571                    source: uws,
6572                    offset: 79,
6573                    length: 6,
6574                    token: Token::Word(Word::Word("번째".to_string())),
6575                },
6576                PositionalToken {
6577                    source: uws,
6578                    offset: 85,
6579                    length: 1,
6580                    token: Token::Special(Special::Separator(Separator::Space)),
6581                },
6582                PositionalToken {
6583                    source: uws,
6584                    offset: 86,
6585                    length: 9,
6586                    token: Token::Word(Word::Word("가정용".to_string())),
6587                },
6588                PositionalToken {
6589                    source: uws,
6590                    offset: 95,
6591                    length: 1,
6592                    token: Token::Special(Special::Separator(Separator::Space)),
6593                },
6594                PositionalToken {
6595                    source: uws,
6596                    offset: 96,
6597                    length: 15,
6598                    token: Token::Word(Word::Word("게임기이다".to_string())),
6599                },
6600                PositionalToken {
6601                    source: uws,
6602                    offset: 111,
6603                    length: 1,
6604                    token: Token::Special(Special::Punctuation('.')),
6605                },
6606                PositionalToken {
6607                    source: uws,
6608                    offset: 112,
6609                    length: 1,
6610                    token: Token::Special(Special::Separator(Separator::Space)),
6611                },
6612                PositionalToken {
6613                    source: uws,
6614                    offset: 113,
6615                    length: 24,
6616                    token: Token::Word(Word::Word("마이크로소프트의".to_string())),
6617                },
6618                PositionalToken {
6619                    source: uws,
6620                    offset: 137,
6621                    length: 1,
6622                    token: Token::Special(Special::Separator(Separator::Space)),
6623                },
6624                PositionalToken {
6625                    source: uws,
6626                    offset: 138,
6627                    length: 12,
6628                    token: Token::Word(Word::Word("엑스박스".to_string())),
6629                },
6630                PositionalToken {
6631                    source: uws,
6632                    offset: 150,
6633                    length: 1,
6634                    token: Token::Special(Special::Separator(Separator::Space)),
6635                },
6636                PositionalToken {
6637                    source: uws,
6638                    offset: 151,
6639                    length: 3,
6640                    token: Token::Word(Word::Number(Number::Integer(360))),
6641                },
6642                PositionalToken {
6643                    source: uws,
6644                    offset: 154,
6645                    length: 1,
6646                    token: Token::Special(Special::Punctuation(',')),
6647                },
6648                PositionalToken {
6649                    source: uws,
6650                    offset: 155,
6651                    length: 1,
6652                    token: Token::Special(Special::Separator(Separator::Space)),
6653                },
6654                PositionalToken {
6655                    source: uws,
6656                    offset: 156,
6657                    length: 12,
6658                    token: Token::Word(Word::Word("닌텐도의".to_string())),
6659                },
6660                PositionalToken {
6661                    source: uws,
6662                    offset: 168,
6663                    length: 1,
6664                    token: Token::Special(Special::Separator(Separator::Space)),
6665                },
6666                PositionalToken {
6667                    source: uws,
6668                    offset: 169,
6669                    length: 6,
6670                    token: Token::Word(Word::Word("Wii와".to_string())),
6671                },
6672                PositionalToken {
6673                    source: uws,
6674                    offset: 175,
6675                    length: 1,
6676                    token: Token::Special(Special::Separator(Separator::Space)),
6677                },
6678                PositionalToken {
6679                    source: uws,
6680                    offset: 176,
6681                    length: 12,
6682                    token: Token::Word(Word::Word("경쟁하고".to_string())),
6683                },
6684                PositionalToken {
6685                    source: uws,
6686                    offset: 188,
6687                    length: 1,
6688                    token: Token::Special(Special::Separator(Separator::Space)),
6689                },
6690                PositionalToken {
6691                    source: uws,
6692                    offset: 189,
6693                    length: 6,
6694                    token: Token::Word(Word::Word("있다".to_string())),
6695                },
6696                PositionalToken {
6697                    source: uws,
6698                    offset: 195,
6699                    length: 1,
6700                    token: Token::Special(Special::Punctuation('.')),
6701                },
6702                PositionalToken {
6703                    source: uws,
6704                    offset: 196,
6705                    length: 1,
6706                    token: Token::Special(Special::Separator(Separator::Space)),
6707                },
6708                PositionalToken {
6709                    source: uws,
6710                    offset: 197,
6711                    length: 6,
6712                    token: Token::Word(Word::Word("이전".to_string())),
6713                },
6714                PositionalToken {
6715                    source: uws,
6716                    offset: 203,
6717                    length: 1,
6718                    token: Token::Special(Special::Separator(Separator::Space)),
6719                },
6720                PositionalToken {
6721                    source: uws,
6722                    offset: 204,
6723                    length: 12,
6724                    token: Token::Word(Word::Word("제품에서".to_string())),
6725                },
6726                PositionalToken {
6727                    source: uws,
6728                    offset: 216,
6729                    length: 1,
6730                    token: Token::Special(Special::Separator(Separator::Space)),
6731                },
6732                PositionalToken {
6733                    source: uws,
6734                    offset: 217,
6735                    length: 9,
6736                    token: Token::Word(Word::Word("온라인".to_string())),
6737                },
6738                PositionalToken {
6739                    source: uws,
6740                    offset: 226,
6741                    length: 1,
6742                    token: Token::Special(Special::Separator(Separator::Space)),
6743                },
6744                PositionalToken {
6745                    source: uws,
6746                    offset: 227,
6747                    length: 9,
6748                    token: Token::Word(Word::Word("플레이".to_string())),
6749                },
6750                PositionalToken {
6751                    source: uws,
6752                    offset: 236,
6753                    length: 1,
6754                    token: Token::Special(Special::Separator(Separator::Space)),
6755                },
6756                PositionalToken {
6757                    source: uws,
6758                    offset: 237,
6759                    length: 3,
6760                    token: Token::Word(Word::Word("기".to_string())),
6761                },
6762            ],
6763            Lang::Ara => vec![
6764                PositionalToken {
6765                    source: uws,
6766                    offset: 0,
6767                    length: 14,
6768                    token: Token::Word(Word::Word("لشکرکشی".to_string())),
6769                },
6770                PositionalToken {
6771                    source: uws,
6772                    offset: 14,
6773                    length: 3,
6774                    token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6775                },
6776                PositionalToken {
6777                    source: uws,
6778                    offset: 17,
6779                    length: 6,
6780                    token: Token::Word(Word::Word("های".to_string())),
6781                },
6782                PositionalToken {
6783                    source: uws,
6784                    offset: 23,
6785                    length: 1,
6786                    token: Token::Special(Special::Separator(Separator::Space)),
6787                },
6788                PositionalToken {
6789                    source: uws,
6790                    offset: 24,
6791                    length: 6,
6792                    token: Token::Word(Word::Word("روس".to_string())),
6793                },
6794                PositionalToken {
6795                    source: uws,
6796                    offset: 30,
6797                    length: 3,
6798                    token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6799                },
6800                PositionalToken {
6801                    source: uws,
6802                    offset: 33,
6803                    length: 6,
6804                    token: Token::Word(Word::Word("های".to_string())),
6805                },
6806                PositionalToken {
6807                    source: uws,
6808                    offset: 39,
6809                    length: 1,
6810                    token: Token::Special(Special::Separator(Separator::Space)),
6811                },
6812                PositionalToken {
6813                    source: uws,
6814                    offset: 40,
6815                    length: 12,
6816                    token: Token::Word(Word::Word("وارنگی".to_string())),
6817                },
6818                PositionalToken {
6819                    source: uws,
6820                    offset: 52,
6821                    length: 1,
6822                    token: Token::Special(Special::Separator(Separator::Space)),
6823                },
6824                PositionalToken {
6825                    source: uws,
6826                    offset: 53,
6827                    length: 4,
6828                    token: Token::Word(Word::Word("به".to_string())),
6829                },
6830                PositionalToken {
6831                    source: uws,
6832                    offset: 57,
6833                    length: 1,
6834                    token: Token::Special(Special::Separator(Separator::Space)),
6835                },
6836                PositionalToken {
6837                    source: uws,
6838                    offset: 58,
6839                    length: 10,
6840                    token: Token::Word(Word::Word("دریای".to_string())),
6841                },
6842                PositionalToken {
6843                    source: uws,
6844                    offset: 68,
6845                    length: 1,
6846                    token: Token::Special(Special::Separator(Separator::Space)),
6847                },
6848                PositionalToken {
6849                    source: uws,
6850                    offset: 69,
6851                    length: 6,
6852                    token: Token::Word(Word::Word("خزر".to_string())),
6853                },
6854                PositionalToken {
6855                    source: uws,
6856                    offset: 75,
6857                    length: 1,
6858                    token: Token::Special(Special::Separator(Separator::Space)),
6859                },
6860                PositionalToken {
6861                    source: uws,
6862                    offset: 76,
6863                    length: 12,
6864                    token: Token::Word(Word::Word("مجموعه".to_string())),
6865                },
6866                PositionalToken {
6867                    source: uws,
6868                    offset: 88,
6869                    length: 3,
6870                    token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6871                },
6872                PositionalToken {
6873                    source: uws,
6874                    offset: 91,
6875                    length: 4,
6876                    token: Token::Word(Word::Word("ای".to_string())),
6877                },
6878                PositionalToken {
6879                    source: uws,
6880                    offset: 95,
6881                    length: 1,
6882                    token: Token::Special(Special::Separator(Separator::Space)),
6883                },
6884                PositionalToken {
6885                    source: uws,
6886                    offset: 96,
6887                    length: 4,
6888                    token: Token::Word(Word::Word("از".to_string())),
6889                },
6890                PositionalToken {
6891                    source: uws,
6892                    offset: 100,
6893                    length: 1,
6894                    token: Token::Special(Special::Separator(Separator::Space)),
6895                },
6896                PositionalToken {
6897                    source: uws,
6898                    offset: 101,
6899                    length: 10,
6900                    token: Token::Word(Word::Word("حملات".to_string())),
6901                },
6902                PositionalToken {
6903                    source: uws,
6904                    offset: 111,
6905                    length: 1,
6906                    token: Token::Special(Special::Separator(Separator::Space)),
6907                },
6908                PositionalToken {
6909                    source: uws,
6910                    offset: 112,
6911                    length: 10,
6912                    token: Token::Word(Word::Word("نظامی".to_string())),
6913                },
6914                PositionalToken {
6915                    source: uws,
6916                    offset: 122,
6917                    length: 1,
6918                    token: Token::Special(Special::Separator(Separator::Space)),
6919                },
6920                PositionalToken {
6921                    source: uws,
6922                    offset: 123,
6923                    length: 4,
6924                    token: Token::Word(Word::Word("در".to_string())),
6925                },
6926                PositionalToken {
6927                    source: uws,
6928                    offset: 127,
6929                    length: 1,
6930                    token: Token::Special(Special::Separator(Separator::Space)),
6931                },
6932                PositionalToken {
6933                    source: uws,
6934                    offset: 128,
6935                    length: 6,
6936                    token: Token::Word(Word::Word("بین".to_string())),
6937                },
6938                PositionalToken {
6939                    source: uws,
6940                    offset: 134,
6941                    length: 1,
6942                    token: Token::Special(Special::Separator(Separator::Space)),
6943                },
6944                PositionalToken {
6945                    source: uws,
6946                    offset: 135,
6947                    length: 6,
6948                    token: Token::Word(Word::Word("سال".to_string())),
6949                },
6950                PositionalToken {
6951                    source: uws,
6952                    offset: 141,
6953                    length: 3,
6954                    token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6955                },
6956                PositionalToken {
6957                    source: uws,
6958                    offset: 144,
6959                    length: 6,
6960                    token: Token::Word(Word::Word("های".to_string())),
6961                },
6962                PositionalToken {
6963                    source: uws,
6964                    offset: 150,
6965                    length: 1,
6966                    token: Token::Special(Special::Separator(Separator::Space)),
6967                },
6968                PositionalToken {
6969                    source: uws,
6970                    offset: 151,
6971                    length: 6,
6972                    token: Token::Word(Word::StrangeWord("۸۶۴".to_string())),
6973                },
6974                PositionalToken {
6975                    source: uws,
6976                    offset: 157,
6977                    length: 1,
6978                    token: Token::Special(Special::Separator(Separator::Space)),
6979                },
6980                PositionalToken {
6981                    source: uws,
6982                    offset: 158,
6983                    length: 4,
6984                    token: Token::Word(Word::Word("تا".to_string())),
6985                },
6986                PositionalToken {
6987                    source: uws,
6988                    offset: 162,
6989                    length: 1,
6990                    token: Token::Special(Special::Separator(Separator::Space)),
6991                },
6992                PositionalToken {
6993                    source: uws,
6994                    offset: 163,
6995                    length: 8,
6996                    token: Token::Word(Word::StrangeWord("۱۰۴۱".to_string())),
6997                },
6998                PositionalToken {
6999                    source: uws,
7000                    offset: 171,
7001                    length: 1,
7002                    token: Token::Special(Special::Separator(Separator::Space)),
7003                },
7004                PositionalToken {
7005                    source: uws,
7006                    offset: 172,
7007                    length: 12,
7008                    token: Token::Word(Word::Word("میلادی".to_string())),
7009                },
7010                PositionalToken {
7011                    source: uws,
7012                    offset: 184,
7013                    length: 1,
7014                    token: Token::Special(Special::Separator(Separator::Space)),
7015                },
7016                PositionalToken {
7017                    source: uws,
7018                    offset: 185,
7019                    length: 2,
7020                    token: Token::Word(Word::Word("ب".to_string())),
7021                },
7022            ],
7023            Lang::Ell => vec![
7024                PositionalToken {
7025                    source: uws,
7026                    offset: 0,
7027                    length: 4,
7028                    token: Token::Word(Word::Word("Το".to_string())),
7029                },
7030                PositionalToken {
7031                    source: uws,
7032                    offset: 4,
7033                    length: 1,
7034                    token: Token::Special(Special::Separator(Separator::Space)),
7035                },
7036                PositionalToken {
7037                    source: uws,
7038                    offset: 5,
7039                    length: 18,
7040                    token: Token::Word(Word::Word("Πρόγραμμα".to_string())),
7041                },
7042                PositionalToken {
7043                    source: uws,
7044                    offset: 23,
7045                    length: 1,
7046                    token: Token::Special(Special::Separator(Separator::Space)),
7047                },
7048                PositionalToken {
7049                    source: uws,
7050                    offset: 24,
7051                    length: 22,
7052                    token: Token::Word(Word::Word("υλοποιείται".to_string())),
7053                },
7054                PositionalToken {
7055                    source: uws,
7056                    offset: 46,
7057                    length: 1,
7058                    token: Token::Special(Special::Separator(Separator::Space)),
7059                },
7060                PositionalToken {
7061                    source: uws,
7062                    offset: 47,
7063                    length: 4,
7064                    token: Token::Word(Word::Word("εξ".to_string())),
7065                },
7066                PositionalToken {
7067                    source: uws,
7068                    offset: 51,
7069                    length: 1,
7070                    token: Token::Special(Special::Separator(Separator::Space)),
7071                },
7072                PositionalToken {
7073                    source: uws,
7074                    offset: 52,
7075                    length: 18,
7076                    token: Token::Word(Word::Word("ολοκλήρου".to_string())),
7077                },
7078                PositionalToken {
7079                    source: uws,
7080                    offset: 70,
7081                    length: 1,
7082                    token: Token::Special(Special::Separator(Separator::Space)),
7083                },
7084                PositionalToken {
7085                    source: uws,
7086                    offset: 71,
7087                    length: 6,
7088                    token: Token::Word(Word::Word("από".to_string())),
7089                },
7090                PositionalToken {
7091                    source: uws,
7092                    offset: 77,
7093                    length: 1,
7094                    token: Token::Special(Special::Separator(Separator::Space)),
7095                },
7096                PositionalToken {
7097                    source: uws,
7098                    offset: 78,
7099                    length: 16,
7100                    token: Token::Word(Word::Word("απόσταση".to_string())),
7101                },
7102                PositionalToken {
7103                    source: uws,
7104                    offset: 94,
7105                    length: 1,
7106                    token: Token::Special(Special::Separator(Separator::Space)),
7107                },
7108                PositionalToken {
7109                    source: uws,
7110                    offset: 95,
7111                    length: 6,
7112                    token: Token::Word(Word::Word("και".to_string())),
7113                },
7114                PositionalToken {
7115                    source: uws,
7116                    offset: 101,
7117                    length: 1,
7118                    token: Token::Special(Special::Separator(Separator::Space)),
7119                },
7120                PositionalToken {
7121                    source: uws,
7122                    offset: 102,
7123                    length: 12,
7124                    token: Token::Word(Word::Word("μπορεί".to_string())),
7125                },
7126                PositionalToken {
7127                    source: uws,
7128                    offset: 114,
7129                    length: 1,
7130                    token: Token::Special(Special::Separator(Separator::Space)),
7131                },
7132                PositionalToken {
7133                    source: uws,
7134                    offset: 115,
7135                    length: 4,
7136                    token: Token::Word(Word::Word("να".to_string())),
7137                },
7138                PositionalToken {
7139                    source: uws,
7140                    offset: 119,
7141                    length: 1,
7142                    token: Token::Special(Special::Separator(Separator::Space)),
7143                },
7144                PositionalToken {
7145                    source: uws,
7146                    offset: 120,
7147                    length: 20,
7148                    token: Token::Word(Word::Word("συμμετέχει".to_string())),
7149                },
7150                PositionalToken {
7151                    source: uws,
7152                    offset: 140,
7153                    length: 1,
7154                    token: Token::Special(Special::Separator(Separator::Space)),
7155                },
7156                PositionalToken {
7157                    source: uws,
7158                    offset: 141,
7159                    length: 8,
7160                    token: Token::Word(Word::Word("κάθε".to_string())),
7161                },
7162                PositionalToken {
7163                    source: uws,
7164                    offset: 149,
7165                    length: 1,
7166                    token: Token::Special(Special::Separator(Separator::Space)),
7167                },
7168                PositionalToken {
7169                    source: uws,
7170                    offset: 150,
7171                    length: 24,
7172                    token: Token::Word(Word::Word("εμπλεκόμενος".to_string())),
7173                },
7174                PositionalToken {
7175                    source: uws,
7176                    offset: 174,
7177                    length: 1,
7178                    token: Token::Special(Special::Separator(Separator::Space)),
7179                },
7180                PositionalToken {
7181                    source: uws,
7182                    offset: 175,
7183                    length: 6,
7184                    token: Token::Word(Word::Word("στη".to_string())),
7185                },
7186                PositionalToken {
7187                    source: uws,
7188                    offset: 181,
7189                    length: 1,
7190                    token: Token::Special(Special::Separator(Separator::Space)),
7191                },
7192                PositionalToken {
7193                    source: uws,
7194                    offset: 182,
7195                    length: 2,
7196                    token: Token::Word(Word::Word("ή".to_string())),
7197                },
7198                PositionalToken {
7199                    source: uws,
7200                    offset: 184,
7201                    length: 1,
7202                    token: Token::Special(Special::Punctuation('/')),
7203                },
7204            ],
7205        };
7206        (
7207            uws.chars()
7208                .take(100)
7209                .fold(String::new(), |acc, c| acc + &format!("{}", c)),
7210            tokens,
7211        )
7212    }
7213}