//! `text_tokenizer` — crate root (`lib.rs`).

1use std::sync::Arc;
2use text_parsing::{Breaker, IntoSource, Local, Localize, Snip, Source, SourceEvent};
3
4mod emoji;
5pub use emoji::EMOJIMAP;
6
7mod breakers;
8pub use breakers::{SentenceBreaker, UnicodeSentenceBreaker};
9
10mod wordbreaker;
11
12mod options;
13pub use options::{IntoTokenizer, TokenizerOptions, TokenizerParams};
14
15mod tokens;
16pub use tokens::Tokens;
17
18mod text_tokens;
19use text_tokens::InnerBound;
20pub use text_tokens::TextTokens;
21
/// Errors produced by this crate.
#[derive(Debug)]
pub enum Error {
    /// Failure bubbled up from the underlying `text_parsing` source.
    TextParser(text_parsing::Error),
}

/// Tolerance used by `Ord for Number`: two numbers whose `f64` views differ
/// by less than this are considered equal.
const EPS: f64 = 1e-8;
28
/// A parsed numeric token ("strings" feature: original text kept where needed).
///
/// NOTE(review): `PartialOrd` is derived here but `Ord` below is implemented
/// manually with an `EPS` tolerance, so `partial_cmp` and `cmp` can disagree
/// for nearly-equal values — confirm this is intended (clippy:
/// `derive_ord_xor_partial_ord`).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd)]
pub enum Number {
    Integer(i64),
    Float(f64),
    // String is an integer, but with the leading zeroes, for example: "007"
    ZeroInteger { i: i64, s: String },
}

/// A parsed numeric token (without "strings": value-only, `Copy`).
///
/// NOTE(review): same derived-`PartialOrd` vs manual-`Ord` caveat as the
/// "strings" variant above.
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
pub enum Number {
    Integer(i64),
    Float(f64),
    // Integer that had leading zeroes in the source; the text is not kept.
    ZeroInteger { i: i64 },
}
45
46impl Number {
47    pub fn as_f64(&self) -> f64 {
48        match self {
49            Number::Integer(i) => *i as f64,
50            Number::Float(f) => *f,
51            Number::ZeroInteger { i, .. } => *i as f64,
52        }
53    }
54}
55impl Ord for Number {
56    fn cmp(&self, other: &Number) -> std::cmp::Ordering {
57        let s = self.as_f64();
58        let o = other.as_f64();
59        let d = s - o;
60        match d.abs() < EPS {
61            true => std::cmp::Ordering::Equal,
62            false => {
63                if d > 0.0 {
64                    return std::cmp::Ordering::Greater;
65                }
66                if d < 0.0 {
67                    return std::cmp::Ordering::Less;
68                }
69                std::cmp::Ordering::Equal
70            }
71        }
72    }
73}
74impl Eq for Number {}
75
/// Whitespace-like separator characters.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
pub enum Separator {
    Space,
    Tab,
    Newline,
    /// Any other character classified as a separator.
    Char(char),
}

/// Formatting characters (zero-width / joiner-style code points).
#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
pub enum Formatter {
    Char(char),
    Joiner, // u{200d}
}

/// Single-character non-word tokens.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Ord, PartialOrd)]
pub enum Special {
    Currency(char),
    Punctuation(char),
    Symbol(char),
    Separator(Separator),
}
97
/// A word-like token ("strings" feature: carries the matched text).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Word {
    Word(String),
    /// Word containing unusual/mixed characters (e.g. soft hyphens).
    StrangeWord(String),
    Numerical(Numerical),
    Number(Number),
    /// Emoji, referenced by its static name from `EMOJIMAP`.
    Emoji(&'static str),
}

/// Number-containing token classes ("strings" feature).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub enum Numerical {
    //Date(String),
    //Ip(String),
    //Countable(String),
    DotSeparated(String),
    Measures(String),
    Alphanumeric(String),
}

/// Structured social-text tokens ("strings" feature).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Struct {
    Hashtag(String),
    Mention(String),
    //Url(String),
}

/// Unicode-special tokens ("strings" feature).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Unicode {
    String(String),
    Formatter(Formatter),
}
133
/// A word-like token (without "strings": classification only, no text).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Word {
    Word,
    StrangeWord,
    Numerical(Numerical),
    Number(Number),
    /// Emoji, referenced by its static name from `EMOJIMAP`.
    Emoji(&'static str),
}

/// Number-containing token classes (without "strings").
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Numerical {
    //Date,
    //Ip,
    //Countable,
    DotSeparated,
    Measures,
    Alphanumeric,
}

/// Structured social-text tokens (without "strings").
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Struct {
    Hashtag,
    Mention,
    //Url,
}

/// Unicode-special tokens (without "strings").
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Unicode {
    String,
    Formatter(Formatter),
}
169
/// Top-level token as produced by the tokenizer ("strings" feature).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),
}

/// Top-level token as produced by the tokenizer (without "strings"; `Copy`).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),
}
187
188/*pub trait IntoTokens<T> {
189    type IntoTokens: IntoTokenizer<IntoTokens = T>;
190}
191
192impl<'s> IntoTokenSource<Token2> for &'s str {
193    type IntoTokens = TextStr<'s>;
194
195    fn (self) -> Result<TextStr<'s>,Error> {
196        TextStr::new(self)
197    }
198
199}*/
200
/// Borrowed analogue of [`Text`]: the buffer is the caller's `&str`, while
/// localities and breaker positions are shared `Arc` tables produced by the
/// same pass as `Text::new`.
#[derive(Debug)]
pub struct TextStr<'s> {
    buffer: &'s str,
    localities: Arc<Vec<TextLocality>>,
    breakers: Arc<Vec<InnerBound>>,
}
207impl<'s> TextStr<'s> {
208    pub fn new<'a>(s: &'a str) -> Result<TextStr<'a>, Error> {
209        let text = inner_new(s.into_source(), false)?;
210        Ok(TextStr {
211            buffer: s,
212            localities: text.localities,
213            breakers: text.breakers,
214        })
215    }
216}
217
/// Drains `source` into a normalized [`Text`]: backticks are rewritten to
/// apostrophes, breaker events are materialized as characters, and a
/// per-character [`TextLocality`] (buffer vs. original coordinates) is
/// recorded.
///
/// When `with_buffer` is `false` the owned buffer stays empty (callers such
/// as `TextStr::new` and `TryFrom<String>` keep the text themselves), but
/// byte offsets are still accumulated as if it were filled.
fn inner_new<S: Source>(mut source: S, with_buffer: bool) -> Result<Text, Error> {
    let mut buffer = String::new();
    let mut localities = Vec::new();
    let mut breakers = Vec::new();
    // Running byte length of the (possibly virtual) buffer.
    let mut buffer_len = 0;

    while let Some(local_se) = source.next_char().map_err(Error::TextParser)? {
        let (local, se) = local_se.into_inner();
        let c = match se {
            SourceEvent::Char(c) => match c {
                // Normalize backtick to apostrophe.
                '\u{0060}' => '\u{0027}',
                _ => c,
            },
            SourceEvent::Breaker(b) => {
                // Map each breaker to a concrete character; `Some(b)` marks
                // breakers that must also be recorded as inner bounds.
                let (c, opt_b) = match b {
                    Breaker::None => continue,
                    Breaker::Space => (' ', None),
                    Breaker::Line => ('\n', None),
                    Breaker::Word => ('\u{200B}', Some(b)), // zero width space
                    Breaker::Sentence | Breaker::Paragraph | Breaker::Section => ('\n', Some(b)),
                };
                if let Some(b) = opt_b {
                    let br = InnerBound {
                        bytes: Snip {
                            offset: buffer_len,
                            length: c.len_utf8(),
                        },
                        chars: Snip {
                            offset: localities.len(),
                            length: 1,
                        },
                        breaker: b,
                        original: Some(local),
                    };
                    //println!("BR: {:?}",br);
                    breakers.push(br);
                }
                c
            }
        };

        // Locality of `c` in buffer coordinates: one char, `len_utf8` bytes.
        let buf_local = ().localize(
            Snip {
                // chars
                offset: localities.len(),
                length: 1,
            },
            Snip {
                // bytes
                offset: buffer_len,
                length: c.len_utf8(),
            },
        );
        if with_buffer {
            buffer.push(c);
        }
        buffer_len += c.len_utf8();
        localities.push(TextLocality {
            buffer: buf_local,
            original: local,
        });
    }
    Ok(Text {
        buffer: Arc::new(buffer),
        localities: Arc::new(localities),
        breakers: Arc::new(breakers),
    })
}
286
/// Per-character mapping between buffer coordinates and original-source
/// coordinates.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub struct TextLocality {
    /// Position in the normalized buffer.
    pub buffer: Local<()>,
    /// Position in the original input.
    pub original: Local<()>,
}

/// Owned, normalized text plus the locality/breaker tables produced by
/// [`inner_new`]; all storage is `Arc`-shared (see `shared_text`).
#[derive(Debug)]
pub struct Text {
    buffer: Arc<String>,
    localities: Arc<Vec<TextLocality>>,
    breakers: Arc<Vec<InnerBound>>,
}
299impl Text {
300    pub fn new<S: Source>(source: S) -> Result<Text, Error> {
301        inner_new(source, true)
302    }
303    pub fn token_text<'s>(&'s self, token: &TextToken) -> &'s str {
304        let Snip {
305            offset: begin,
306            length: len,
307        } = token.locality.bytes();
308        let end = begin + len;
309        &self.buffer[begin..end]
310    }
311    pub fn text(&self) -> &str {
312        self.buffer.as_ref()
313    }
314    pub fn original_locality(&self, idx: usize) -> Option<Local<()>> {
315        self.localities.get(idx).map(|tl| tl.original)
316    }
317    pub fn localities(&self) -> &Vec<TextLocality> {
318        self.localities.as_ref()
319    }
320    pub fn shared_text(&self) -> Text {
321        Text {
322            buffer: self.buffer.clone(),
323            localities: self.localities.clone(),
324            breakers: self.breakers.clone(),
325        }
326    }
327}
328
329impl TryFrom<String> for Text {
330    type Error = Error;
331
332    fn try_from(s: String) -> Result<Text, Error> {
333        let mut text = inner_new((&s).into_source(), false)?;
334        text.buffer = Arc::new(s);
335        Ok(text)
336    }
337}
338
339impl TryFrom<&str> for Text {
340    type Error = Error;
341
342    fn try_from(s: &str) -> Result<Text, Error> {
343        Text::new(s.into_source())
344    }
345}
346
/// Structural boundary emitted between tokens.
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Bound {
    Sentence,
    Paragraph,
    Section,
}

/// A token plus its buffer locality and (optionally) its original-source
/// locality ("strings" feature).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub struct TextToken {
    locality: Local<()>,
    original: Option<Local<()>>,
    pub token: Token2,
}

/// A token plus its buffer locality and (optionally) its original-source
/// locality (without "strings"; `Copy`).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub struct TextToken {
    locality: Local<()>,
    original: Option<Local<()>>,
    pub token: Token2,
}
369
#[cfg(test)]
impl TextToken {
    /// Test helper: the plain `Token` localized at its original coordinates,
    /// or `None` when either the original span or a `Token` form is missing.
    fn into_original_token_1(self) -> Option<Local<Token>> {
        let original = self.original?;
        self.token.into_token().map(|t| original.local(t))
    }
}
379
380impl TextToken {
381    pub fn local(&self) -> Local<()> {
382        self.locality
383    }
384    pub fn original(&self) -> Option<Local<()>> {
385        self.original
386    }
387    pub fn into_position(mut self) -> TextToken {
388        self.locality = self.locality.into_position();
389        self.original = self.original.map(|or| or.into_position());
390        self
391    }
392    pub fn try_as_token(&self) -> Result<Token, Bound> {
393        self.token.try_as_token()
394    }
395    pub fn as_original_token(&self) -> Option<Local<&Token2>> {
396        self.original.map(|original| original.local(&self.token))
397    }
398    pub fn into_original_token(self) -> Option<Local<Token2>> {
399        self.original.map(|original| original.local(self.token))
400    }
401    pub fn original_str<'s>(&self, original: &'s str) -> Result<&'s str, OriginalError> {
402        match self.original {
403            Some(local) => {
404                let Snip {
405                    offset: begin,
406                    length: len,
407                } = local.bytes();
408                let end = begin + len;
409                match original.get(begin..end) {
410                    Some(s) => Ok(s),
411                    None => Err(OriginalError::InvalidSnip),
412                }
413            }
414            None => Err(OriginalError::NoOriginal),
415        }
416    }
417
418    pub fn test_token(lt: Local<Token2>) -> TextToken {
419        let (local, token) = lt.into_inner();
420        TextToken {
421            locality: local,
422            original: Some(local.local(())),
423            token,
424        }
425    }
426    pub fn test_new(token: Token2, local: Local<()>, original: Option<Local<()>>) -> TextToken {
427        TextToken {
428            locality: local,
429            original,
430            token,
431        }
432    }
433}
434
435/*pub trait TokenExt: Iterator<Item = TextToken> + Sized {
436    fn merge_separators(self) -> Merger<Self>;
437}
438
439impl<T> TokenExt for T where T: Iterator<Item = TextToken> {
440    fn merge_separators(self) -> Merger<Self> {
441        Merger {
442            tokens: self,
443        }
444    }
445}
446
447pub struct Merger<T>
448where T: Iterator<Item = TextToken>
449{
450    tokens: T,
451}
452impl<T> Iterator for Merger<T>
453where T: Iterator<Item = TextToken>
454{
455    type Item = TextToken;
456    fn next(&mut self) -> Option<Self::Item> {
457        self.tokens.next()
458    }
459}*/
460
/// Failure modes of [`TextToken::original_str`].
#[derive(Debug)]
pub enum OriginalError {
    /// The token has no recorded original-source locality.
    NoOriginal,
    /// The recorded byte range does not fall on char boundaries of (or lies
    /// outside) the provided original string.
    InvalidSnip,
}
466
467/*#[derive(Debug,Clone,PartialEq)]
468pub enum ExtToken {
469    Token(Local<Token>),
470    Breaker(Local<Bound>),
471    Bound(Bound),
472}*/
473
/// Tokenizer output item: a plain [`Token`] or a structural [`Bound`]
/// ("strings" feature).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token2 {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),

    Bound(Bound),
}
/// Tokenizer output item: a plain [`Token`] or a structural [`Bound`]
/// (without "strings"; `Copy`).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token2 {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),

    Bound(Bound),
}
/// Every plain `Token` embeds into `Token2`; only `Token2::Bound` has no
/// `Token` counterpart.
impl From<Token> for Token2 {
    fn from(t: Token) -> Token2 {
        match t {
            Token::Word(w) => Token2::Word(w),
            Token::Struct(s) => Token2::Struct(s),
            Token::Special(s) => Token2::Special(s),
            Token::Unicode(u) => Token2::Unicode(u),
        }
    }
}
impl Token2 {
    /// Borrowed form of `try_into_token`; without "strings" the enum is
    /// `Copy`, so a plain dereference avoids the clone.
    #[cfg(not(feature = "strings"))]
    fn try_as_token(&self) -> Result<Token, Bound> {
        (*self).try_into_token()
    }

    /// Borrowed form of `try_into_token`; with "strings" the variants own
    /// `String`s, so a clone is required.
    #[cfg(feature = "strings")]
    fn try_as_token(&self) -> Result<Token, Bound> {
        self.clone().try_into_token()
    }

    /// Converts into a plain `Token`, or returns the `Bound` for boundary
    /// items.
    fn try_into_token(self) -> Result<Token, Bound> {
        match self {
            Token2::Word(w) => Ok(Token::Word(w)),
            Token2::Struct(s) => Ok(Token::Struct(s)),
            Token2::Special(s) => Ok(Token::Special(s)),
            Token2::Unicode(u) => Ok(Token::Unicode(u)),
            Token2::Bound(b) => Err(b),
        }
    }
}
#[cfg(test)]
impl Token2 {
    /// Plain `Token` view for tests; `Bound` items map to `None`.
    /// Delegates to `try_into_token`, which performs the same mapping.
    fn into_token(self) -> Option<Token> {
        self.try_into_token().ok()
    }
}
537
#[cfg(test)]
#[cfg(not(feature = "strings"))]
mod test {
    use super::*;
    use text_parsing::{
        IntoPipeParser, IntoSource, Localize, ParserExt, SourceExt, entities, tagger,
    };

    // Asserts the two token streams have equal length and equal elements.
    fn check_results(result: &Vec<Local<Token>>, lib_res: &Vec<Local<Token>>, _uws: &str) {
        assert_eq!(result.len(), lib_res.len());
        for i in 0..result.len() {
            let res: Local<Token> = result[i].clone().into();
            assert_eq!(res, lib_res[i]);
        }
    }

    // Exploratory test, deliberately disabled: prints the tokenization of a
    // sample string and panics so the output is visible when re-enabled.
    //#[test]
    fn symbols() {
        let uws = "Сибирь Арене 17 30 от 2560₽ 😀";
        //let result = Vec::new();

        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        //check_results(&result, &lib_res, uws);
        for t in lib_res {
            println!("{:?}", t);
        }
        panic!()
    }
}
569
#[cfg(test)]
mod test_v0_5 {
    use super::*;
    use text_parsing::{IntoPipeParser, IntoSource, ParserExt, SourceExt, entities, tagger};

    // Exploratory test, deliberately disabled: runs the full HTML-aware
    // pipeline (tag breaker + entity decoder) over a sample document, prints
    // every token with its original coordinates, and panics so the output is
    // visible when re-enabled.
    //#[test]
    fn basic() {
        /*let uws = "Oxana Putan shared the quick (\"brown\") fox can't jump 32.3 feet, right?4pda etc. qeq U.S.A  asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово";

        /*let result = vec![
            PositionalToken { source: uws, offset: 0, length: 7, token: Token::Word("l'oreal".to_string()) },
            PositionalToken { source: uws, offset: 7, length: 1, token: Token::Punctuation(";".to_string()) },
            PositionalToken { source: uws, offset: 8, length: 1, token: Token::Separator(Separator::Space) },
            PositionalToken { source: uws, offset: 9, length: 7, token: Token::Word("l'oreal".to_string()) },
        ];*/
        let text = Text::new({
            uws.into_source()
                .into_separator()
                .merge_separators()
        }).unwrap();*/

        let uws = "<p>Oxana Putan shared the quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc.</p><p> qeq U.S.A  asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово</p>";
        let text = Text::new({
            uws.into_source()
                .pipe(tagger::Builder::new().create().into_breaker())
                .pipe(entities::Builder::new().create().into_piped())
                .into_separator()
        })
        .unwrap();
        let lib_res = text
            .into_tokenizer({
                TokenizerParams::default()
                    .add_option(TokenizerOptions::SplitDot)
                    .add_option(TokenizerOptions::SplitUnderscore)
                    .add_option(TokenizerOptions::SplitColon)
                    .with_default_sentences()
            })
            .collect::<Vec<_>>();

        for tok in lib_res {
            println!(
                "C{:?}, B{:?}, {:?} -> {:?}",
                tok.original.map(|loc| loc.chars()),
                tok.original.map(|loc| loc.bytes()),
                tok.token,
                tok.original_str(uws)
            );
        }

        panic!()
    }
}
622
623#[cfg(test)]
624#[cfg(feature = "strings")]
625mod test {
626    use super::*;
627    use text_parsing::{
628        IntoPipeParser, IntoSource, Localize, ParserExt, SourceExt, entities, tagger,
629    };
630
631    /*
632    #[allow(dead_code)]
633    fn print_pt(tok: &PositionalToken) -> String {
634        let mut r = match &tok.token {
635            Token::BBCode{ left, right } => {
636                let left = print_pts(left);
637                let right = print_pts(right);
638                format!("PositionalToken {{ offset: {}, length: {}, token: Token::BBCode {{ left: vec![\n{}], right: vec![\n{}] }} }},",tok.offset,tok.length,left,right)
639            },
640            _ => format!("PositionalToken {{ offset: {}, length: {}, token: Token::{:?} }},",tok.offset,tok.length,tok.token),
641        };
642        r = r.replace("\")","\".to_string())");
643        r
644    }
645    #[allow(dead_code)]
646    fn print_pts(lib_res: &Vec<PositionalToken>) -> String {
647        let mut r = String::new();
648        for tok in lib_res {
649            r += &print_pt(&tok);
650            r += "\n";
651        }
652        r
653    }
654    #[allow(dead_code)]
655    fn print_result(lib_res: &Vec<PositionalToken>) {
656        let mut r = print_pts(lib_res);
657        r = r.replace("Separator(","Separator(Separator::");
658        r = r.replace("UnicodeFormatter(","UnicodeFormatter(Formatter::");
659        r = r.replace("Number(","Number(Number::");
660        r = r.replace("Numerical(","Numerical(Numerical::");
661        println!("{}",r);
662    }
663
664    #[allow(dead_code)]
665    fn print_ct(tok: &CharToken) -> String {
666        let mut r = format!("CharToken {{ byte_offset: {}, byte_length: {}, char_offset: {}, char_length: {}, token: Token::{:?} }},",tok.byte_offset,tok.byte_length,tok.char_offset,tok.char_length,tok.token);
667        r = r.replace("\")","\".to_string())");
668        r
669    }
670
671    #[allow(dead_code)]
672    fn print_cts(lib_res: &Vec<CharToken>) -> String {
673        let mut r = String::new();
674        for tok in lib_res {
675            r += &print_ct(&tok);
676            r += "\n";
677        }
678        r
679    }
680
681    #[allow(dead_code)]
682    fn print_cresult(lib_res: &Vec<CharToken>) {
683        let mut r = print_cts(lib_res);
684        r = r.replace("Separator(","Separator(Separator::");
685        r = r.replace("UnicodeFormatter(","UnicodeFormatter(Formatter::");
686        r = r.replace("Number(","Number(Number::");
687        r = r.replace("Numerical(","Numerical(Numerical::");
688        println!("{}",r);
689    }*/
690
691    #[derive(Debug, Clone)]
692    struct CharToken {
693        byte_offset: usize,
694        byte_length: usize,
695        char_offset: usize,
696        char_length: usize,
697        token: Token,
698    }
699    impl Into<Local<Token>> for CharToken {
700        fn into(self) -> Local<Token> {
701            self.token.localize(
702                Snip {
703                    offset: self.char_offset,
704                    length: self.char_length,
705                },
706                Snip {
707                    offset: self.byte_offset,
708                    length: self.byte_length,
709                },
710            )
711        }
712    }
713
714    #[derive(Debug, Clone)]
715    struct PositionalToken {
716        source: &'static str,
717        offset: usize,
718        length: usize,
719        token: Token,
720    }
721    impl Into<Local<Token>> for PositionalToken {
722        fn into(self) -> Local<Token> {
723            self.token.localize(
724                Snip {
725                    offset: self.source[..self.offset].chars().count(),
726                    length: self.source[self.offset..self.offset + self.length]
727                        .chars()
728                        .count(),
729                },
730                Snip {
731                    offset: self.offset,
732                    length: self.length,
733                },
734            )
735        }
736    }
737
738    fn check_results(result: &Vec<PositionalToken>, lib_res: &Vec<Local<Token>>, _uws: &str) {
739        assert_eq!(result.len(), lib_res.len());
740        for i in 0..result.len() {
741            let res: Local<Token> = result[i].clone().into();
742            assert_eq!(res, lib_res[i]);
743        }
744    }
745
746    fn check_cresults(result: &Vec<CharToken>, lib_res: &Vec<Local<Token>>, _uws: &str) {
747        assert_eq!(result.len(), lib_res.len());
748        for i in 0..result.len() {
749            let res: Local<Token> = result[i].clone().into();
750            assert_eq!(res, lib_res[i]);
751        }
752    }
753
754    fn check<T: Clone + std::fmt::Debug + Into<Local<Token>>>(
755        res: &Vec<T>,
756        lib: &Vec<Local<Token>>,
757        _uws: &str,
758    ) {
759        let mut lib = lib.iter();
760        let mut res = res.iter().map(|r| {
761            let res: Local<Token> = r.clone().into();
762            res
763        });
764        let mut diff = Vec::new();
765        loop {
766            match (lib.next(), res.next()) {
767                (Some(lw), Some(rw)) => {
768                    if *lw != rw {
769                        diff.push(format!("LIB:  {:?}", lw));
770                        diff.push(format!("TEST: {:?}", rw));
771                        diff.push("".to_string())
772                    }
773                }
774                (Some(lw), None) => {
775                    diff.push(format!("LIB:  {:?}", lw));
776                    diff.push("TEST: ----".to_string());
777                    diff.push("".to_string())
778                }
779                (None, Some(rw)) => {
780                    diff.push("LIB:  ----".to_string());
781                    diff.push(format!("TEST: {:?}", rw));
782                    diff.push("".to_string())
783                }
784                (None, None) => break,
785            }
786        }
787        if diff.len() > 0 {
788            for ln in &diff {
789                println!("{}", ln);
790            }
791            panic!("Diff count: {}", diff.len() / 3);
792        }
793    }
794
    // Runs of spaces must come out as single Separator::Space tokens spanning
    // the whole run (v1 params merge whitespace).
    #[test]
    fn spaces() {
        let uws = "    spaces    too   many   apces   ";
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 4,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 4,
                length: 6,
                token: Token::Word(Word::Word("spaces".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 10,
                length: 4,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 14,
                length: 3,
                token: Token::Word(Word::Word("too".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 17,
                length: 3,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 20,
                length: 4,
                token: Token::Word(Word::Word("many".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 24,
                length: 3,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 27,
                length: 5,
                token: Token::Word(Word::Word("apces".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 32,
                length: 3,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
        ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
        //panic!()
    }
860
    // Signed integers after punctuation/whitespace must parse as
    // Number::Integer including the minus sign; whitespace runs merge.
    #[test]
    fn numbers() {
        let uws = "(() -2\n()  -2";
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 1,
                token: Token::Special(Special::Punctuation('(')),
            },
            PositionalToken {
                source: uws,
                offset: 1,
                length: 1,
                token: Token::Special(Special::Punctuation('(')),
            },
            PositionalToken {
                source: uws,
                offset: 2,
                length: 1,
                token: Token::Special(Special::Punctuation(')')),
            },
            PositionalToken {
                source: uws,
                offset: 3,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 4,
                length: 2,
                token: Token::Word(Word::Number(Number::Integer(-2))),
            },
            PositionalToken {
                source: uws,
                offset: 6,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
            PositionalToken {
                source: uws,
                offset: 7,
                length: 1,
                token: Token::Special(Special::Punctuation('(')),
            },
            PositionalToken {
                source: uws,
                offset: 8,
                length: 1,
                token: Token::Special(Special::Punctuation(')')),
            },
            PositionalToken {
                source: uws,
                offset: 9,
                length: 2,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 11,
                length: 2,
                token: Token::Word(Word::Number(Number::Integer(-2))),
            },
        ];
        let lib_res = uws
            .into_tokenizer({
                TokenizerParams::default()
                    .add_option(TokenizerOptions::SplitDot)
                    .add_option(TokenizerOptions::SplitUnderscore)
                    .add_option(TokenizerOptions::SplitColon)
                    .add_option(TokenizerOptions::MergeWhites)
            })
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
937
    // Words containing U+00AD soft hyphens (invisible) must stay one token,
    // classified as StrangeWord; byte lengths count the 2-byte soft hyphens.
    #[test]
    fn word_with_inner_hyphens() {
        let uws = "Опро­сы по­ка­зы­ва­ют";
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 14,
                token: Token::Word(Word::StrangeWord("Опро­сы".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 14,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 15,
                length: 28,
                token: Token::Word(Word::StrangeWord("по­ка­зы­ва­ют".to_string())),
            },
        ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
966
    // A word with a typographic apostrophe (U+2019) stays one token, but is
    // classified as StrangeWord rather than Word.
    #[test]
    fn mixed_but_word() {
        let uws = "L’Oreal";
        let result = vec![PositionalToken {
            source: uws,
            offset: 0,
            length: 9,
            token: Token::Word(Word::StrangeWord("L’Oreal".to_string())),
        }];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
981
    // Without the StructTokens option, '#' is plain punctuation and the tag
    // bodies tokenize as ordinary words ("hashtag2" → Alphanumeric).
    #[test]
    fn hashtags() {
        let uws = "#hashtag#hashtag2";
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 1,
                token: Token::Special(Special::Punctuation('#')),
            },
            PositionalToken {
                source: uws,
                offset: 1,
                length: 7,
                token: Token::Word(Word::Word("hashtag".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 8,
                length: 1,
                token: Token::Special(Special::Punctuation('#')),
            },
            PositionalToken {
                source: uws,
                offset: 9,
                length: 8,
                token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
                    "hashtag2".to_string(),
                ))),
            },
        ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
1018
1019    #[test]
1020    fn hashtags2() {
1021        let uws = "#hashtag#hashtag2 #hash_tag";
1022        let result = vec![
1023            PositionalToken {
1024                source: uws,
1025                offset: 0,
1026                length: 8,
1027                token: Token::Struct(Struct::Hashtag("hashtag".to_string())),
1028            },
1029            PositionalToken {
1030                source: uws,
1031                offset: 8,
1032                length: 9,
1033                token: Token::Struct(Struct::Hashtag("hashtag2".to_string())),
1034            },
1035            PositionalToken {
1036                source: uws,
1037                offset: 17,
1038                length: 1,
1039                token: Token::Special(Special::Separator(Separator::Space)),
1040            },
1041            PositionalToken {
1042                source: uws,
1043                offset: 18,
1044                length: 9,
1045                token: Token::Struct(Struct::Hashtag("hash_tag".to_string())),
1046            },
1047        ];
1048        let lib_res = uws
1049            .into_tokenizer(TokenizerParams::v1().add_option(TokenizerOptions::StructTokens))
1050            .collect::<Vec<_>>();
1051        check_results(&result, &lib_res, uws);
1052    }
1053
1054    #[test]
1055    fn mention2() {
1056        let uws = "@hashtag@hashtag2 @hash_tag";
1057        let result = vec![
1058            PositionalToken {
1059                source: uws,
1060                offset: 0,
1061                length: 8,
1062                token: Token::Struct(Struct::Mention("hashtag".to_string())),
1063            },
1064            PositionalToken {
1065                source: uws,
1066                offset: 8,
1067                length: 9,
1068                token: Token::Struct(Struct::Mention("hashtag2".to_string())),
1069            },
1070            PositionalToken {
1071                source: uws,
1072                offset: 17,
1073                length: 1,
1074                token: Token::Special(Special::Separator(Separator::Space)),
1075            },
1076            PositionalToken {
1077                source: uws,
1078                offset: 18,
1079                length: 9,
1080                token: Token::Struct(Struct::Mention("hash_tag".to_string())),
1081            },
1082        ];
1083        let lib_res = uws
1084            .into_tokenizer(TokenizerParams::v1().add_option(TokenizerOptions::StructTokens))
1085            .collect::<Vec<_>>();
1086        check_results(&result, &lib_res, uws);
1087    }
1088
1089    #[test]
1090    fn apostrophe() {
1091        let uws = "l'oreal; l\u{0060}oreal";
1092        let result = vec![
1093            PositionalToken {
1094                source: uws,
1095                offset: 0,
1096                length: 7,
1097                token: Token::Word(Word::Word("l'oreal".to_string())),
1098            },
1099            PositionalToken {
1100                source: uws,
1101                offset: 7,
1102                length: 1,
1103                token: Token::Special(Special::Punctuation(';')),
1104            },
1105            PositionalToken {
1106                source: uws,
1107                offset: 8,
1108                length: 1,
1109                token: Token::Special(Special::Separator(Separator::Space)),
1110            },
1111            PositionalToken {
1112                source: uws,
1113                offset: 9,
1114                length: 7,
1115                token: Token::Word(Word::Word("l'oreal".to_string())),
1116            },
1117        ];
1118        let text = Text::new(uws.into_source()).unwrap();
1119        let lib_res = text
1120            .into_tokenizer(TokenizerParams::v1())
1121            .filter_map(|tt| tt.into_original_token_1())
1122            .collect::<Vec<_>>();
1123        check_results(&result, &lib_res, uws);
1124    }
1125
1126    #[test]
1127    fn char_tokens() {
1128        let uws = "[Oxana Putan|1712640565] shared the quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A  asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово";
1129        let result = vec![
1130            CharToken {
1131                byte_offset: 0,
1132                byte_length: 1,
1133                char_offset: 0,
1134                char_length: 1,
1135                token: Token::Special(Special::Punctuation('[')),
1136            },
1137            CharToken {
1138                byte_offset: 1,
1139                byte_length: 5,
1140                char_offset: 1,
1141                char_length: 5,
1142                token: Token::Word(Word::Word("Oxana".to_string())),
1143            },
1144            CharToken {
1145                byte_offset: 6,
1146                byte_length: 1,
1147                char_offset: 6,
1148                char_length: 1,
1149                token: Token::Special(Special::Separator(Separator::Space)),
1150            },
1151            CharToken {
1152                byte_offset: 7,
1153                byte_length: 5,
1154                char_offset: 7,
1155                char_length: 5,
1156                token: Token::Word(Word::Word("Putan".to_string())),
1157            },
1158            CharToken {
1159                byte_offset: 12,
1160                byte_length: 1,
1161                char_offset: 12,
1162                char_length: 1,
1163                token: Token::Special(Special::Punctuation('|')),
1164            },
1165            CharToken {
1166                byte_offset: 13,
1167                byte_length: 10,
1168                char_offset: 13,
1169                char_length: 10,
1170                token: Token::Word(Word::Number(Number::Integer(1712640565))),
1171            },
1172            CharToken {
1173                byte_offset: 23,
1174                byte_length: 1,
1175                char_offset: 23,
1176                char_length: 1,
1177                token: Token::Special(Special::Punctuation(']')),
1178            },
1179            /*CharToken { byte_offset: 0, byte_length: 24, char_offset: 0, char_length: 24, token: Token::BBCode { left: vec![
1180            CharToken { byte_offset: 1, byte_length: 5, char_offset: 1, char_length: 5, token: Token::Word(Word::Word("Oxana".to_string())) },
1181            CharToken { byte_offset: 6, byte_length: 1, char_offset: 6, char_length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
1182            CharToken { byte_offset: 7, byte_length: 5, char_offset: 7, char_length: 5, token: Token::Word(Word::Word("Putan".to_string())) },
1183            ], right: vec![
1184            CharToken { byte_offset: 13, byte_length: 10, char_offset: 13, char_length: 10, token: Token::Word(Word::Number(Number::Integer(1712640565))) },
1185            ] } },*/
1186            CharToken {
1187                byte_offset: 24,
1188                byte_length: 1,
1189                char_offset: 24,
1190                char_length: 1,
1191                token: Token::Special(Special::Separator(Separator::Space)),
1192            },
1193            CharToken {
1194                byte_offset: 25,
1195                byte_length: 6,
1196                char_offset: 25,
1197                char_length: 6,
1198                token: Token::Word(Word::Word("shared".to_string())),
1199            },
1200            CharToken {
1201                byte_offset: 31,
1202                byte_length: 1,
1203                char_offset: 31,
1204                char_length: 1,
1205                token: Token::Special(Special::Separator(Separator::Space)),
1206            },
1207            CharToken {
1208                byte_offset: 32,
1209                byte_length: 3,
1210                char_offset: 32,
1211                char_length: 3,
1212                token: Token::Word(Word::Word("the".to_string())),
1213            },
1214            CharToken {
1215                byte_offset: 35,
1216                byte_length: 1,
1217                char_offset: 35,
1218                char_length: 1,
1219                token: Token::Special(Special::Separator(Separator::Space)),
1220            },
1221            CharToken {
1222                byte_offset: 36,
1223                byte_length: 5,
1224                char_offset: 36,
1225                char_length: 5,
1226                token: Token::Word(Word::Word("quick".to_string())),
1227            },
1228            CharToken {
1229                byte_offset: 41,
1230                byte_length: 1,
1231                char_offset: 41,
1232                char_length: 1,
1233                token: Token::Special(Special::Separator(Separator::Space)),
1234            },
1235            CharToken {
1236                byte_offset: 42,
1237                byte_length: 1,
1238                char_offset: 42,
1239                char_length: 1,
1240                token: Token::Special(Special::Punctuation('(')),
1241            },
1242            CharToken {
1243                byte_offset: 43,
1244                byte_length: 1,
1245                char_offset: 43,
1246                char_length: 1,
1247                token: Token::Special(Special::Punctuation('"')),
1248            },
1249            CharToken {
1250                byte_offset: 44,
1251                byte_length: 5,
1252                char_offset: 44,
1253                char_length: 5,
1254                token: Token::Word(Word::Word("brown".to_string())),
1255            },
1256            CharToken {
1257                byte_offset: 49,
1258                byte_length: 1,
1259                char_offset: 49,
1260                char_length: 1,
1261                token: Token::Special(Special::Punctuation('"')),
1262            },
1263            CharToken {
1264                byte_offset: 50,
1265                byte_length: 1,
1266                char_offset: 50,
1267                char_length: 1,
1268                token: Token::Special(Special::Punctuation(')')),
1269            },
1270            CharToken {
1271                byte_offset: 51,
1272                byte_length: 1,
1273                char_offset: 51,
1274                char_length: 1,
1275                token: Token::Special(Special::Separator(Separator::Space)),
1276            },
1277            CharToken {
1278                byte_offset: 52,
1279                byte_length: 3,
1280                char_offset: 52,
1281                char_length: 3,
1282                token: Token::Word(Word::Word("fox".to_string())),
1283            },
1284            CharToken {
1285                byte_offset: 55,
1286                byte_length: 1,
1287                char_offset: 55,
1288                char_length: 1,
1289                token: Token::Special(Special::Separator(Separator::Space)),
1290            },
1291            CharToken {
1292                byte_offset: 56,
1293                byte_length: 5,
1294                char_offset: 56,
1295                char_length: 5,
1296                token: Token::Word(Word::Word("can\'t".to_string())),
1297            },
1298            CharToken {
1299                byte_offset: 61,
1300                byte_length: 1,
1301                char_offset: 61,
1302                char_length: 1,
1303                token: Token::Special(Special::Separator(Separator::Space)),
1304            },
1305            CharToken {
1306                byte_offset: 62,
1307                byte_length: 4,
1308                char_offset: 62,
1309                char_length: 4,
1310                token: Token::Word(Word::Word("jump".to_string())),
1311            },
1312            CharToken {
1313                byte_offset: 66,
1314                byte_length: 1,
1315                char_offset: 66,
1316                char_length: 1,
1317                token: Token::Special(Special::Separator(Separator::Space)),
1318            },
1319            CharToken {
1320                byte_offset: 67,
1321                byte_length: 4,
1322                char_offset: 67,
1323                char_length: 4,
1324                token: Token::Word(Word::Number(Number::Float(32.3))),
1325            },
1326            CharToken {
1327                byte_offset: 71,
1328                byte_length: 1,
1329                char_offset: 71,
1330                char_length: 1,
1331                token: Token::Special(Special::Separator(Separator::Space)),
1332            },
1333            CharToken {
1334                byte_offset: 72,
1335                byte_length: 4,
1336                char_offset: 72,
1337                char_length: 4,
1338                token: Token::Word(Word::Word("feet".to_string())),
1339            },
1340            CharToken {
1341                byte_offset: 76,
1342                byte_length: 1,
1343                char_offset: 76,
1344                char_length: 1,
1345                token: Token::Special(Special::Punctuation(',')),
1346            },
1347            CharToken {
1348                byte_offset: 77,
1349                byte_length: 1,
1350                char_offset: 77,
1351                char_length: 1,
1352                token: Token::Special(Special::Separator(Separator::Space)),
1353            },
1354            CharToken {
1355                byte_offset: 78,
1356                byte_length: 5,
1357                char_offset: 78,
1358                char_length: 5,
1359                token: Token::Word(Word::Word("right".to_string())),
1360            },
1361            CharToken {
1362                byte_offset: 83,
1363                byte_length: 1,
1364                char_offset: 83,
1365                char_length: 1,
1366                token: Token::Special(Special::Punctuation('?')),
1367            },
1368            CharToken {
1369                byte_offset: 84,
1370                byte_length: 1,
1371                char_offset: 84,
1372                char_length: 1,
1373                token: Token::Special(Special::Separator(Separator::Space)),
1374            },
1375            CharToken {
1376                byte_offset: 85,
1377                byte_length: 4,
1378                char_offset: 85,
1379                char_length: 4,
1380                token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
1381            },
1382            CharToken {
1383                byte_offset: 89,
1384                byte_length: 1,
1385                char_offset: 89,
1386                char_length: 1,
1387                token: Token::Special(Special::Separator(Separator::Space)),
1388            },
1389            CharToken {
1390                byte_offset: 90,
1391                byte_length: 3,
1392                char_offset: 90,
1393                char_length: 3,
1394                token: Token::Word(Word::Word("etc".to_string())),
1395            },
1396            CharToken {
1397                byte_offset: 93,
1398                byte_length: 1,
1399                char_offset: 93,
1400                char_length: 1,
1401                token: Token::Special(Special::Punctuation('.')),
1402            },
1403            CharToken {
1404                byte_offset: 94,
1405                byte_length: 1,
1406                char_offset: 94,
1407                char_length: 1,
1408                token: Token::Special(Special::Separator(Separator::Space)),
1409            },
1410            CharToken {
1411                byte_offset: 95,
1412                byte_length: 3,
1413                char_offset: 95,
1414                char_length: 3,
1415                token: Token::Word(Word::Word("qeq".to_string())),
1416            },
1417            CharToken {
1418                byte_offset: 98,
1419                byte_length: 1,
1420                char_offset: 98,
1421                char_length: 1,
1422                token: Token::Special(Special::Separator(Separator::Space)),
1423            },
1424            CharToken {
1425                byte_offset: 99,
1426                byte_length: 5,
1427                char_offset: 99,
1428                char_length: 5,
1429                token: Token::Word(Word::Word("U.S.A".to_string())),
1430            },
1431            CharToken {
1432                byte_offset: 104,
1433                byte_length: 2,
1434                char_offset: 104,
1435                char_length: 2,
1436                token: Token::Special(Special::Separator(Separator::Space)),
1437            },
1438            CharToken {
1439                byte_offset: 106,
1440                byte_length: 3,
1441                char_offset: 106,
1442                char_length: 3,
1443                token: Token::Word(Word::Word("asd".to_string())),
1444            },
1445            CharToken {
1446                byte_offset: 109,
1447                byte_length: 3,
1448                char_offset: 109,
1449                char_length: 3,
1450                token: Token::Special(Special::Separator(Separator::Newline)),
1451            },
1452            CharToken {
1453                byte_offset: 112,
1454                byte_length: 3,
1455                char_offset: 112,
1456                char_length: 3,
1457                token: Token::Word(Word::Word("Brr".to_string())),
1458            },
1459            CharToken {
1460                byte_offset: 115,
1461                byte_length: 1,
1462                char_offset: 115,
1463                char_length: 1,
1464                token: Token::Special(Special::Punctuation(',')),
1465            },
1466            CharToken {
1467                byte_offset: 116,
1468                byte_length: 1,
1469                char_offset: 116,
1470                char_length: 1,
1471                token: Token::Special(Special::Separator(Separator::Space)),
1472            },
1473            CharToken {
1474                byte_offset: 117,
1475                byte_length: 4,
1476                char_offset: 117,
1477                char_length: 4,
1478                token: Token::Word(Word::Word("it\'s".to_string())),
1479            },
1480            CharToken {
1481                byte_offset: 121,
1482                byte_length: 1,
1483                char_offset: 121,
1484                char_length: 1,
1485                token: Token::Special(Special::Separator(Separator::Space)),
1486            },
1487            CharToken {
1488                byte_offset: 122,
1489                byte_length: 4,
1490                char_offset: 122,
1491                char_length: 4,
1492                token: Token::Word(Word::Number(Number::Float(29.3))),
1493            },
1494            CharToken {
1495                byte_offset: 126,
1496                byte_length: 2,
1497                char_offset: 126,
1498                char_length: 1,
1499                token: Token::Special(Special::Symbol('°')),
1500            },
1501            CharToken {
1502                byte_offset: 128,
1503                byte_length: 1,
1504                char_offset: 127,
1505                char_length: 1,
1506                token: Token::Word(Word::Word("F".to_string())),
1507            },
1508            CharToken {
1509                byte_offset: 129,
1510                byte_length: 1,
1511                char_offset: 128,
1512                char_length: 1,
1513                token: Token::Special(Special::Punctuation('!')),
1514            },
1515            CharToken {
1516                byte_offset: 130,
1517                byte_length: 1,
1518                char_offset: 129,
1519                char_length: 1,
1520                token: Token::Special(Special::Separator(Separator::Newline)),
1521            },
1522            CharToken {
1523                byte_offset: 131,
1524                byte_length: 1,
1525                char_offset: 130,
1526                char_length: 1,
1527                token: Token::Special(Special::Separator(Separator::Space)),
1528            },
1529            CharToken {
1530                byte_offset: 132,
1531                byte_length: 14,
1532                char_offset: 131,
1533                char_length: 7,
1534                token: Token::Word(Word::Word("Русское".to_string())),
1535            },
1536            CharToken {
1537                byte_offset: 146,
1538                byte_length: 1,
1539                char_offset: 138,
1540                char_length: 1,
1541                token: Token::Special(Special::Separator(Separator::Space)),
1542            },
1543            CharToken {
1544                byte_offset: 147,
1545                byte_length: 22,
1546                char_offset: 139,
1547                char_length: 11,
1548                token: Token::Word(Word::Word("предложение".to_string())),
1549            },
1550            CharToken {
1551                byte_offset: 169,
1552                byte_length: 1,
1553                char_offset: 150,
1554                char_length: 1,
1555                token: Token::Special(Special::Separator(Separator::Space)),
1556            },
1557            CharToken {
1558                byte_offset: 170,
1559                byte_length: 5,
1560                char_offset: 151,
1561                char_length: 5,
1562                token: Token::Struct(Struct::Hashtag("36.6".to_string())),
1563            },
1564            CharToken {
1565                byte_offset: 175,
1566                byte_length: 1,
1567                char_offset: 156,
1568                char_length: 1,
1569                token: Token::Special(Special::Separator(Separator::Space)),
1570            },
1571            CharToken {
1572                byte_offset: 176,
1573                byte_length: 6,
1574                char_offset: 157,
1575                char_length: 3,
1576                token: Token::Word(Word::Word("для".to_string())),
1577            },
1578            CharToken {
1579                byte_offset: 182,
1580                byte_length: 1,
1581                char_offset: 160,
1582                char_length: 1,
1583                token: Token::Special(Special::Separator(Separator::Space)),
1584            },
1585            CharToken {
1586                byte_offset: 183,
1587                byte_length: 24,
1588                char_offset: 161,
1589                char_length: 12,
1590                token: Token::Word(Word::Word("тестирования".to_string())),
1591            },
1592            CharToken {
1593                byte_offset: 207,
1594                byte_length: 1,
1595                char_offset: 173,
1596                char_length: 1,
1597                token: Token::Special(Special::Separator(Separator::Space)),
1598            },
1599            CharToken {
1600                byte_offset: 208,
1601                byte_length: 14,
1602                char_offset: 174,
1603                char_length: 7,
1604                token: Token::Word(Word::Word("деления".to_string())),
1605            },
1606            CharToken {
1607                byte_offset: 222,
1608                byte_length: 1,
1609                char_offset: 181,
1610                char_length: 1,
1611                token: Token::Special(Special::Separator(Separator::Space)),
1612            },
1613            CharToken {
1614                byte_offset: 223,
1615                byte_length: 4,
1616                char_offset: 182,
1617                char_length: 2,
1618                token: Token::Word(Word::Word("по".to_string())),
1619            },
1620            CharToken {
1621                byte_offset: 227,
1622                byte_length: 1,
1623                char_offset: 184,
1624                char_length: 1,
1625                token: Token::Special(Special::Separator(Separator::Space)),
1626            },
1627            CharToken {
1628                byte_offset: 228,
1629                byte_length: 12,
1630                char_offset: 185,
1631                char_length: 6,
1632                token: Token::Word(Word::Word("юникод".to_string())),
1633            },
1634            CharToken {
1635                byte_offset: 240,
1636                byte_length: 1,
1637                char_offset: 191,
1638                char_length: 1,
1639                token: Token::Special(Special::Punctuation('-')),
1640            },
1641            CharToken {
1642                byte_offset: 241,
1643                byte_length: 12,
1644                char_offset: 192,
1645                char_length: 6,
1646                token: Token::Word(Word::Word("словам".to_string())),
1647            },
1648            CharToken {
1649                byte_offset: 253,
1650                byte_length: 3,
1651                char_offset: 198,
1652                char_length: 3,
1653                token: Token::Special(Special::Punctuation('.')),
1654            },
1655            CharToken {
1656                byte_offset: 256,
1657                byte_length: 1,
1658                char_offset: 201,
1659                char_length: 1,
1660                token: Token::Special(Special::Separator(Separator::Newline)),
1661            },
1662            CharToken {
1663                byte_offset: 257,
1664                byte_length: 8,
1665                char_offset: 202,
1666                char_length: 2,
1667                token: Token::Word(Word::Emoji("russia")),
1668            },
1669            CharToken {
1670                byte_offset: 265,
1671                byte_length: 1,
1672                char_offset: 204,
1673                char_length: 1,
1674                token: Token::Special(Special::Separator(Separator::Space)),
1675            },
1676            CharToken {
1677                byte_offset: 266,
1678                byte_length: 8,
1679                char_offset: 205,
1680                char_length: 2,
1681                token: Token::Word(Word::Emoji("sao_tome_and_principe")),
1682            },
1683            CharToken {
1684                byte_offset: 274,
1685                byte_length: 1,
1686                char_offset: 207,
1687                char_length: 1,
1688                token: Token::Special(Special::Separator(Separator::Newline)),
1689            },
1690            CharToken {
1691                byte_offset: 275,
1692                byte_length: 8,
1693                char_offset: 208,
1694                char_length: 2,
1695                token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
1696            },
1697            CharToken {
1698                byte_offset: 283,
1699                byte_length: 8,
1700                char_offset: 210,
1701                char_length: 2,
1702                token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
1703            },
1704            CharToken {
1705                byte_offset: 291,
1706                byte_length: 8,
1707                char_offset: 212,
1708                char_length: 2,
1709                token: Token::Word(Word::Emoji("man_medium_skin_tone")),
1710            },
1711            CharToken {
1712                byte_offset: 299,
1713                byte_length: 1,
1714                char_offset: 214,
1715                char_length: 1,
1716                token: Token::Special(Special::Separator(Separator::Newline)),
1717            },
1718            CharToken {
1719                byte_offset: 300,
1720                byte_length: 1,
1721                char_offset: 215,
1722                char_length: 1,
1723                token: Token::Special(Special::Punctuation('+')),
1724            },
1725            CharToken {
1726                byte_offset: 301,
1727                byte_length: 4,
1728                char_offset: 216,
1729                char_length: 4,
1730                token: Token::Word(Word::Word("Done".to_string())),
1731            },
1732            CharToken {
1733                byte_offset: 305,
1734                byte_length: 1,
1735                char_offset: 220,
1736                char_length: 1,
1737                token: Token::Special(Special::Punctuation('!')),
1738            },
1739            CharToken {
1740                byte_offset: 306,
1741                byte_length: 1,
1742                char_offset: 221,
1743                char_length: 1,
1744                token: Token::Special(Special::Separator(Separator::Space)),
1745            },
1746            CharToken {
1747                byte_offset: 307,
1748                byte_length: 12,
1749                char_offset: 222,
1750                char_length: 6,
1751                token: Token::Word(Word::Word("Готово".to_string())),
1752            },
1753        ];
1754
1755        let lib_res = uws
1756            .into_tokenizer(TokenizerParams::complex())
1757            .collect::<Vec<_>>();
1758
1759        //print_cresult(); panic!();
1760        check_cresults(&result, &lib_res, uws);
1761    }
1762
1763    #[test]
1764    fn general_default() {
1765        let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A  asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
1766        let result = vec![
1767            PositionalToken {
1768                source: uws,
1769                offset: 0,
1770                length: 3,
1771                token: Token::Word(Word::Word("The".to_string())),
1772            },
1773            PositionalToken {
1774                source: uws,
1775                offset: 3,
1776                length: 1,
1777                token: Token::Special(Special::Separator(Separator::Space)),
1778            },
1779            PositionalToken {
1780                source: uws,
1781                offset: 4,
1782                length: 5,
1783                token: Token::Word(Word::Word("quick".to_string())),
1784            },
1785            PositionalToken {
1786                source: uws,
1787                offset: 9,
1788                length: 1,
1789                token: Token::Special(Special::Separator(Separator::Space)),
1790            },
1791            PositionalToken {
1792                source: uws,
1793                offset: 10,
1794                length: 1,
1795                token: Token::Special(Special::Punctuation('(')),
1796            },
1797            PositionalToken {
1798                source: uws,
1799                offset: 11,
1800                length: 1,
1801                token: Token::Special(Special::Punctuation('"')),
1802            },
1803            PositionalToken {
1804                source: uws,
1805                offset: 12,
1806                length: 5,
1807                token: Token::Word(Word::Word("brown".to_string())),
1808            },
1809            PositionalToken {
1810                source: uws,
1811                offset: 17,
1812                length: 1,
1813                token: Token::Special(Special::Punctuation('"')),
1814            },
1815            PositionalToken {
1816                source: uws,
1817                offset: 18,
1818                length: 1,
1819                token: Token::Special(Special::Punctuation(')')),
1820            },
1821            PositionalToken {
1822                source: uws,
1823                offset: 19,
1824                length: 1,
1825                token: Token::Special(Special::Separator(Separator::Space)),
1826            },
1827            PositionalToken {
1828                source: uws,
1829                offset: 20,
1830                length: 3,
1831                token: Token::Word(Word::Word("fox".to_string())),
1832            },
1833            PositionalToken {
1834                source: uws,
1835                offset: 23,
1836                length: 1,
1837                token: Token::Special(Special::Separator(Separator::Space)),
1838            },
1839            PositionalToken {
1840                source: uws,
1841                offset: 24,
1842                length: 5,
1843                token: Token::Word(Word::Word("can\'t".to_string())),
1844            },
1845            PositionalToken {
1846                source: uws,
1847                offset: 29,
1848                length: 1,
1849                token: Token::Special(Special::Separator(Separator::Space)),
1850            },
1851            PositionalToken {
1852                source: uws,
1853                offset: 30,
1854                length: 4,
1855                token: Token::Word(Word::Word("jump".to_string())),
1856            },
1857            PositionalToken {
1858                source: uws,
1859                offset: 34,
1860                length: 1,
1861                token: Token::Special(Special::Separator(Separator::Space)),
1862            },
1863            PositionalToken {
1864                source: uws,
1865                offset: 35,
1866                length: 4,
1867                token: Token::Word(Word::Number(Number::Float(32.3))),
1868            },
1869            PositionalToken {
1870                source: uws,
1871                offset: 39,
1872                length: 1,
1873                token: Token::Special(Special::Separator(Separator::Space)),
1874            },
1875            PositionalToken {
1876                source: uws,
1877                offset: 40,
1878                length: 4,
1879                token: Token::Word(Word::Word("feet".to_string())),
1880            },
1881            PositionalToken {
1882                source: uws,
1883                offset: 44,
1884                length: 1,
1885                token: Token::Special(Special::Punctuation(',')),
1886            },
1887            PositionalToken {
1888                source: uws,
1889                offset: 45,
1890                length: 1,
1891                token: Token::Special(Special::Separator(Separator::Space)),
1892            },
1893            PositionalToken {
1894                source: uws,
1895                offset: 46,
1896                length: 5,
1897                token: Token::Word(Word::Word("right".to_string())),
1898            },
1899            PositionalToken {
1900                source: uws,
1901                offset: 51,
1902                length: 1,
1903                token: Token::Special(Special::Punctuation('?')),
1904            },
1905            PositionalToken {
1906                source: uws,
1907                offset: 52,
1908                length: 1,
1909                token: Token::Special(Special::Separator(Separator::Space)),
1910            },
1911            PositionalToken {
1912                source: uws,
1913                offset: 53,
1914                length: 4,
1915                token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
1916            }, // TODO
1917            PositionalToken {
1918                source: uws,
1919                offset: 57,
1920                length: 1,
1921                token: Token::Special(Special::Separator(Separator::Space)),
1922            },
1923            PositionalToken {
1924                source: uws,
1925                offset: 58,
1926                length: 3,
1927                token: Token::Word(Word::Word("etc".to_string())),
1928            },
1929            PositionalToken {
1930                source: uws,
1931                offset: 61,
1932                length: 1,
1933                token: Token::Special(Special::Punctuation('.')),
1934            },
1935            PositionalToken {
1936                source: uws,
1937                offset: 62,
1938                length: 1,
1939                token: Token::Special(Special::Separator(Separator::Space)),
1940            },
1941            PositionalToken {
1942                source: uws,
1943                offset: 63,
1944                length: 3,
1945                token: Token::Word(Word::Word("qeq".to_string())),
1946            },
1947            PositionalToken {
1948                source: uws,
1949                offset: 66,
1950                length: 1,
1951                token: Token::Special(Special::Separator(Separator::Space)),
1952            },
1953            PositionalToken {
1954                source: uws,
1955                offset: 67,
1956                length: 1,
1957                token: Token::Word(Word::Word("U".to_string())),
1958            },
1959            PositionalToken {
1960                source: uws,
1961                offset: 68,
1962                length: 1,
1963                token: Token::Special(Special::Punctuation('.')),
1964            },
1965            PositionalToken {
1966                source: uws,
1967                offset: 69,
1968                length: 1,
1969                token: Token::Word(Word::Word("S".to_string())),
1970            },
1971            PositionalToken {
1972                source: uws,
1973                offset: 70,
1974                length: 1,
1975                token: Token::Special(Special::Punctuation('.')),
1976            },
1977            PositionalToken {
1978                source: uws,
1979                offset: 71,
1980                length: 1,
1981                token: Token::Word(Word::Word("A".to_string())),
1982            },
1983            PositionalToken {
1984                source: uws,
1985                offset: 72,
1986                length: 2,
1987                token: Token::Special(Special::Separator(Separator::Space)),
1988            },
1989            PositionalToken {
1990                source: uws,
1991                offset: 74,
1992                length: 3,
1993                token: Token::Word(Word::Word("asd".to_string())),
1994            },
1995            PositionalToken {
1996                source: uws,
1997                offset: 77,
1998                length: 3,
1999                token: Token::Special(Special::Separator(Separator::Newline)),
2000            },
2001            PositionalToken {
2002                source: uws,
2003                offset: 80,
2004                length: 3,
2005                token: Token::Word(Word::Word("Brr".to_string())),
2006            },
2007            PositionalToken {
2008                source: uws,
2009                offset: 83,
2010                length: 1,
2011                token: Token::Special(Special::Punctuation(',')),
2012            },
2013            PositionalToken {
2014                source: uws,
2015                offset: 84,
2016                length: 1,
2017                token: Token::Special(Special::Separator(Separator::Space)),
2018            },
2019            PositionalToken {
2020                source: uws,
2021                offset: 85,
2022                length: 4,
2023                token: Token::Word(Word::Word("it\'s".to_string())),
2024            },
2025            PositionalToken {
2026                source: uws,
2027                offset: 89,
2028                length: 1,
2029                token: Token::Special(Special::Separator(Separator::Space)),
2030            },
2031            PositionalToken {
2032                source: uws,
2033                offset: 90,
2034                length: 4,
2035                token: Token::Word(Word::Number(Number::Float(29.3))),
2036            },
2037            PositionalToken {
2038                source: uws,
2039                offset: 94,
2040                length: 2,
2041                token: Token::Special(Special::Symbol('°')),
2042            },
2043            PositionalToken {
2044                source: uws,
2045                offset: 96,
2046                length: 1,
2047                token: Token::Word(Word::Word("F".to_string())),
2048            },
2049            PositionalToken {
2050                source: uws,
2051                offset: 97,
2052                length: 1,
2053                token: Token::Special(Special::Punctuation('!')),
2054            },
2055            PositionalToken {
2056                source: uws,
2057                offset: 98,
2058                length: 1,
2059                token: Token::Special(Special::Separator(Separator::Newline)),
2060            },
2061            PositionalToken {
2062                source: uws,
2063                offset: 99,
2064                length: 1,
2065                token: Token::Special(Special::Separator(Separator::Space)),
2066            },
2067            PositionalToken {
2068                source: uws,
2069                offset: 100,
2070                length: 14,
2071                token: Token::Word(Word::Word("Русское".to_string())),
2072            },
2073            PositionalToken {
2074                source: uws,
2075                offset: 114,
2076                length: 1,
2077                token: Token::Special(Special::Separator(Separator::Space)),
2078            },
2079            PositionalToken {
2080                source: uws,
2081                offset: 115,
2082                length: 22,
2083                token: Token::Word(Word::Word("предложение".to_string())),
2084            },
2085            PositionalToken {
2086                source: uws,
2087                offset: 137,
2088                length: 1,
2089                token: Token::Special(Special::Separator(Separator::Space)),
2090            },
2091            PositionalToken {
2092                source: uws,
2093                offset: 138,
2094                length: 1,
2095                token: Token::Special(Special::Punctuation('#')),
2096            },
2097            PositionalToken {
2098                source: uws,
2099                offset: 139,
2100                length: 4,
2101                token: Token::Word(Word::Number(Number::Float(36.6))),
2102            },
2103            PositionalToken {
2104                source: uws,
2105                offset: 143,
2106                length: 1,
2107                token: Token::Special(Special::Separator(Separator::Space)),
2108            },
2109            PositionalToken {
2110                source: uws,
2111                offset: 144,
2112                length: 6,
2113                token: Token::Word(Word::Word("для".to_string())),
2114            },
2115            PositionalToken {
2116                source: uws,
2117                offset: 150,
2118                length: 1,
2119                token: Token::Special(Special::Separator(Separator::Space)),
2120            },
2121            PositionalToken {
2122                source: uws,
2123                offset: 151,
2124                length: 24,
2125                token: Token::Word(Word::Word("тестирования".to_string())),
2126            },
2127            PositionalToken {
2128                source: uws,
2129                offset: 175,
2130                length: 1,
2131                token: Token::Special(Special::Separator(Separator::Space)),
2132            },
2133            PositionalToken {
2134                source: uws,
2135                offset: 176,
2136                length: 14,
2137                token: Token::Word(Word::Word("деления".to_string())),
2138            },
2139            PositionalToken {
2140                source: uws,
2141                offset: 190,
2142                length: 1,
2143                token: Token::Special(Special::Separator(Separator::Space)),
2144            },
2145            PositionalToken {
2146                source: uws,
2147                offset: 191,
2148                length: 4,
2149                token: Token::Word(Word::Word("по".to_string())),
2150            },
2151            PositionalToken {
2152                source: uws,
2153                offset: 195,
2154                length: 1,
2155                token: Token::Special(Special::Separator(Separator::Space)),
2156            },
2157            PositionalToken {
2158                source: uws,
2159                offset: 196,
2160                length: 12,
2161                token: Token::Word(Word::Word("юникод".to_string())),
2162            },
2163            PositionalToken {
2164                source: uws,
2165                offset: 208,
2166                length: 1,
2167                token: Token::Special(Special::Punctuation('-')),
2168            },
2169            PositionalToken {
2170                source: uws,
2171                offset: 209,
2172                length: 12,
2173                token: Token::Word(Word::Word("словам".to_string())),
2174            },
2175            PositionalToken {
2176                source: uws,
2177                offset: 221,
2178                length: 3,
2179                token: Token::Special(Special::Punctuation('.')),
2180            },
2181            PositionalToken {
2182                source: uws,
2183                offset: 224,
2184                length: 1,
2185                token: Token::Special(Special::Separator(Separator::Newline)),
2186            },
2187        ];
2188        let lib_res = uws
2189            .into_tokenizer(TokenizerParams::v1())
2190            .collect::<Vec<_>>();
2191        check_results(&result, &lib_res, uws);
2192    }
2193
2194    #[test]
2195    fn general_no_split() {
2196        let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A  asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
2197        let result = vec![
2198            PositionalToken {
2199                source: uws,
2200                offset: 0,
2201                length: 3,
2202                token: Token::Word(Word::Word("The".to_string())),
2203            },
2204            PositionalToken {
2205                source: uws,
2206                offset: 3,
2207                length: 1,
2208                token: Token::Special(Special::Separator(Separator::Space)),
2209            },
2210            PositionalToken {
2211                source: uws,
2212                offset: 4,
2213                length: 5,
2214                token: Token::Word(Word::Word("quick".to_string())),
2215            },
2216            PositionalToken {
2217                source: uws,
2218                offset: 9,
2219                length: 1,
2220                token: Token::Special(Special::Separator(Separator::Space)),
2221            },
2222            PositionalToken {
2223                source: uws,
2224                offset: 10,
2225                length: 1,
2226                token: Token::Special(Special::Punctuation('(')),
2227            },
2228            PositionalToken {
2229                source: uws,
2230                offset: 11,
2231                length: 1,
2232                token: Token::Special(Special::Punctuation('"')),
2233            },
2234            PositionalToken {
2235                source: uws,
2236                offset: 12,
2237                length: 5,
2238                token: Token::Word(Word::Word("brown".to_string())),
2239            },
2240            PositionalToken {
2241                source: uws,
2242                offset: 17,
2243                length: 1,
2244                token: Token::Special(Special::Punctuation('"')),
2245            },
2246            PositionalToken {
2247                source: uws,
2248                offset: 18,
2249                length: 1,
2250                token: Token::Special(Special::Punctuation(')')),
2251            },
2252            PositionalToken {
2253                source: uws,
2254                offset: 19,
2255                length: 1,
2256                token: Token::Special(Special::Separator(Separator::Space)),
2257            },
2258            PositionalToken {
2259                source: uws,
2260                offset: 20,
2261                length: 3,
2262                token: Token::Word(Word::Word("fox".to_string())),
2263            },
2264            PositionalToken {
2265                source: uws,
2266                offset: 23,
2267                length: 1,
2268                token: Token::Special(Special::Separator(Separator::Space)),
2269            },
2270            PositionalToken {
2271                source: uws,
2272                offset: 24,
2273                length: 5,
2274                token: Token::Word(Word::Word("can\'t".to_string())),
2275            },
2276            PositionalToken {
2277                source: uws,
2278                offset: 29,
2279                length: 1,
2280                token: Token::Special(Special::Separator(Separator::Space)),
2281            },
2282            PositionalToken {
2283                source: uws,
2284                offset: 30,
2285                length: 4,
2286                token: Token::Word(Word::Word("jump".to_string())),
2287            },
2288            PositionalToken {
2289                source: uws,
2290                offset: 34,
2291                length: 1,
2292                token: Token::Special(Special::Separator(Separator::Space)),
2293            },
2294            PositionalToken {
2295                source: uws,
2296                offset: 35,
2297                length: 4,
2298                token: Token::Word(Word::Number(Number::Float(32.3))),
2299            },
2300            PositionalToken {
2301                source: uws,
2302                offset: 39,
2303                length: 1,
2304                token: Token::Special(Special::Separator(Separator::Space)),
2305            },
2306            PositionalToken {
2307                source: uws,
2308                offset: 40,
2309                length: 4,
2310                token: Token::Word(Word::Word("feet".to_string())),
2311            },
2312            PositionalToken {
2313                source: uws,
2314                offset: 44,
2315                length: 1,
2316                token: Token::Special(Special::Punctuation(',')),
2317            },
2318            PositionalToken {
2319                source: uws,
2320                offset: 45,
2321                length: 1,
2322                token: Token::Special(Special::Separator(Separator::Space)),
2323            },
2324            PositionalToken {
2325                source: uws,
2326                offset: 46,
2327                length: 5,
2328                token: Token::Word(Word::Word("right".to_string())),
2329            },
2330            PositionalToken {
2331                source: uws,
2332                offset: 51,
2333                length: 1,
2334                token: Token::Special(Special::Punctuation('?')),
2335            },
2336            PositionalToken {
2337                source: uws,
2338                offset: 52,
2339                length: 1,
2340                token: Token::Special(Special::Separator(Separator::Space)),
2341            },
2342            PositionalToken {
2343                source: uws,
2344                offset: 53,
2345                length: 4,
2346                token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
2347            }, // TODO
2348            PositionalToken {
2349                source: uws,
2350                offset: 57,
2351                length: 1,
2352                token: Token::Special(Special::Separator(Separator::Space)),
2353            },
2354            PositionalToken {
2355                source: uws,
2356                offset: 58,
2357                length: 3,
2358                token: Token::Word(Word::Word("etc".to_string())),
2359            },
2360            PositionalToken {
2361                source: uws,
2362                offset: 61,
2363                length: 1,
2364                token: Token::Special(Special::Punctuation('.')),
2365            },
2366            PositionalToken {
2367                source: uws,
2368                offset: 62,
2369                length: 1,
2370                token: Token::Special(Special::Separator(Separator::Space)),
2371            },
2372            PositionalToken {
2373                source: uws,
2374                offset: 63,
2375                length: 3,
2376                token: Token::Word(Word::Word("qeq".to_string())),
2377            },
2378            PositionalToken {
2379                source: uws,
2380                offset: 66,
2381                length: 1,
2382                token: Token::Special(Special::Separator(Separator::Space)),
2383            },
2384            PositionalToken {
2385                source: uws,
2386                offset: 67,
2387                length: 5,
2388                token: Token::Word(Word::Word("U.S.A".to_string())),
2389            },
2390            PositionalToken {
2391                source: uws,
2392                offset: 72,
2393                length: 1,
2394                token: Token::Special(Special::Separator(Separator::Space)),
2395            },
2396            PositionalToken {
2397                source: uws,
2398                offset: 73,
2399                length: 1,
2400                token: Token::Special(Special::Separator(Separator::Space)),
2401            },
2402            PositionalToken {
2403                source: uws,
2404                offset: 74,
2405                length: 3,
2406                token: Token::Word(Word::Word("asd".to_string())),
2407            },
2408            PositionalToken {
2409                source: uws,
2410                offset: 77,
2411                length: 1,
2412                token: Token::Special(Special::Separator(Separator::Newline)),
2413            },
2414            PositionalToken {
2415                source: uws,
2416                offset: 78,
2417                length: 1,
2418                token: Token::Special(Special::Separator(Separator::Newline)),
2419            },
2420            PositionalToken {
2421                source: uws,
2422                offset: 79,
2423                length: 1,
2424                token: Token::Special(Special::Separator(Separator::Newline)),
2425            },
2426            PositionalToken {
2427                source: uws,
2428                offset: 80,
2429                length: 3,
2430                token: Token::Word(Word::Word("Brr".to_string())),
2431            },
2432            PositionalToken {
2433                source: uws,
2434                offset: 83,
2435                length: 1,
2436                token: Token::Special(Special::Punctuation(',')),
2437            },
2438            PositionalToken {
2439                source: uws,
2440                offset: 84,
2441                length: 1,
2442                token: Token::Special(Special::Separator(Separator::Space)),
2443            },
2444            PositionalToken {
2445                source: uws,
2446                offset: 85,
2447                length: 4,
2448                token: Token::Word(Word::Word("it\'s".to_string())),
2449            },
2450            PositionalToken {
2451                source: uws,
2452                offset: 89,
2453                length: 1,
2454                token: Token::Special(Special::Separator(Separator::Space)),
2455            },
2456            PositionalToken {
2457                source: uws,
2458                offset: 90,
2459                length: 4,
2460                token: Token::Word(Word::Number(Number::Float(29.3))),
2461            },
2462            PositionalToken {
2463                source: uws,
2464                offset: 94,
2465                length: 2,
2466                token: Token::Special(Special::Symbol('°')),
2467            },
2468            PositionalToken {
2469                source: uws,
2470                offset: 96,
2471                length: 1,
2472                token: Token::Word(Word::Word("F".to_string())),
2473            },
2474            PositionalToken {
2475                source: uws,
2476                offset: 97,
2477                length: 1,
2478                token: Token::Special(Special::Punctuation('!')),
2479            },
2480            PositionalToken {
2481                source: uws,
2482                offset: 98,
2483                length: 1,
2484                token: Token::Special(Special::Separator(Separator::Newline)),
2485            },
2486            PositionalToken {
2487                source: uws,
2488                offset: 99,
2489                length: 1,
2490                token: Token::Special(Special::Separator(Separator::Space)),
2491            },
2492            PositionalToken {
2493                source: uws,
2494                offset: 100,
2495                length: 14,
2496                token: Token::Word(Word::Word("Русское".to_string())),
2497            },
2498            PositionalToken {
2499                source: uws,
2500                offset: 114,
2501                length: 1,
2502                token: Token::Special(Special::Separator(Separator::Space)),
2503            },
2504            PositionalToken {
2505                source: uws,
2506                offset: 115,
2507                length: 22,
2508                token: Token::Word(Word::Word("предложение".to_string())),
2509            },
2510            PositionalToken {
2511                source: uws,
2512                offset: 137,
2513                length: 1,
2514                token: Token::Special(Special::Separator(Separator::Space)),
2515            },
2516            PositionalToken {
2517                source: uws,
2518                offset: 138,
2519                length: 1,
2520                token: Token::Special(Special::Punctuation('#')),
2521            },
2522            PositionalToken {
2523                source: uws,
2524                offset: 139,
2525                length: 4,
2526                token: Token::Word(Word::Number(Number::Float(36.6))),
2527            },
2528            PositionalToken {
2529                source: uws,
2530                offset: 143,
2531                length: 1,
2532                token: Token::Special(Special::Separator(Separator::Space)),
2533            },
2534            PositionalToken {
2535                source: uws,
2536                offset: 144,
2537                length: 6,
2538                token: Token::Word(Word::Word("для".to_string())),
2539            },
2540            PositionalToken {
2541                source: uws,
2542                offset: 150,
2543                length: 1,
2544                token: Token::Special(Special::Separator(Separator::Space)),
2545            },
2546            PositionalToken {
2547                source: uws,
2548                offset: 151,
2549                length: 24,
2550                token: Token::Word(Word::Word("тестирования".to_string())),
2551            },
2552            PositionalToken {
2553                source: uws,
2554                offset: 175,
2555                length: 1,
2556                token: Token::Special(Special::Separator(Separator::Space)),
2557            },
2558            PositionalToken {
2559                source: uws,
2560                offset: 176,
2561                length: 14,
2562                token: Token::Word(Word::Word("деления".to_string())),
2563            },
2564            PositionalToken {
2565                source: uws,
2566                offset: 190,
2567                length: 1,
2568                token: Token::Special(Special::Separator(Separator::Space)),
2569            },
2570            PositionalToken {
2571                source: uws,
2572                offset: 191,
2573                length: 4,
2574                token: Token::Word(Word::Word("по".to_string())),
2575            },
2576            PositionalToken {
2577                source: uws,
2578                offset: 195,
2579                length: 1,
2580                token: Token::Special(Special::Separator(Separator::Space)),
2581            },
2582            PositionalToken {
2583                source: uws,
2584                offset: 196,
2585                length: 12,
2586                token: Token::Word(Word::Word("юникод".to_string())),
2587            },
2588            PositionalToken {
2589                source: uws,
2590                offset: 208,
2591                length: 1,
2592                token: Token::Special(Special::Punctuation('-')),
2593            },
2594            PositionalToken {
2595                source: uws,
2596                offset: 209,
2597                length: 12,
2598                token: Token::Word(Word::Word("словам".to_string())),
2599            },
2600            PositionalToken {
2601                source: uws,
2602                offset: 221,
2603                length: 1,
2604                token: Token::Special(Special::Punctuation('.')),
2605            },
2606            PositionalToken {
2607                source: uws,
2608                offset: 222,
2609                length: 1,
2610                token: Token::Special(Special::Punctuation('.')),
2611            },
2612            PositionalToken {
2613                source: uws,
2614                offset: 223,
2615                length: 1,
2616                token: Token::Special(Special::Punctuation('.')),
2617            },
2618            PositionalToken {
2619                source: uws,
2620                offset: 224,
2621                length: 1,
2622                token: Token::Special(Special::Separator(Separator::Newline)),
2623            },
2624        ];
2625        let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
2626        check_results(&result, &lib_res, uws);
2627    }
2628
2629    #[test]
2630    fn general_complex() {
2631        let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A  asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
2632        let result = vec![
2633            PositionalToken {
2634                source: uws,
2635                offset: 0,
2636                length: 3,
2637                token: Token::Word(Word::Word("The".to_string())),
2638            },
2639            PositionalToken {
2640                source: uws,
2641                offset: 3,
2642                length: 1,
2643                token: Token::Special(Special::Separator(Separator::Space)),
2644            },
2645            PositionalToken {
2646                source: uws,
2647                offset: 4,
2648                length: 5,
2649                token: Token::Word(Word::Word("quick".to_string())),
2650            },
2651            PositionalToken {
2652                source: uws,
2653                offset: 9,
2654                length: 1,
2655                token: Token::Special(Special::Separator(Separator::Space)),
2656            },
2657            PositionalToken {
2658                source: uws,
2659                offset: 10,
2660                length: 1,
2661                token: Token::Special(Special::Punctuation('(')),
2662            },
2663            PositionalToken {
2664                source: uws,
2665                offset: 11,
2666                length: 1,
2667                token: Token::Special(Special::Punctuation('"')),
2668            },
2669            PositionalToken {
2670                source: uws,
2671                offset: 12,
2672                length: 5,
2673                token: Token::Word(Word::Word("brown".to_string())),
2674            },
2675            PositionalToken {
2676                source: uws,
2677                offset: 17,
2678                length: 1,
2679                token: Token::Special(Special::Punctuation('"')),
2680            },
2681            PositionalToken {
2682                source: uws,
2683                offset: 18,
2684                length: 1,
2685                token: Token::Special(Special::Punctuation(')')),
2686            },
2687            PositionalToken {
2688                source: uws,
2689                offset: 19,
2690                length: 1,
2691                token: Token::Special(Special::Separator(Separator::Space)),
2692            },
2693            PositionalToken {
2694                source: uws,
2695                offset: 20,
2696                length: 3,
2697                token: Token::Word(Word::Word("fox".to_string())),
2698            },
2699            PositionalToken {
2700                source: uws,
2701                offset: 23,
2702                length: 1,
2703                token: Token::Special(Special::Separator(Separator::Space)),
2704            },
2705            PositionalToken {
2706                source: uws,
2707                offset: 24,
2708                length: 5,
2709                token: Token::Word(Word::Word("can\'t".to_string())),
2710            },
2711            PositionalToken {
2712                source: uws,
2713                offset: 29,
2714                length: 1,
2715                token: Token::Special(Special::Separator(Separator::Space)),
2716            },
2717            PositionalToken {
2718                source: uws,
2719                offset: 30,
2720                length: 4,
2721                token: Token::Word(Word::Word("jump".to_string())),
2722            },
2723            PositionalToken {
2724                source: uws,
2725                offset: 34,
2726                length: 1,
2727                token: Token::Special(Special::Separator(Separator::Space)),
2728            },
2729            PositionalToken {
2730                source: uws,
2731                offset: 35,
2732                length: 4,
2733                token: Token::Word(Word::Number(Number::Float(32.3))),
2734            },
2735            PositionalToken {
2736                source: uws,
2737                offset: 39,
2738                length: 1,
2739                token: Token::Special(Special::Separator(Separator::Space)),
2740            },
2741            PositionalToken {
2742                source: uws,
2743                offset: 40,
2744                length: 4,
2745                token: Token::Word(Word::Word("feet".to_string())),
2746            },
2747            PositionalToken {
2748                source: uws,
2749                offset: 44,
2750                length: 1,
2751                token: Token::Special(Special::Punctuation(',')),
2752            },
2753            PositionalToken {
2754                source: uws,
2755                offset: 45,
2756                length: 1,
2757                token: Token::Special(Special::Separator(Separator::Space)),
2758            },
2759            PositionalToken {
2760                source: uws,
2761                offset: 46,
2762                length: 5,
2763                token: Token::Word(Word::Word("right".to_string())),
2764            },
2765            PositionalToken {
2766                source: uws,
2767                offset: 51,
2768                length: 1,
2769                token: Token::Special(Special::Punctuation('?')),
2770            },
2771            PositionalToken {
2772                source: uws,
2773                offset: 52,
2774                length: 1,
2775                token: Token::Special(Special::Separator(Separator::Space)),
2776            },
2777            PositionalToken {
2778                source: uws,
2779                offset: 53,
2780                length: 4,
2781                token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
2782            }, // TODO
2783            PositionalToken {
2784                source: uws,
2785                offset: 57,
2786                length: 1,
2787                token: Token::Special(Special::Separator(Separator::Space)),
2788            },
2789            PositionalToken {
2790                source: uws,
2791                offset: 58,
2792                length: 3,
2793                token: Token::Word(Word::Word("etc".to_string())),
2794            },
2795            PositionalToken {
2796                source: uws,
2797                offset: 61,
2798                length: 1,
2799                token: Token::Special(Special::Punctuation('.')),
2800            },
2801            PositionalToken {
2802                source: uws,
2803                offset: 62,
2804                length: 1,
2805                token: Token::Special(Special::Separator(Separator::Space)),
2806            },
2807            PositionalToken {
2808                source: uws,
2809                offset: 63,
2810                length: 3,
2811                token: Token::Word(Word::Word("qeq".to_string())),
2812            },
2813            PositionalToken {
2814                source: uws,
2815                offset: 66,
2816                length: 1,
2817                token: Token::Special(Special::Separator(Separator::Space)),
2818            },
2819            PositionalToken {
2820                source: uws,
2821                offset: 67,
2822                length: 5,
2823                token: Token::Word(Word::Word("U.S.A".to_string())),
2824            },
2825            PositionalToken {
2826                source: uws,
2827                offset: 72,
2828                length: 2,
2829                token: Token::Special(Special::Separator(Separator::Space)),
2830            },
2831            PositionalToken {
2832                source: uws,
2833                offset: 74,
2834                length: 3,
2835                token: Token::Word(Word::Word("asd".to_string())),
2836            },
2837            PositionalToken {
2838                source: uws,
2839                offset: 77,
2840                length: 3,
2841                token: Token::Special(Special::Separator(Separator::Newline)),
2842            },
2843            PositionalToken {
2844                source: uws,
2845                offset: 80,
2846                length: 3,
2847                token: Token::Word(Word::Word("Brr".to_string())),
2848            },
2849            PositionalToken {
2850                source: uws,
2851                offset: 83,
2852                length: 1,
2853                token: Token::Special(Special::Punctuation(',')),
2854            },
2855            PositionalToken {
2856                source: uws,
2857                offset: 84,
2858                length: 1,
2859                token: Token::Special(Special::Separator(Separator::Space)),
2860            },
2861            PositionalToken {
2862                source: uws,
2863                offset: 85,
2864                length: 4,
2865                token: Token::Word(Word::Word("it\'s".to_string())),
2866            },
2867            PositionalToken {
2868                source: uws,
2869                offset: 89,
2870                length: 1,
2871                token: Token::Special(Special::Separator(Separator::Space)),
2872            },
2873            PositionalToken {
2874                source: uws,
2875                offset: 90,
2876                length: 4,
2877                token: Token::Word(Word::Number(Number::Float(29.3))),
2878            },
2879            PositionalToken {
2880                source: uws,
2881                offset: 94,
2882                length: 2,
2883                token: Token::Special(Special::Symbol('°')),
2884            },
2885            PositionalToken {
2886                source: uws,
2887                offset: 96,
2888                length: 1,
2889                token: Token::Word(Word::Word("F".to_string())),
2890            },
2891            PositionalToken {
2892                source: uws,
2893                offset: 97,
2894                length: 1,
2895                token: Token::Special(Special::Punctuation('!')),
2896            },
2897            PositionalToken {
2898                source: uws,
2899                offset: 98,
2900                length: 1,
2901                token: Token::Special(Special::Separator(Separator::Newline)),
2902            },
2903            PositionalToken {
2904                source: uws,
2905                offset: 99,
2906                length: 1,
2907                token: Token::Special(Special::Separator(Separator::Space)),
2908            },
2909            PositionalToken {
2910                source: uws,
2911                offset: 100,
2912                length: 14,
2913                token: Token::Word(Word::Word("Русское".to_string())),
2914            },
2915            PositionalToken {
2916                source: uws,
2917                offset: 114,
2918                length: 1,
2919                token: Token::Special(Special::Separator(Separator::Space)),
2920            },
2921            PositionalToken {
2922                source: uws,
2923                offset: 115,
2924                length: 22,
2925                token: Token::Word(Word::Word("предложение".to_string())),
2926            },
2927            PositionalToken {
2928                source: uws,
2929                offset: 137,
2930                length: 1,
2931                token: Token::Special(Special::Separator(Separator::Space)),
2932            },
2933            PositionalToken {
2934                source: uws,
2935                offset: 138,
2936                length: 5,
2937                token: Token::Struct(Struct::Hashtag("36.6".to_string())),
2938            },
2939            PositionalToken {
2940                source: uws,
2941                offset: 143,
2942                length: 1,
2943                token: Token::Special(Special::Separator(Separator::Space)),
2944            },
2945            PositionalToken {
2946                source: uws,
2947                offset: 144,
2948                length: 6,
2949                token: Token::Word(Word::Word("для".to_string())),
2950            },
2951            PositionalToken {
2952                source: uws,
2953                offset: 150,
2954                length: 1,
2955                token: Token::Special(Special::Separator(Separator::Space)),
2956            },
2957            PositionalToken {
2958                source: uws,
2959                offset: 151,
2960                length: 24,
2961                token: Token::Word(Word::Word("тестирования".to_string())),
2962            },
2963            PositionalToken {
2964                source: uws,
2965                offset: 175,
2966                length: 1,
2967                token: Token::Special(Special::Separator(Separator::Space)),
2968            },
2969            PositionalToken {
2970                source: uws,
2971                offset: 176,
2972                length: 14,
2973                token: Token::Word(Word::Word("деления".to_string())),
2974            },
2975            PositionalToken {
2976                source: uws,
2977                offset: 190,
2978                length: 1,
2979                token: Token::Special(Special::Separator(Separator::Space)),
2980            },
2981            PositionalToken {
2982                source: uws,
2983                offset: 191,
2984                length: 4,
2985                token: Token::Word(Word::Word("по".to_string())),
2986            },
2987            PositionalToken {
2988                source: uws,
2989                offset: 195,
2990                length: 1,
2991                token: Token::Special(Special::Separator(Separator::Space)),
2992            },
2993            PositionalToken {
2994                source: uws,
2995                offset: 196,
2996                length: 12,
2997                token: Token::Word(Word::Word("юникод".to_string())),
2998            },
2999            PositionalToken {
3000                source: uws,
3001                offset: 208,
3002                length: 1,
3003                token: Token::Special(Special::Punctuation('-')),
3004            },
3005            PositionalToken {
3006                source: uws,
3007                offset: 209,
3008                length: 12,
3009                token: Token::Word(Word::Word("словам".to_string())),
3010            },
3011            PositionalToken {
3012                source: uws,
3013                offset: 221,
3014                length: 3,
3015                token: Token::Special(Special::Punctuation('.')),
3016            },
3017            PositionalToken {
3018                source: uws,
3019                offset: 224,
3020                length: 1,
3021                token: Token::Special(Special::Separator(Separator::Newline)),
3022            },
3023        ];
3024        let lib_res = uws
3025            .into_tokenizer(TokenizerParams::complex())
3026            .collect::<Vec<_>>();
3027        check_results(&result, &lib_res, uws);
3028    }
3029
3030    #[test]
3031    fn plus_minus() {
3032        let uws = "+23 -4.5 -34 +25.7 - 2 + 5.6";
3033        let result = vec![
3034            PositionalToken {
3035                source: uws,
3036                offset: 0,
3037                length: 3,
3038                token: Token::Word(Word::Number(Number::Integer(23))),
3039            },
3040            PositionalToken {
3041                source: uws,
3042                offset: 3,
3043                length: 1,
3044                token: Token::Special(Special::Separator(Separator::Space)),
3045            },
3046            PositionalToken {
3047                source: uws,
3048                offset: 4,
3049                length: 4,
3050                token: Token::Word(Word::Number(Number::Float(-4.5))),
3051            },
3052            PositionalToken {
3053                source: uws,
3054                offset: 8,
3055                length: 1,
3056                token: Token::Special(Special::Separator(Separator::Space)),
3057            },
3058            PositionalToken {
3059                source: uws,
3060                offset: 9,
3061                length: 3,
3062                token: Token::Word(Word::Number(Number::Integer(-34))),
3063            },
3064            PositionalToken {
3065                source: uws,
3066                offset: 12,
3067                length: 1,
3068                token: Token::Special(Special::Separator(Separator::Space)),
3069            },
3070            PositionalToken {
3071                source: uws,
3072                offset: 13,
3073                length: 5,
3074                token: Token::Word(Word::Number(Number::Float(25.7))),
3075            },
3076            PositionalToken {
3077                source: uws,
3078                offset: 18,
3079                length: 1,
3080                token: Token::Special(Special::Separator(Separator::Space)),
3081            },
3082            PositionalToken {
3083                source: uws,
3084                offset: 19,
3085                length: 1,
3086                token: Token::Special(Special::Punctuation('-')),
3087            },
3088            PositionalToken {
3089                source: uws,
3090                offset: 20,
3091                length: 1,
3092                token: Token::Special(Special::Separator(Separator::Space)),
3093            },
3094            PositionalToken {
3095                source: uws,
3096                offset: 21,
3097                length: 1,
3098                token: Token::Word(Word::Number(Number::Integer(2))),
3099            },
3100            PositionalToken {
3101                source: uws,
3102                offset: 22,
3103                length: 1,
3104                token: Token::Special(Special::Separator(Separator::Space)),
3105            },
3106            PositionalToken {
3107                source: uws,
3108                offset: 23,
3109                length: 1,
3110                token: Token::Special(Special::Punctuation('+')),
3111            },
3112            PositionalToken {
3113                source: uws,
3114                offset: 24,
3115                length: 1,
3116                token: Token::Special(Special::Separator(Separator::Space)),
3117            },
3118            PositionalToken {
3119                source: uws,
3120                offset: 25,
3121                length: 3,
3122                token: Token::Word(Word::Number(Number::Float(5.6))),
3123            },
3124        ];
3125        let lib_res = uws
3126            .into_tokenizer(TokenizerParams::v1())
3127            .collect::<Vec<_>>();
3128        check(&result, &lib_res, uws);
3129        //print_result(&lib_res); panic!("")
3130    }
3131
3132    #[test]
3133    #[ignore]
3134    fn woman_bouncing_ball() {
3135        let uws = "\u{26f9}\u{200d}\u{2640}";
3136        let result = vec![PositionalToken {
3137            source: uws,
3138            offset: 0,
3139            length: 9,
3140            token: Token::Word(Word::Emoji("woman_bouncing_ball")),
3141        }];
3142        let lib_res = uws
3143            .into_tokenizer(TokenizerParams::v1())
3144            .collect::<Vec<_>>();
3145        check_results(&result, &lib_res, uws);
3146        //print_result(&lib_res); panic!("")
3147    }
3148
3149    #[test]
3150    fn emoji_and_rusabbr_default() {
3151        let uws = "🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n👱\nС.С.С.Р.\n👨‍👩‍👦‍👦\n🧠\n";
3152        let result = vec![
3153            PositionalToken {
3154                source: uws,
3155                offset: 0,
3156                length: 8,
3157                token: Token::Word(Word::Emoji("russia")),
3158            },
3159            PositionalToken {
3160                source: uws,
3161                offset: 8,
3162                length: 1,
3163                token: Token::Special(Special::Separator(Separator::Space)),
3164            },
3165            PositionalToken {
3166                source: uws,
3167                offset: 9,
3168                length: 8,
3169                token: Token::Word(Word::Emoji("sao_tome_and_principe")),
3170            },
3171            PositionalToken {
3172                source: uws,
3173                offset: 17,
3174                length: 1,
3175                token: Token::Special(Special::Separator(Separator::Newline)),
3176            },
3177            PositionalToken {
3178                source: uws,
3179                offset: 18,
3180                length: 8,
3181                token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
3182            },
3183            PositionalToken {
3184                source: uws,
3185                offset: 26,
3186                length: 8,
3187                token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
3188            },
3189            PositionalToken {
3190                source: uws,
3191                offset: 34,
3192                length: 8,
3193                token: Token::Word(Word::Emoji("man_medium_skin_tone")),
3194            },
3195            PositionalToken {
3196                source: uws,
3197                offset: 42,
3198                length: 1,
3199                token: Token::Special(Special::Separator(Separator::Newline)),
3200            },
3201            PositionalToken {
3202                source: uws,
3203                offset: 43,
3204                length: 4,
3205                token: Token::Word(Word::Emoji("blond_haired_person")),
3206            },
3207            PositionalToken {
3208                source: uws,
3209                offset: 47,
3210                length: 1,
3211                token: Token::Special(Special::Separator(Separator::Newline)),
3212            },
3213            PositionalToken {
3214                source: uws,
3215                offset: 48,
3216                length: 2,
3217                token: Token::Word(Word::Word("С".to_string())),
3218            },
3219            PositionalToken {
3220                source: uws,
3221                offset: 50,
3222                length: 1,
3223                token: Token::Special(Special::Punctuation('.')),
3224            },
3225            PositionalToken {
3226                source: uws,
3227                offset: 51,
3228                length: 2,
3229                token: Token::Word(Word::Word("С".to_string())),
3230            },
3231            PositionalToken {
3232                source: uws,
3233                offset: 53,
3234                length: 1,
3235                token: Token::Special(Special::Punctuation('.')),
3236            },
3237            PositionalToken {
3238                source: uws,
3239                offset: 54,
3240                length: 2,
3241                token: Token::Word(Word::Word("С".to_string())),
3242            },
3243            PositionalToken {
3244                source: uws,
3245                offset: 56,
3246                length: 1,
3247                token: Token::Special(Special::Punctuation('.')),
3248            },
3249            PositionalToken {
3250                source: uws,
3251                offset: 57,
3252                length: 2,
3253                token: Token::Word(Word::Word("Р".to_string())),
3254            },
3255            PositionalToken {
3256                source: uws,
3257                offset: 59,
3258                length: 1,
3259                token: Token::Special(Special::Punctuation('.')),
3260            },
3261            PositionalToken {
3262                source: uws,
3263                offset: 60,
3264                length: 1,
3265                token: Token::Special(Special::Separator(Separator::Newline)),
3266            },
3267            PositionalToken {
3268                source: uws,
3269                offset: 61,
3270                length: 25,
3271                token: Token::Word(Word::Emoji("family_man_woman_boy_boy")),
3272            },
3273            PositionalToken {
3274                source: uws,
3275                offset: 86,
3276                length: 1,
3277                token: Token::Special(Special::Separator(Separator::Newline)),
3278            },
3279            PositionalToken {
3280                source: uws,
3281                offset: 87,
3282                length: 4,
3283                token: Token::Word(Word::Emoji("brain")),
3284            },
3285            PositionalToken {
3286                source: uws,
3287                offset: 91,
3288                length: 1,
3289                token: Token::Special(Special::Separator(Separator::Newline)),
3290            },
3291        ];
3292
3293        let lib_res = uws
3294            .into_tokenizer(TokenizerParams::v1())
3295            .collect::<Vec<_>>();
3296        check_results(&result, &lib_res, uws);
3297        //print_result(&lib_res); panic!();
3298    }
3299
3300    #[test]
3301    fn emoji_and_rusabbr_no_split() {
3302        let uws = "🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n👱\nС.С.С.Р.\n👨‍👩‍👦‍👦\n🧠\n";
3303        let result = vec![
3304            PositionalToken {
3305                source: uws,
3306                offset: 0,
3307                length: 8,
3308                token: Token::Word(Word::Emoji("russia")),
3309            },
3310            PositionalToken {
3311                source: uws,
3312                offset: 8,
3313                length: 1,
3314                token: Token::Special(Special::Separator(Separator::Space)),
3315            },
3316            PositionalToken {
3317                source: uws,
3318                offset: 9,
3319                length: 8,
3320                token: Token::Word(Word::Emoji("sao_tome_and_principe")),
3321            },
3322            PositionalToken {
3323                source: uws,
3324                offset: 17,
3325                length: 1,
3326                token: Token::Special(Special::Separator(Separator::Newline)),
3327            },
3328            PositionalToken {
3329                source: uws,
3330                offset: 18,
3331                length: 8,
3332                token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
3333            },
3334            PositionalToken {
3335                source: uws,
3336                offset: 26,
3337                length: 8,
3338                token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
3339            },
3340            PositionalToken {
3341                source: uws,
3342                offset: 34,
3343                length: 8,
3344                token: Token::Word(Word::Emoji("man_medium_skin_tone")),
3345            },
3346            PositionalToken {
3347                source: uws,
3348                offset: 42,
3349                length: 1,
3350                token: Token::Special(Special::Separator(Separator::Newline)),
3351            },
3352            PositionalToken {
3353                source: uws,
3354                offset: 43,
3355                length: 4,
3356                token: Token::Word(Word::Emoji("blond_haired_person")),
3357            },
3358            PositionalToken {
3359                source: uws,
3360                offset: 47,
3361                length: 1,
3362                token: Token::Special(Special::Separator(Separator::Newline)),
3363            },
3364            PositionalToken {
3365                source: uws,
3366                offset: 48,
3367                length: 11,
3368                token: Token::Word(Word::Word("С.С.С.Р".to_string())),
3369            },
3370            PositionalToken {
3371                source: uws,
3372                offset: 59,
3373                length: 1,
3374                token: Token::Special(Special::Punctuation('.')),
3375            },
3376            PositionalToken {
3377                source: uws,
3378                offset: 60,
3379                length: 1,
3380                token: Token::Special(Special::Separator(Separator::Newline)),
3381            },
3382            PositionalToken {
3383                source: uws,
3384                offset: 61,
3385                length: 25,
3386                token: Token::Word(Word::Emoji("family_man_woman_boy_boy")),
3387            },
3388            PositionalToken {
3389                source: uws,
3390                offset: 86,
3391                length: 1,
3392                token: Token::Special(Special::Separator(Separator::Newline)),
3393            },
3394            PositionalToken {
3395                source: uws,
3396                offset: 87,
3397                length: 4,
3398                token: Token::Word(Word::Emoji("brain")),
3399            },
3400            PositionalToken {
3401                source: uws,
3402                offset: 91,
3403                length: 1,
3404                token: Token::Special(Special::Separator(Separator::Newline)),
3405            },
3406        ];
3407
3408        let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
3409        check_results(&result, &lib_res, uws);
3410        //print_result(&lib_res); panic!();
3411    }
3412
3413    /*#[test]
3414    fn hashtags_mentions_urls() {
3415        let uws = "\nSome ##text with #hashtags and @other components\nadfa wdsfdf asdf asd http://asdfasdfsd.com/fasdfd/sadfsadf/sdfas/12312_12414/asdf?fascvx=fsfwer&dsdfasdf=fasdf#fasdf asdfa sdfa sdf\nasdfas df asd who@bla-bla.com asdfas df asdfsd\n";
3416        let result = vec![
3417            PositionalToken { source: uws, offset: 0, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
3418            PositionalToken { source: uws, offset: 1, length: 4, token: Token::Word(Word::Word("Some".to_string())) },
3419            PositionalToken { source: uws, offset: 5, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3420            PositionalToken { source: uws, offset: 6, length: 2, token: Token::Special(Special::Punctuation("##".to_string())) },
3421            PositionalToken { source: uws, offset: 8, length: 4, token: Token::Word(Word::Word("text".to_string())) },
3422            PositionalToken { source: uws, offset: 12, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3423            PositionalToken { source: uws, offset: 13, length: 4, token: Token::Word(Word::Word("with".to_string())) },
3424            PositionalToken { source: uws, offset: 17, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3425            PositionalToken { source: uws, offset: 18, length: 9, token: Token::Struct(Struct::Hashtag("hashtags".to_string())) },
3426            PositionalToken { source: uws, offset: 27, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3427            PositionalToken { source: uws, offset: 28, length: 3, token: Token::Word(Word::Word("and".to_string())) },
3428            PositionalToken { source: uws, offset: 31, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3429            PositionalToken { source: uws, offset: 32, length: 6, token: Token::Struct(Struct::Mention("other".to_string())) },
3430            PositionalToken { source: uws, offset: 38, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3431            PositionalToken { source: uws, offset: 39, length: 10, token: Token::Word(Word::Word("components".to_string())) },
3432            PositionalToken { source: uws, offset: 49, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
3433            PositionalToken { source: uws, offset: 50, length: 4, token: Token::Word(Word::Word("adfa".to_string())) },
3434            PositionalToken { source: uws, offset: 54, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3435            PositionalToken { source: uws, offset: 55, length: 6, token: Token::Word(Word::Word("wdsfdf".to_string())) },
3436            PositionalToken { source: uws, offset: 61, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3437            PositionalToken { source: uws, offset: 62, length: 4, token: Token::Word(Word::Word("asdf".to_string())) },
3438            PositionalToken { source: uws, offset: 66, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3439            PositionalToken { source: uws, offset: 67, length: 3, token: Token::Word(Word::Word("asd".to_string())) },
3440            PositionalToken { source: uws, offset: 70, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3441            PositionalToken { source: uws, offset: 71, length: 95, token: Token::Struct(Struct::Url("http://asdfasdfsd.com/fasdfd/sadfsadf/sdfas/12312_12414/asdf?fascvx=fsfwer&dsdfasdf=fasdf#fasdf".to_string())) },
3442            PositionalToken { source: uws, offset: 166, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3443            PositionalToken { source: uws, offset: 167, length: 5, token: Token::Word(Word::Word("asdfa".to_string())) },
3444            PositionalToken { source: uws, offset: 172, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3445            PositionalToken { source: uws, offset: 173, length: 4, token: Token::Word(Word::Word("sdfa".to_string())) },
3446            PositionalToken { source: uws, offset: 177, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3447            PositionalToken { source: uws, offset: 178, length: 3, token: Token::Word(Word::Word("sdf".to_string())) },
3448            PositionalToken { source: uws, offset: 181, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
3449            PositionalToken { source: uws, offset: 182, length: 6, token: Token::Word(Word::Word("asdfas".to_string())) },
3450            PositionalToken { source: uws, offset: 188, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3451            PositionalToken { source: uws, offset: 189, length: 2, token: Token::Word(Word::Word("df".to_string())) },
3452            PositionalToken { source: uws, offset: 191, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3453            PositionalToken { source: uws, offset: 192, length: 3, token: Token::Word(Word::Word("asd".to_string())) },
3454            PositionalToken { source: uws, offset: 195, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3455            PositionalToken { source: uws, offset: 196, length: 3, token: Token::Word(Word::Word("who".to_string())) },
3456            PositionalToken { source: uws, offset: 199, length: 4, token: Token::Struct(Struct::Mention("bla".to_string())) },
3457            PositionalToken { source: uws, offset: 203, length: 1, token: Token::Special(Special::Punctuation('-')) },
3458            PositionalToken { source: uws, offset: 204, length: 7, token: Token::Word(Word::Word("bla.com".to_string())) },
3459            PositionalToken { source: uws, offset: 211, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3460            PositionalToken { source: uws, offset: 212, length: 6, token: Token::Word(Word::Word("asdfas".to_string())) },
3461            PositionalToken { source: uws, offset: 218, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3462            PositionalToken { source: uws, offset: 219, length: 2, token: Token::Word(Word::Word("df".to_string())) },
3463            PositionalToken { source: uws, offset: 221, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3464            PositionalToken { source: uws, offset: 222, length: 6, token: Token::Word(Word::Word("asdfsd".to_string())) },
3465            PositionalToken { source: uws, offset: 228, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
3466            ];
3467        let lib_res = uws.into_tokenizer(TokenizerParams::complex()).collect::<Vec<_>>();
3468        check_results(&result,&lib_res,uws);
3469        //print_result(&lib_res); panic!("")
3470    }*/
3471
    // NOTE(review): test disabled — its fixture predates the `source` field on
    // PositionalToken (the active `html` test below builds tokens with
    // `source: uws`, this one does not). Update the fixture (or mark the test
    // `#[ignore]` instead of commenting it out) before re-enabling.
    /*#[test]
3473    fn bb_code() {
3474        let uws = "[Oxana Putan|1712640565] shared a [post|100001150683379_1873048549410150]. \nAndrew\n[link|https://www.facebook.com/100001150683379/posts/1873048549410150]\nДрузья мои, издатели, редакторы, просветители, культуртрегеры, субъекты мирового рынка и ту хум ит ещё мей консёрн.\nНа текущий момент я лишен былой подвижности, хоть и ковыляю по больничных коридорам по разным нуждам и за кипятком.\nВрачи обещают мне заживление отверстых ран моих в течение полугода и на этот период можно предполагать с уверенностью преимущественно домашний образ жизни.\n[|]";
3475        let result = vec![
3476            PositionalToken { offset: 0, length: 24, token: Token::BBCode { left: vec![
3477                PositionalToken { offset: 1, length: 5, token: Token::Word(Word::Word("Oxana".to_string())) },
3478                PositionalToken { offset: 6, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3479                PositionalToken { offset: 7, length: 5, token: Token::Word(Word::Word("Putan".to_string())) },
3480                ], right: vec![
3481                PositionalToken { offset: 13, length: 10, token: Token::Word(Word::Number(Number::Integer(1712640565))) },
3482                ] } },
3483            PositionalToken { offset: 24, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3484            PositionalToken { offset: 25, length: 6, token: Token::Word(Word::Word("shared".to_string())) },
3485            PositionalToken { offset: 31, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3486            PositionalToken { offset: 32, length: 1, token: Token::Word(Word::Word("a".to_string())) },
3487            PositionalToken { offset: 33, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3488            PositionalToken { offset: 34, length: 39, token: Token::BBCode { left: vec![
3489                PositionalToken { offset: 35, length: 4, token: Token::Word(Word::Word("post".to_string())) },
3490                ], right: vec![
3491                PositionalToken { offset: 40, length: 32, token: Token::Word(Word::Numerical(Numerical::Alphanumeric("100001150683379_1873048549410150".to_string()))) },
3492                ] } },
3493            PositionalToken { offset: 73, length: 1, token: Token::Special(Special::Punctuation('.')) },
3494            PositionalToken { offset: 74, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3495            PositionalToken { offset: 75, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
3496            PositionalToken { offset: 76, length: 6, token: Token::Word(Word::Word("Andrew".to_string())) },
3497            PositionalToken { offset: 82, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
3498            PositionalToken { offset: 83, length: 70, token: Token::BBCode { left: vec![
3499                PositionalToken { offset: 84, length: 4, token: Token::Word(Word::Word("link".to_string())) },
3500                ], right: vec![
3501                PositionalToken { offset: 89, length: 63, token: Token::Struct(Struct::Url("https://www.facebook.com/100001150683379/posts/1873048549410150".to_string())) },
3502                ] } },
3503            PositionalToken { offset: 153, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
3504            PositionalToken { offset: 154, length: 12, token: Token::Word(Word::Word("Друзья".to_string())) },
3505            PositionalToken { offset: 166, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3506            PositionalToken { offset: 167, length: 6, token: Token::Word(Word::Word("мои".to_string())) },
3507            PositionalToken { offset: 173, length: 1, token: Token::Special(Special::Punctuation(',')) },
3508            PositionalToken { offset: 174, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3509            PositionalToken { offset: 175, length: 16, token: Token::Word(Word::Word("издатели".to_string())) },
3510            PositionalToken { offset: 191, length: 1, token: Token::Special(Special::Punctuation(',')) },
3511            PositionalToken { offset: 192, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3512            PositionalToken { offset: 193, length: 18, token: Token::Word(Word::Word("редакторы".to_string())) },
3513            PositionalToken { offset: 211, length: 1, token: Token::Special(Special::Punctuation(',')) },
3514            PositionalToken { offset: 212, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3515            PositionalToken { offset: 213, length: 24, token: Token::Word(Word::Word("просветители".to_string())) },
3516            PositionalToken { offset: 237, length: 1, token: Token::Special(Special::Punctuation(',')) },
3517            PositionalToken { offset: 238, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3518            PositionalToken { offset: 239, length: 28, token: Token::Word(Word::Word("культуртрегеры".to_string())) },
3519            PositionalToken { offset: 267, length: 1, token: Token::Special(Special::Punctuation(',')) },
3520            PositionalToken { offset: 268, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3521            PositionalToken { offset: 269, length: 16, token: Token::Word(Word::Word("субъекты".to_string())) },
3522            PositionalToken { offset: 285, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3523            PositionalToken { offset: 286, length: 16, token: Token::Word(Word::Word("мирового".to_string())) },
3524            PositionalToken { offset: 302, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3525            PositionalToken { offset: 303, length: 10, token: Token::Word(Word::Word("рынка".to_string())) },
3526            PositionalToken { offset: 313, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3527            PositionalToken { offset: 314, length: 2, token: Token::Word(Word::Word("и".to_string())) },
3528            PositionalToken { offset: 316, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3529            PositionalToken { offset: 317, length: 4, token: Token::Word(Word::Word("ту".to_string())) },
3530            PositionalToken { offset: 321, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3531            PositionalToken { offset: 322, length: 6, token: Token::Word(Word::Word("хум".to_string())) },
3532            PositionalToken { offset: 328, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3533            PositionalToken { offset: 329, length: 4, token: Token::Word(Word::Word("ит".to_string())) },
3534            PositionalToken { offset: 333, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3535            PositionalToken { offset: 334, length: 6, token: Token::Word(Word::Word("ещё".to_string())) },
3536            PositionalToken { offset: 340, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3537            PositionalToken { offset: 341, length: 6, token: Token::Word(Word::Word("мей".to_string())) },
3538            PositionalToken { offset: 347, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3539            PositionalToken { offset: 348, length: 14, token: Token::Word(Word::Word("консёрн".to_string())) },
3540            PositionalToken { offset: 362, length: 1, token: Token::Special(Special::Punctuation('.')) },
3541            PositionalToken { offset: 363, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
3542            PositionalToken { offset: 364, length: 4, token: Token::Word(Word::Word("На".to_string())) },
3543            PositionalToken { offset: 368, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3544            PositionalToken { offset: 369, length: 14, token: Token::Word(Word::Word("текущий".to_string())) },
3545            PositionalToken { offset: 383, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3546            PositionalToken { offset: 384, length: 12, token: Token::Word(Word::Word("момент".to_string())) },
3547            PositionalToken { offset: 396, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3548            PositionalToken { offset: 397, length: 2, token: Token::Word(Word::Word("я".to_string())) },
3549            PositionalToken { offset: 399, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3550            PositionalToken { offset: 400, length: 10, token: Token::Word(Word::Word("лишен".to_string())) },
3551            PositionalToken { offset: 410, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3552            PositionalToken { offset: 411, length: 10, token: Token::Word(Word::Word("былой".to_string())) },
3553            PositionalToken { offset: 421, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3554            PositionalToken { offset: 422, length: 22, token: Token::Word(Word::Word("подвижности".to_string())) },
3555            PositionalToken { offset: 444, length: 1, token: Token::Special(Special::Punctuation(',')) },
3556            PositionalToken { offset: 445, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3557            PositionalToken { offset: 446, length: 8, token: Token::Word(Word::Word("хоть".to_string())) },
3558            PositionalToken { offset: 454, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3559            PositionalToken { offset: 455, length: 2, token: Token::Word(Word::Word("и".to_string())) },
3560            PositionalToken { offset: 457, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3561            PositionalToken { offset: 458, length: 14, token: Token::Word(Word::Word("ковыляю".to_string())) },
3562            PositionalToken { offset: 472, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3563            PositionalToken { offset: 473, length: 4, token: Token::Word(Word::Word("по".to_string())) },
3564            PositionalToken { offset: 477, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3565            PositionalToken { offset: 478, length: 20, token: Token::Word(Word::Word("больничных".to_string())) },
3566            PositionalToken { offset: 498, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3567            PositionalToken { offset: 499, length: 18, token: Token::Word(Word::Word("коридорам".to_string())) },
3568            PositionalToken { offset: 517, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3569            PositionalToken { offset: 518, length: 4, token: Token::Word(Word::Word("по".to_string())) },
3570            PositionalToken { offset: 522, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3571            PositionalToken { offset: 523, length: 12, token: Token::Word(Word::Word("разным".to_string())) },
3572            PositionalToken { offset: 535, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3573            PositionalToken { offset: 536, length: 12, token: Token::Word(Word::Word("нуждам".to_string())) },
3574            PositionalToken { offset: 548, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3575            PositionalToken { offset: 549, length: 2, token: Token::Word(Word::Word("и".to_string())) },
3576            PositionalToken { offset: 551, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3577            PositionalToken { offset: 552, length: 4, token: Token::Word(Word::Word("за".to_string())) },
3578            PositionalToken { offset: 556, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3579            PositionalToken { offset: 557, length: 16, token: Token::Word(Word::Word("кипятком".to_string())) },
3580            PositionalToken { offset: 573, length: 1, token: Token::Special(Special::Punctuation('.')) },
3581            PositionalToken { offset: 574, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
3582            PositionalToken { offset: 575, length: 10, token: Token::Word(Word::Word("Врачи".to_string())) },
3583            PositionalToken { offset: 585, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3584            PositionalToken { offset: 586, length: 14, token: Token::Word(Word::Word("обещают".to_string())) },
3585            PositionalToken { offset: 600, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3586            PositionalToken { offset: 601, length: 6, token: Token::Word(Word::Word("мне".to_string())) },
3587            PositionalToken { offset: 607, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3588            PositionalToken { offset: 608, length: 20, token: Token::Word(Word::Word("заживление".to_string())) },
3589            PositionalToken { offset: 628, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3590            PositionalToken { offset: 629, length: 18, token: Token::Word(Word::Word("отверстых".to_string())) },
3591            PositionalToken { offset: 647, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3592            PositionalToken { offset: 648, length: 6, token: Token::Word(Word::Word("ран".to_string())) },
3593            PositionalToken { offset: 654, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3594            PositionalToken { offset: 655, length: 8, token: Token::Word(Word::Word("моих".to_string())) },
3595            PositionalToken { offset: 663, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3596            PositionalToken { offset: 664, length: 2, token: Token::Word(Word::Word("в".to_string())) },
3597            PositionalToken { offset: 666, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3598            PositionalToken { offset: 667, length: 14, token: Token::Word(Word::Word("течение".to_string())) },
3599            PositionalToken { offset: 681, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3600            PositionalToken { offset: 682, length: 16, token: Token::Word(Word::Word("полугода".to_string())) },
3601            PositionalToken { offset: 698, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3602            PositionalToken { offset: 699, length: 2, token: Token::Word(Word::Word("и".to_string())) },
3603            PositionalToken { offset: 701, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3604            PositionalToken { offset: 702, length: 4, token: Token::Word(Word::Word("на".to_string())) },
3605            PositionalToken { offset: 706, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3606            PositionalToken { offset: 707, length: 8, token: Token::Word(Word::Word("этот".to_string())) },
3607            PositionalToken { offset: 715, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3608            PositionalToken { offset: 716, length: 12, token: Token::Word(Word::Word("период".to_string())) },
3609            PositionalToken { offset: 728, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3610            PositionalToken { offset: 729, length: 10, token: Token::Word(Word::Word("можно".to_string())) },
3611            PositionalToken { offset: 739, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3612            PositionalToken { offset: 740, length: 24, token: Token::Word(Word::Word("предполагать".to_string())) },
3613            PositionalToken { offset: 764, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3614            PositionalToken { offset: 765, length: 2, token: Token::Word(Word::Word("с".to_string())) },
3615            PositionalToken { offset: 767, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3616            PositionalToken { offset: 768, length: 24, token: Token::Word(Word::Word("уверенностью".to_string())) },
3617            PositionalToken { offset: 792, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3618            PositionalToken { offset: 793, length: 30, token: Token::Word(Word::Word("преимущественно".to_string())) },
3619            PositionalToken { offset: 823, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3620            PositionalToken { offset: 824, length: 16, token: Token::Word(Word::Word("домашний".to_string())) },
3621            PositionalToken { offset: 840, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3622            PositionalToken { offset: 841, length: 10, token: Token::Word(Word::Word("образ".to_string())) },
3623            PositionalToken { offset: 851, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
3624            PositionalToken { offset: 852, length: 10, token: Token::Word(Word::Word("жизни".to_string())) },
3625            PositionalToken { offset: 862, length: 1, token: Token::Special(Special::Punctuation('.')) },
3626            PositionalToken { offset: 863, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
3627            PositionalToken { offset: 864, length: 3, token: Token::BBCode { left: vec![
3628                ], right: vec![
3629                ] } },
3630            ];
3631        let lib_res = uws.into_tokenizer(TokenizerParams::complex()).collect::<Vec<_>>();
3632        //print_result(&lib_res); panic!("");
3633        check_results(&result,&lib_res,uws);
3634    }*/
3635
3636    #[test]
3637    fn html() {
3638        let uws = "<div class=\"article article_view \" id=\"article_view_-113039156_9551\" data-article-url=\"/@chaibuket-o-chem-ne-zabyt-25-noyabrya\" data-audio-context=\"article:-113039156_9551\"><h1  class=\"article_decoration_first article_decoration_last\" >День Мамы </h1><p  class=\"article_decoration_first article_decoration_last\" >День, когда поздравляют мам, бабушек, сестер и жён — это всемирный праздник, называемый «День Мамы». В настоящее время его отмечают почти в каждой стране, просто везде разные даты и способы празднования. </p><h3  class=\"article_decoration_first article_decoration_last\" ><span class='article_anchor_title'>\n  <span class='article_anchor_button' id='pochemu-my-ego-prazdnuem'></span>\n  <span class='article_anchor_fsymbol'>П</span>\n</span>ПОЧЕМУ МЫ ЕГО ПРАЗДНУЕМ</h3><p  class=\"article_decoration_first article_decoration_last article_decoration_before\" >В 1987 году комитет госдумы по делам женщин, семьи и молодежи выступил с предложением учредить «День мамы», а сам приказ был подписан уже 30 января 1988 года Борисом Ельциным. Было решено, что ежегодно в России празднество дня мамы будет выпадать на последнее воскресенье ноября. 
</p><figure data-type=\"101\" data-mode=\"\"  class=\"article_decoration_first article_decoration_last\" >\n  <div class=\"article_figure_content\" style=\"width: 1125px\">\n    <div class=\"article_figure_sizer_content\"><div class=\"article_object_sizer_wrap\" data-sizes=\"[{&quot;s&quot;:[&quot;https://pp.userapi.com/c849128/v849128704/c0ffd/pcNJaBH3NDo.jpg&quot;,75,50],&quot;m&quot;:[&quot;https://pp.userapi.com/c849128/v849128704/c0ffe/ozCLs2kHtRY.jpg&quot;,130,87],&quot;x&quot;:[&quot;https://pp.userapi.com/c849128/v849128704/c0fff/E4KtTNDydzE.jpg&quot;,604,403],&quot;y&quot;:[&quot;https://pp.userapi.com/c849128/v849128704/c1000/1nLxpYKavzU.jpg&quot;,807,538],&quot;z&quot;:[&quot;https://pp.userapi.com/c849128/v849128704/c1001/IgEODe90yEk.jpg&quot;,1125,750],&quot;o&quot;:[&quot;https://pp.userapi.com/c849128/v849128704/c1002/01faNwVZ2_E.jpg&quot;,130,87],&quot;p&quot;:[&quot;https://pp.userapi.com/c849128/v849128704/c1003/baDFzbdRP2s.jpg&quot;,200,133],&quot;q&quot;:[&quot;https://pp.userapi.com/c849128/v849128704/c1004/CY4khI6KJKA.jpg&quot;,320,213],&quot;r&quot;:[&quot;https://pp.userapi.com/c849128/v849128704/c1005/NOvAJ6-VltY.jpg&quot;,510,340]}]\">\n  <img class=\"article_object_sizer_inner article_object_photo__image_blur\" src=\"https://pp.userapi.com/c849128/v849128704/c0ffd/pcNJaBH3NDo.jpg\" data-baseurl=\"\"/>\n  \n</div></div>\n    <div class=\"article_figure_sizer\" style=\"padding-bottom: 66.666666666667%\"></div>";
3639        let result = vec![
3640            PositionalToken {
3641                source: uws,
3642                offset: 236,
3643                length: 8,
3644                token: Token::Word(Word::Word("День".to_string())),
3645            },
3646            PositionalToken {
3647                source: uws,
3648                offset: 244,
3649                length: 1,
3650                token: Token::Special(Special::Separator(Separator::Space)),
3651            },
3652            PositionalToken {
3653                source: uws,
3654                offset: 245,
3655                length: 8,
3656                token: Token::Word(Word::Word("Мамы".to_string())),
3657            },
3658            PositionalToken {
3659                source: uws,
3660                offset: 253,
3661                length: 1,
3662                token: Token::Special(Special::Separator(Separator::Space)),
3663            },
3664            PositionalToken {
3665                source: uws,
3666                offset: 321,
3667                length: 8,
3668                token: Token::Word(Word::Word("День".to_string())),
3669            },
3670            PositionalToken {
3671                source: uws,
3672                offset: 329,
3673                length: 1,
3674                token: Token::Special(Special::Punctuation(',')),
3675            },
3676            PositionalToken {
3677                source: uws,
3678                offset: 330,
3679                length: 1,
3680                token: Token::Special(Special::Separator(Separator::Space)),
3681            },
3682            PositionalToken {
3683                source: uws,
3684                offset: 331,
3685                length: 10,
3686                token: Token::Word(Word::Word("когда".to_string())),
3687            },
3688            PositionalToken {
3689                source: uws,
3690                offset: 341,
3691                length: 1,
3692                token: Token::Special(Special::Separator(Separator::Space)),
3693            },
3694            PositionalToken {
3695                source: uws,
3696                offset: 342,
3697                length: 22,
3698                token: Token::Word(Word::Word("поздравляют".to_string())),
3699            },
3700            PositionalToken {
3701                source: uws,
3702                offset: 364,
3703                length: 1,
3704                token: Token::Special(Special::Separator(Separator::Space)),
3705            },
3706            PositionalToken {
3707                source: uws,
3708                offset: 365,
3709                length: 6,
3710                token: Token::Word(Word::Word("мам".to_string())),
3711            },
3712            PositionalToken {
3713                source: uws,
3714                offset: 371,
3715                length: 1,
3716                token: Token::Special(Special::Punctuation(',')),
3717            },
3718            PositionalToken {
3719                source: uws,
3720                offset: 372,
3721                length: 1,
3722                token: Token::Special(Special::Separator(Separator::Space)),
3723            },
3724            PositionalToken {
3725                source: uws,
3726                offset: 373,
3727                length: 14,
3728                token: Token::Word(Word::Word("бабушек".to_string())),
3729            },
3730            PositionalToken {
3731                source: uws,
3732                offset: 387,
3733                length: 1,
3734                token: Token::Special(Special::Punctuation(',')),
3735            },
3736            PositionalToken {
3737                source: uws,
3738                offset: 388,
3739                length: 1,
3740                token: Token::Special(Special::Separator(Separator::Space)),
3741            },
3742            PositionalToken {
3743                source: uws,
3744                offset: 389,
3745                length: 12,
3746                token: Token::Word(Word::Word("сестер".to_string())),
3747            },
3748            PositionalToken {
3749                source: uws,
3750                offset: 401,
3751                length: 1,
3752                token: Token::Special(Special::Separator(Separator::Space)),
3753            },
3754            PositionalToken {
3755                source: uws,
3756                offset: 402,
3757                length: 2,
3758                token: Token::Word(Word::Word("и".to_string())),
3759            },
3760            PositionalToken {
3761                source: uws,
3762                offset: 404,
3763                length: 1,
3764                token: Token::Special(Special::Separator(Separator::Space)),
3765            },
3766            PositionalToken {
3767                source: uws,
3768                offset: 405,
3769                length: 6,
3770                token: Token::Word(Word::Word("жён".to_string())),
3771            },
3772            PositionalToken {
3773                source: uws,
3774                offset: 411,
3775                length: 1,
3776                token: Token::Special(Special::Separator(Separator::Space)),
3777            },
3778            PositionalToken {
3779                source: uws,
3780                offset: 412,
3781                length: 3,
3782                token: Token::Special(Special::Punctuation('—')),
3783            },
3784            PositionalToken {
3785                source: uws,
3786                offset: 415,
3787                length: 1,
3788                token: Token::Special(Special::Separator(Separator::Space)),
3789            },
3790            PositionalToken {
3791                source: uws,
3792                offset: 416,
3793                length: 6,
3794                token: Token::Word(Word::Word("это".to_string())),
3795            },
3796            PositionalToken {
3797                source: uws,
3798                offset: 422,
3799                length: 1,
3800                token: Token::Special(Special::Separator(Separator::Space)),
3801            },
3802            PositionalToken {
3803                source: uws,
3804                offset: 423,
3805                length: 18,
3806                token: Token::Word(Word::Word("всемирный".to_string())),
3807            },
3808            PositionalToken {
3809                source: uws,
3810                offset: 441,
3811                length: 1,
3812                token: Token::Special(Special::Separator(Separator::Space)),
3813            },
3814            PositionalToken {
3815                source: uws,
3816                offset: 442,
3817                length: 16,
3818                token: Token::Word(Word::Word("праздник".to_string())),
3819            },
3820            PositionalToken {
3821                source: uws,
3822                offset: 458,
3823                length: 1,
3824                token: Token::Special(Special::Punctuation(',')),
3825            },
3826            PositionalToken {
3827                source: uws,
3828                offset: 459,
3829                length: 1,
3830                token: Token::Special(Special::Separator(Separator::Space)),
3831            },
3832            PositionalToken {
3833                source: uws,
3834                offset: 460,
3835                length: 20,
3836                token: Token::Word(Word::Word("называемый".to_string())),
3837            },
3838            PositionalToken {
3839                source: uws,
3840                offset: 480,
3841                length: 1,
3842                token: Token::Special(Special::Separator(Separator::Space)),
3843            },
3844            PositionalToken {
3845                source: uws,
3846                offset: 481,
3847                length: 2,
3848                token: Token::Special(Special::Punctuation('«')),
3849            },
3850            PositionalToken {
3851                source: uws,
3852                offset: 483,
3853                length: 8,
3854                token: Token::Word(Word::Word("День".to_string())),
3855            },
3856            PositionalToken {
3857                source: uws,
3858                offset: 491,
3859                length: 1,
3860                token: Token::Special(Special::Separator(Separator::Space)),
3861            },
3862            PositionalToken {
3863                source: uws,
3864                offset: 492,
3865                length: 8,
3866                token: Token::Word(Word::Word("Мамы".to_string())),
3867            },
3868            PositionalToken {
3869                source: uws,
3870                offset: 500,
3871                length: 2,
3872                token: Token::Special(Special::Punctuation('»')),
3873            },
3874            PositionalToken {
3875                source: uws,
3876                offset: 502,
3877                length: 1,
3878                token: Token::Special(Special::Punctuation('.')),
3879            },
3880            PositionalToken {
3881                source: uws,
3882                offset: 503,
3883                length: 1,
3884                token: Token::Special(Special::Separator(Separator::Space)),
3885            },
3886            PositionalToken {
3887                source: uws,
3888                offset: 504,
3889                length: 2,
3890                token: Token::Word(Word::Word("В".to_string())),
3891            },
3892            PositionalToken {
3893                source: uws,
3894                offset: 506,
3895                length: 1,
3896                token: Token::Special(Special::Separator(Separator::Space)),
3897            },
3898            PositionalToken {
3899                source: uws,
3900                offset: 507,
3901                length: 18,
3902                token: Token::Word(Word::Word("настоящее".to_string())),
3903            },
3904            PositionalToken {
3905                source: uws,
3906                offset: 525,
3907                length: 1,
3908                token: Token::Special(Special::Separator(Separator::Space)),
3909            },
3910            PositionalToken {
3911                source: uws,
3912                offset: 526,
3913                length: 10,
3914                token: Token::Word(Word::Word("время".to_string())),
3915            },
3916            PositionalToken {
3917                source: uws,
3918                offset: 536,
3919                length: 1,
3920                token: Token::Special(Special::Separator(Separator::Space)),
3921            },
3922            PositionalToken {
3923                source: uws,
3924                offset: 537,
3925                length: 6,
3926                token: Token::Word(Word::Word("его".to_string())),
3927            },
3928            PositionalToken {
3929                source: uws,
3930                offset: 543,
3931                length: 1,
3932                token: Token::Special(Special::Separator(Separator::Space)),
3933            },
3934            PositionalToken {
3935                source: uws,
3936                offset: 544,
3937                length: 16,
3938                token: Token::Word(Word::Word("отмечают".to_string())),
3939            },
3940            PositionalToken {
3941                source: uws,
3942                offset: 560,
3943                length: 1,
3944                token: Token::Special(Special::Separator(Separator::Space)),
3945            },
3946            PositionalToken {
3947                source: uws,
3948                offset: 561,
3949                length: 10,
3950                token: Token::Word(Word::Word("почти".to_string())),
3951            },
3952            PositionalToken {
3953                source: uws,
3954                offset: 571,
3955                length: 1,
3956                token: Token::Special(Special::Separator(Separator::Space)),
3957            },
3958            PositionalToken {
3959                source: uws,
3960                offset: 572,
3961                length: 2,
3962                token: Token::Word(Word::Word("в".to_string())),
3963            },
3964            PositionalToken {
3965                source: uws,
3966                offset: 574,
3967                length: 1,
3968                token: Token::Special(Special::Separator(Separator::Space)),
3969            },
3970            PositionalToken {
3971                source: uws,
3972                offset: 575,
3973                length: 12,
3974                token: Token::Word(Word::Word("каждой".to_string())),
3975            },
3976            PositionalToken {
3977                source: uws,
3978                offset: 587,
3979                length: 1,
3980                token: Token::Special(Special::Separator(Separator::Space)),
3981            },
3982            PositionalToken {
3983                source: uws,
3984                offset: 588,
3985                length: 12,
3986                token: Token::Word(Word::Word("стране".to_string())),
3987            },
3988            PositionalToken {
3989                source: uws,
3990                offset: 600,
3991                length: 1,
3992                token: Token::Special(Special::Punctuation(',')),
3993            },
3994            PositionalToken {
3995                source: uws,
3996                offset: 601,
3997                length: 1,
3998                token: Token::Special(Special::Separator(Separator::Space)),
3999            },
4000            PositionalToken {
4001                source: uws,
4002                offset: 602,
4003                length: 12,
4004                token: Token::Word(Word::Word("просто".to_string())),
4005            },
4006            PositionalToken {
4007                source: uws,
4008                offset: 614,
4009                length: 1,
4010                token: Token::Special(Special::Separator(Separator::Space)),
4011            },
4012            PositionalToken {
4013                source: uws,
4014                offset: 615,
4015                length: 10,
4016                token: Token::Word(Word::Word("везде".to_string())),
4017            },
4018            PositionalToken {
4019                source: uws,
4020                offset: 625,
4021                length: 1,
4022                token: Token::Special(Special::Separator(Separator::Space)),
4023            },
4024            PositionalToken {
4025                source: uws,
4026                offset: 626,
4027                length: 12,
4028                token: Token::Word(Word::Word("разные".to_string())),
4029            },
4030            PositionalToken {
4031                source: uws,
4032                offset: 638,
4033                length: 1,
4034                token: Token::Special(Special::Separator(Separator::Space)),
4035            },
4036            PositionalToken {
4037                source: uws,
4038                offset: 639,
4039                length: 8,
4040                token: Token::Word(Word::Word("даты".to_string())),
4041            },
4042            PositionalToken {
4043                source: uws,
4044                offset: 647,
4045                length: 1,
4046                token: Token::Special(Special::Separator(Separator::Space)),
4047            },
4048            PositionalToken {
4049                source: uws,
4050                offset: 648,
4051                length: 2,
4052                token: Token::Word(Word::Word("и".to_string())),
4053            },
4054            PositionalToken {
4055                source: uws,
4056                offset: 650,
4057                length: 1,
4058                token: Token::Special(Special::Separator(Separator::Space)),
4059            },
4060            PositionalToken {
4061                source: uws,
4062                offset: 651,
4063                length: 14,
4064                token: Token::Word(Word::Word("способы".to_string())),
4065            },
4066            PositionalToken {
4067                source: uws,
4068                offset: 665,
4069                length: 1,
4070                token: Token::Special(Special::Separator(Separator::Space)),
4071            },
4072            PositionalToken {
4073                source: uws,
4074                offset: 666,
4075                length: 24,
4076                token: Token::Word(Word::Word("празднования".to_string())),
4077            },
4078            PositionalToken {
4079                source: uws,
4080                offset: 690,
4081                length: 1,
4082                token: Token::Special(Special::Punctuation('.')),
4083            },
4084            PositionalToken {
4085                source: uws,
4086                offset: 691,
4087                length: 1,
4088                token: Token::Special(Special::Separator(Separator::Space)),
4089            },
4090            PositionalToken {
4091                source: uws,
4092                offset: 794,
4093                length: 1,
4094                token: Token::Special(Special::Separator(Separator::Newline)),
4095            },
4096            PositionalToken {
4097                source: uws,
4098                offset: 795,
4099                length: 2,
4100                token: Token::Special(Special::Separator(Separator::Space)),
4101            },
4102            PositionalToken {
4103                source: uws,
4104                offset: 870,
4105                length: 1,
4106                token: Token::Special(Special::Separator(Separator::Newline)),
4107            },
4108            PositionalToken {
4109                source: uws,
4110                offset: 871,
4111                length: 2,
4112                token: Token::Special(Special::Separator(Separator::Space)),
4113            },
4114            PositionalToken {
4115                source: uws,
4116                offset: 910,
4117                length: 2,
4118                token: Token::Word(Word::Word("П".to_string())),
4119            },
4120            PositionalToken {
4121                source: uws,
4122                offset: 919,
4123                length: 1,
4124                token: Token::Special(Special::Separator(Separator::Newline)),
4125            },
4126            PositionalToken {
4127                source: uws,
4128                offset: 927,
4129                length: 12,
4130                token: Token::Word(Word::Word("ПОЧЕМУ".to_string())),
4131            },
4132            PositionalToken {
4133                source: uws,
4134                offset: 939,
4135                length: 1,
4136                token: Token::Special(Special::Separator(Separator::Space)),
4137            },
4138            PositionalToken {
4139                source: uws,
4140                offset: 940,
4141                length: 4,
4142                token: Token::Word(Word::Word("МЫ".to_string())),
4143            },
4144            PositionalToken {
4145                source: uws,
4146                offset: 944,
4147                length: 1,
4148                token: Token::Special(Special::Separator(Separator::Space)),
4149            },
4150            PositionalToken {
4151                source: uws,
4152                offset: 945,
4153                length: 6,
4154                token: Token::Word(Word::Word("ЕГО".to_string())),
4155            },
4156            PositionalToken {
4157                source: uws,
4158                offset: 951,
4159                length: 1,
4160                token: Token::Special(Special::Separator(Separator::Space)),
4161            },
4162            PositionalToken {
4163                source: uws,
4164                offset: 952,
4165                length: 18,
4166                token: Token::Word(Word::Word("ПРАЗДНУЕМ".to_string())),
4167            },
4168            PositionalToken {
4169                source: uws,
4170                offset: 1063,
4171                length: 2,
4172                token: Token::Word(Word::Word("В".to_string())),
4173            },
4174            PositionalToken {
4175                source: uws,
4176                offset: 1065,
4177                length: 1,
4178                token: Token::Special(Special::Separator(Separator::Space)),
4179            },
4180            PositionalToken {
4181                source: uws,
4182                offset: 1066,
4183                length: 4,
4184                token: Token::Word(Word::Number(Number::Integer(1987))),
4185            },
4186            PositionalToken {
4187                source: uws,
4188                offset: 1070,
4189                length: 1,
4190                token: Token::Special(Special::Separator(Separator::Space)),
4191            },
4192            PositionalToken {
4193                source: uws,
4194                offset: 1071,
4195                length: 8,
4196                token: Token::Word(Word::Word("году".to_string())),
4197            },
4198            PositionalToken {
4199                source: uws,
4200                offset: 1079,
4201                length: 1,
4202                token: Token::Special(Special::Separator(Separator::Space)),
4203            },
4204            PositionalToken {
4205                source: uws,
4206                offset: 1080,
4207                length: 14,
4208                token: Token::Word(Word::Word("комитет".to_string())),
4209            },
4210            PositionalToken {
4211                source: uws,
4212                offset: 1094,
4213                length: 1,
4214                token: Token::Special(Special::Separator(Separator::Space)),
4215            },
4216            PositionalToken {
4217                source: uws,
4218                offset: 1095,
4219                length: 14,
4220                token: Token::Word(Word::Word("госдумы".to_string())),
4221            },
4222            PositionalToken {
4223                source: uws,
4224                offset: 1109,
4225                length: 1,
4226                token: Token::Special(Special::Separator(Separator::Space)),
4227            },
4228            PositionalToken {
4229                source: uws,
4230                offset: 1110,
4231                length: 4,
4232                token: Token::Word(Word::Word("по".to_string())),
4233            },
4234            PositionalToken {
4235                source: uws,
4236                offset: 1114,
4237                length: 1,
4238                token: Token::Special(Special::Separator(Separator::Space)),
4239            },
4240            PositionalToken {
4241                source: uws,
4242                offset: 1115,
4243                length: 10,
4244                token: Token::Word(Word::Word("делам".to_string())),
4245            },
4246            PositionalToken {
4247                source: uws,
4248                offset: 1125,
4249                length: 1,
4250                token: Token::Special(Special::Separator(Separator::Space)),
4251            },
4252            PositionalToken {
4253                source: uws,
4254                offset: 1126,
4255                length: 12,
4256                token: Token::Word(Word::Word("женщин".to_string())),
4257            },
4258            PositionalToken {
4259                source: uws,
4260                offset: 1138,
4261                length: 1,
4262                token: Token::Special(Special::Punctuation(',')),
4263            },
4264            PositionalToken {
4265                source: uws,
4266                offset: 1139,
4267                length: 1,
4268                token: Token::Special(Special::Separator(Separator::Space)),
4269            },
4270            PositionalToken {
4271                source: uws,
4272                offset: 1140,
4273                length: 10,
4274                token: Token::Word(Word::Word("семьи".to_string())),
4275            },
4276            PositionalToken {
4277                source: uws,
4278                offset: 1150,
4279                length: 1,
4280                token: Token::Special(Special::Separator(Separator::Space)),
4281            },
4282            PositionalToken {
4283                source: uws,
4284                offset: 1151,
4285                length: 2,
4286                token: Token::Word(Word::Word("и".to_string())),
4287            },
4288            PositionalToken {
4289                source: uws,
4290                offset: 1153,
4291                length: 1,
4292                token: Token::Special(Special::Separator(Separator::Space)),
4293            },
4294            PositionalToken {
4295                source: uws,
4296                offset: 1154,
4297                length: 16,
4298                token: Token::Word(Word::Word("молодежи".to_string())),
4299            },
4300            PositionalToken {
4301                source: uws,
4302                offset: 1170,
4303                length: 1,
4304                token: Token::Special(Special::Separator(Separator::Space)),
4305            },
4306            PositionalToken {
4307                source: uws,
4308                offset: 1171,
4309                length: 16,
4310                token: Token::Word(Word::Word("выступил".to_string())),
4311            },
4312            PositionalToken {
4313                source: uws,
4314                offset: 1187,
4315                length: 1,
4316                token: Token::Special(Special::Separator(Separator::Space)),
4317            },
4318            PositionalToken {
4319                source: uws,
4320                offset: 1188,
4321                length: 2,
4322                token: Token::Word(Word::Word("с".to_string())),
4323            },
4324            PositionalToken {
4325                source: uws,
4326                offset: 1190,
4327                length: 1,
4328                token: Token::Special(Special::Separator(Separator::Space)),
4329            },
4330            PositionalToken {
4331                source: uws,
4332                offset: 1191,
4333                length: 24,
4334                token: Token::Word(Word::Word("предложением".to_string())),
4335            },
4336            PositionalToken {
4337                source: uws,
4338                offset: 1215,
4339                length: 1,
4340                token: Token::Special(Special::Separator(Separator::Space)),
4341            },
4342            PositionalToken {
4343                source: uws,
4344                offset: 1216,
4345                length: 16,
4346                token: Token::Word(Word::Word("учредить".to_string())),
4347            },
4348            PositionalToken {
4349                source: uws,
4350                offset: 1232,
4351                length: 1,
4352                token: Token::Special(Special::Separator(Separator::Space)),
4353            },
4354            PositionalToken {
4355                source: uws,
4356                offset: 1233,
4357                length: 2,
4358                token: Token::Special(Special::Punctuation('«')),
4359            },
4360            PositionalToken {
4361                source: uws,
4362                offset: 1235,
4363                length: 8,
4364                token: Token::Word(Word::Word("День".to_string())),
4365            },
4366            PositionalToken {
4367                source: uws,
4368                offset: 1243,
4369                length: 1,
4370                token: Token::Special(Special::Separator(Separator::Space)),
4371            },
4372            PositionalToken {
4373                source: uws,
4374                offset: 1244,
4375                length: 8,
4376                token: Token::Word(Word::Word("мамы".to_string())),
4377            },
4378            PositionalToken {
4379                source: uws,
4380                offset: 1252,
4381                length: 2,
4382                token: Token::Special(Special::Punctuation('»')),
4383            },
4384            PositionalToken {
4385                source: uws,
4386                offset: 1254,
4387                length: 1,
4388                token: Token::Special(Special::Punctuation(',')),
4389            },
4390            PositionalToken {
4391                source: uws,
4392                offset: 1255,
4393                length: 1,
4394                token: Token::Special(Special::Separator(Separator::Space)),
4395            },
4396            PositionalToken {
4397                source: uws,
4398                offset: 1256,
4399                length: 2,
4400                token: Token::Word(Word::Word("а".to_string())),
4401            },
4402            PositionalToken {
4403                source: uws,
4404                offset: 1258,
4405                length: 1,
4406                token: Token::Special(Special::Separator(Separator::Space)),
4407            },
4408            PositionalToken {
4409                source: uws,
4410                offset: 1259,
4411                length: 6,
4412                token: Token::Word(Word::Word("сам".to_string())),
4413            },
4414            PositionalToken {
4415                source: uws,
4416                offset: 1265,
4417                length: 1,
4418                token: Token::Special(Special::Separator(Separator::Space)),
4419            },
4420            PositionalToken {
4421                source: uws,
4422                offset: 1266,
4423                length: 12,
4424                token: Token::Word(Word::Word("приказ".to_string())),
4425            },
4426            PositionalToken {
4427                source: uws,
4428                offset: 1278,
4429                length: 1,
4430                token: Token::Special(Special::Separator(Separator::Space)),
4431            },
4432            PositionalToken {
4433                source: uws,
4434                offset: 1279,
4435                length: 6,
4436                token: Token::Word(Word::Word("был".to_string())),
4437            },
4438            PositionalToken {
4439                source: uws,
4440                offset: 1285,
4441                length: 1,
4442                token: Token::Special(Special::Separator(Separator::Space)),
4443            },
4444            PositionalToken {
4445                source: uws,
4446                offset: 1286,
4447                length: 16,
4448                token: Token::Word(Word::Word("подписан".to_string())),
4449            },
4450            PositionalToken {
4451                source: uws,
4452                offset: 1302,
4453                length: 1,
4454                token: Token::Special(Special::Separator(Separator::Space)),
4455            },
4456            PositionalToken {
4457                source: uws,
4458                offset: 1303,
4459                length: 6,
4460                token: Token::Word(Word::Word("уже".to_string())),
4461            },
4462            PositionalToken {
4463                source: uws,
4464                offset: 1309,
4465                length: 1,
4466                token: Token::Special(Special::Separator(Separator::Space)),
4467            },
4468            PositionalToken {
4469                source: uws,
4470                offset: 1310,
4471                length: 2,
4472                token: Token::Word(Word::Number(Number::Integer(30))),
4473            },
4474            PositionalToken {
4475                source: uws,
4476                offset: 1312,
4477                length: 1,
4478                token: Token::Special(Special::Separator(Separator::Space)),
4479            },
4480            PositionalToken {
4481                source: uws,
4482                offset: 1313,
4483                length: 12,
4484                token: Token::Word(Word::Word("января".to_string())),
4485            },
4486            PositionalToken {
4487                source: uws,
4488                offset: 1325,
4489                length: 1,
4490                token: Token::Special(Special::Separator(Separator::Space)),
4491            },
4492            PositionalToken {
4493                source: uws,
4494                offset: 1326,
4495                length: 4,
4496                token: Token::Word(Word::Number(Number::Integer(1988))),
4497            },
4498            PositionalToken {
4499                source: uws,
4500                offset: 1330,
4501                length: 1,
4502                token: Token::Special(Special::Separator(Separator::Space)),
4503            },
4504            PositionalToken {
4505                source: uws,
4506                offset: 1331,
4507                length: 8,
4508                token: Token::Word(Word::Word("года".to_string())),
4509            },
4510            PositionalToken {
4511                source: uws,
4512                offset: 1339,
4513                length: 1,
4514                token: Token::Special(Special::Separator(Separator::Space)),
4515            },
4516            PositionalToken {
4517                source: uws,
4518                offset: 1340,
4519                length: 14,
4520                token: Token::Word(Word::Word("Борисом".to_string())),
4521            },
4522            PositionalToken {
4523                source: uws,
4524                offset: 1354,
4525                length: 1,
4526                token: Token::Special(Special::Separator(Separator::Space)),
4527            },
4528            PositionalToken {
4529                source: uws,
4530                offset: 1355,
4531                length: 16,
4532                token: Token::Word(Word::Word("Ельциным".to_string())),
4533            },
4534            PositionalToken {
4535                source: uws,
4536                offset: 1371,
4537                length: 1,
4538                token: Token::Special(Special::Punctuation('.')),
4539            },
4540            PositionalToken {
4541                source: uws,
4542                offset: 1372,
4543                length: 1,
4544                token: Token::Special(Special::Separator(Separator::Space)),
4545            },
4546            PositionalToken {
4547                source: uws,
4548                offset: 1373,
4549                length: 8,
4550                token: Token::Word(Word::Word("Было".to_string())),
4551            },
4552            PositionalToken {
4553                source: uws,
4554                offset: 1381,
4555                length: 1,
4556                token: Token::Special(Special::Separator(Separator::Space)),
4557            },
4558            PositionalToken {
4559                source: uws,
4560                offset: 1382,
4561                length: 12,
4562                token: Token::Word(Word::Word("решено".to_string())),
4563            },
4564            PositionalToken {
4565                source: uws,
4566                offset: 1394,
4567                length: 1,
4568                token: Token::Special(Special::Punctuation(',')),
4569            },
4570            PositionalToken {
4571                source: uws,
4572                offset: 1395,
4573                length: 1,
4574                token: Token::Special(Special::Separator(Separator::Space)),
4575            },
4576            PositionalToken {
4577                source: uws,
4578                offset: 1396,
4579                length: 6,
4580                token: Token::Word(Word::Word("что".to_string())),
4581            },
4582            PositionalToken {
4583                source: uws,
4584                offset: 1402,
4585                length: 1,
4586                token: Token::Special(Special::Separator(Separator::Space)),
4587            },
4588            PositionalToken {
4589                source: uws,
4590                offset: 1403,
4591                length: 16,
4592                token: Token::Word(Word::Word("ежегодно".to_string())),
4593            },
4594            PositionalToken {
4595                source: uws,
4596                offset: 1419,
4597                length: 1,
4598                token: Token::Special(Special::Separator(Separator::Space)),
4599            },
4600            PositionalToken {
4601                source: uws,
4602                offset: 1420,
4603                length: 2,
4604                token: Token::Word(Word::Word("в".to_string())),
4605            },
4606            PositionalToken {
4607                source: uws,
4608                offset: 1422,
4609                length: 1,
4610                token: Token::Special(Special::Separator(Separator::Space)),
4611            },
4612            PositionalToken {
4613                source: uws,
4614                offset: 1423,
4615                length: 12,
4616                token: Token::Word(Word::Word("России".to_string())),
4617            },
4618            PositionalToken {
4619                source: uws,
4620                offset: 1435,
4621                length: 1,
4622                token: Token::Special(Special::Separator(Separator::Space)),
4623            },
4624            PositionalToken {
4625                source: uws,
4626                offset: 1436,
4627                length: 22,
4628                token: Token::Word(Word::Word("празднество".to_string())),
4629            },
4630            PositionalToken {
4631                source: uws,
4632                offset: 1458,
4633                length: 1,
4634                token: Token::Special(Special::Separator(Separator::Space)),
4635            },
4636            PositionalToken {
4637                source: uws,
4638                offset: 1459,
4639                length: 6,
4640                token: Token::Word(Word::Word("дня".to_string())),
4641            },
4642            PositionalToken {
4643                source: uws,
4644                offset: 1465,
4645                length: 1,
4646                token: Token::Special(Special::Separator(Separator::Space)),
4647            },
4648            PositionalToken {
4649                source: uws,
4650                offset: 1466,
4651                length: 8,
4652                token: Token::Word(Word::Word("мамы".to_string())),
4653            },
4654            PositionalToken {
4655                source: uws,
4656                offset: 1474,
4657                length: 1,
4658                token: Token::Special(Special::Separator(Separator::Space)),
4659            },
4660            PositionalToken {
4661                source: uws,
4662                offset: 1475,
4663                length: 10,
4664                token: Token::Word(Word::Word("будет".to_string())),
4665            },
4666            PositionalToken {
4667                source: uws,
4668                offset: 1485,
4669                length: 1,
4670                token: Token::Special(Special::Separator(Separator::Space)),
4671            },
4672            PositionalToken {
4673                source: uws,
4674                offset: 1486,
4675                length: 16,
4676                token: Token::Word(Word::Word("выпадать".to_string())),
4677            },
4678            PositionalToken {
4679                source: uws,
4680                offset: 1502,
4681                length: 1,
4682                token: Token::Special(Special::Separator(Separator::Space)),
4683            },
4684            PositionalToken {
4685                source: uws,
4686                offset: 1503,
4687                length: 4,
4688                token: Token::Word(Word::Word("на".to_string())),
4689            },
4690            PositionalToken {
4691                source: uws,
4692                offset: 1507,
4693                length: 1,
4694                token: Token::Special(Special::Separator(Separator::Space)),
4695            },
4696            PositionalToken {
4697                source: uws,
4698                offset: 1508,
4699                length: 18,
4700                token: Token::Word(Word::Word("последнее".to_string())),
4701            },
4702            PositionalToken {
4703                source: uws,
4704                offset: 1526,
4705                length: 1,
4706                token: Token::Special(Special::Separator(Separator::Space)),
4707            },
4708            PositionalToken {
4709                source: uws,
4710                offset: 1527,
4711                length: 22,
4712                token: Token::Word(Word::Word("воскресенье".to_string())),
4713            },
4714            PositionalToken {
4715                source: uws,
4716                offset: 1549,
4717                length: 1,
4718                token: Token::Special(Special::Separator(Separator::Space)),
4719            },
4720            PositionalToken {
4721                source: uws,
4722                offset: 1550,
4723                length: 12,
4724                token: Token::Word(Word::Word("ноября".to_string())),
4725            },
4726            PositionalToken {
4727                source: uws,
4728                offset: 1562,
4729                length: 1,
4730                token: Token::Special(Special::Punctuation('.')),
4731            },
4732            PositionalToken {
4733                source: uws,
4734                offset: 1563,
4735                length: 1,
4736                token: Token::Special(Special::Separator(Separator::Space)),
4737            },
4738            PositionalToken {
4739                source: uws,
4740                offset: 1664,
4741                length: 1,
4742                token: Token::Special(Special::Separator(Separator::Newline)),
4743            },
4744            PositionalToken {
4745                source: uws,
4746                offset: 1665,
4747                length: 2,
4748                token: Token::Special(Special::Separator(Separator::Space)),
4749            },
4750            PositionalToken {
4751                source: uws,
4752                offset: 1725,
4753                length: 1,
4754                token: Token::Special(Special::Separator(Separator::Newline)),
4755            },
4756            PositionalToken {
4757                source: uws,
4758                offset: 1726,
4759                length: 4,
4760                token: Token::Special(Special::Separator(Separator::Space)),
4761            },
4762            PositionalToken {
4763                source: uws,
4764                offset: 2725,
4765                length: 1,
4766                token: Token::Special(Special::Separator(Separator::Newline)),
4767            },
4768            PositionalToken {
4769                source: uws,
4770                offset: 2726,
4771                length: 2,
4772                token: Token::Special(Special::Separator(Separator::Space)),
4773            },
4774            PositionalToken {
4775                source: uws,
4776                offset: 2888,
4777                length: 1,
4778                token: Token::Special(Special::Separator(Separator::Newline)),
4779            },
4780            PositionalToken {
4781                source: uws,
4782                offset: 2889,
4783                length: 2,
4784                token: Token::Special(Special::Separator(Separator::Space)),
4785            },
4786            PositionalToken {
4787                source: uws,
4788                offset: 2891,
4789                length: 1,
4790                token: Token::Special(Special::Separator(Separator::Newline)),
4791            },
4792            PositionalToken {
4793                source: uws,
4794                offset: 2904,
4795                length: 1,
4796                token: Token::Special(Special::Separator(Separator::Newline)),
4797            },
4798            PositionalToken {
4799                source: uws,
4800                offset: 2905,
4801                length: 4,
4802                token: Token::Special(Special::Separator(Separator::Space)),
4803            },
4804        ];
4805
4806        let text = Text::new({
4807            uws.into_source()
4808                .pipe(tagger::Builder::new().create().into_breaker())
4809                .pipe(entities::Builder::new().create().into_piped())
4810                .into_separator()
4811        })
4812        .unwrap();
4813
4814        let lib_res = text
4815            .into_tokenizer(TokenizerParams::v1())
4816            .filter_map(|tt| tt.into_original_token_1())
4817            .collect::<Vec<_>>();
4818
4819        check_results(&result, &lib_res, uws);
4820    }
4821
4822    /*#[test]
4823    fn vk_bbcode() {
4824        let uws = "[club113623432|💜💜💜 - для девушек] \n[club113623432|💛💛💛 - для сохраненок]";
4825        let result = vec![
4826            PositionalToken { offset: 0, length: 52, token: Token::BBCode { left: vec![
4827                PositionalToken { offset: 1, length: 13, token: Token::Word(Word::Numerical(Numerical::Alphanumeric("club113623432".to_string()))) },
4828                ], right: vec![
4829                PositionalToken { offset: 15, length: 4, token: Token::Word(Word::Emoji("purple_heart")) },
4830                PositionalToken { offset: 19, length: 4, token: Token::Word(Word::Emoji("purple_heart")) },
4831                PositionalToken { offset: 23, length: 4, token: Token::Word(Word::Emoji("purple_heart")) },
4832                PositionalToken { offset: 27, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
4833                PositionalToken { offset: 28, length: 1, token: Token::Special(Special::Punctuation('-')) },
4834                PositionalToken { offset: 29, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
4835                PositionalToken { offset: 30, length: 6, token: Token::Word(Word::Word("для".to_string())) },
4836                PositionalToken { offset: 36, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
4837                PositionalToken { offset: 37, length: 14, token: Token::Word(Word::Word("девушек".to_string())) },
4838                ] } },
4839            PositionalToken { offset: 52, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
4840            PositionalToken { offset: 53, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
4841            PositionalToken { offset: 54, length: 58, token: Token::BBCode { left: vec![
4842                PositionalToken { offset: 55, length: 13, token: Token::Word(Word::Numerical(Numerical::Alphanumeric("club113623432".to_string()))) },
4843                ], right: vec![
4844                PositionalToken { offset: 69, length: 4, token: Token::Word(Word::Emoji("yellow_heart")) },
4845                PositionalToken { offset: 73, length: 4, token: Token::Word(Word::Emoji("yellow_heart")) },
4846                PositionalToken { offset: 77, length: 4, token: Token::Word(Word::Emoji("yellow_heart")) },
4847                PositionalToken { offset: 81, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
4848                PositionalToken { offset: 82, length: 1, token: Token::Special(Special::Punctuation('-')) },
4849                PositionalToken { offset: 83, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
4850                PositionalToken { offset: 84, length: 6, token: Token::Word(Word::Word("для".to_string())) },
4851                PositionalToken { offset: 90, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
4852                PositionalToken { offset: 91, length: 20, token: Token::Word(Word::Word("сохраненок".to_string())) },
4853                ] } },
4854            ];
4855        let lib_res = uws.into_tokenizer(TokenizerParams::complex()).collect::<Vec<_>>();
4856        //print_result(&lib_res); panic!("");
4857        check_results(&result,&lib_res,uws);
4858    }*/
4859
4860    /*#[test]
4861    fn text_href_and_html () {
4862        let uws = "https://youtu.be/dQErLQZw3qA</a></p><figure data-type=\"102\" data-mode=\"\"  class=\"article_decoration_first article_decoration_last\" >\n";
4863        let result =  vec![
4864            PositionalToken { offset: 0, length: 28, token: Token::Struct(Struct::Url("https://youtu.be/dQErLQZw3qA".to_string())) },
4865            PositionalToken { offset: 132, length: 1, token: Token::Special(Special::Separator(Separator::Newline)) },
4866            ];
4867        let lib_res = uws.into_tokenizer(TokenizerParams::v1()).unwrap().collect::<Vec<_>>();
4868        check_results(&result,&lib_res,uws);
4869        //print_result(&lib_res); panic!("")
4870    }*/
4871
4872    #[test]
4873    fn numerical_no_split() {
4874        let uws = "12.02.18 31.28.34 23.11.2018 123.568.365.234.578 127.0.0.1 1st 1кг 123123афываыв 12321фвафыов234выалфо 12_123_343.4234_4234";
4875        let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
4876        //print_result(&lib_res); panic!("");
4877        let result = vec![
4878            PositionalToken {
4879                source: uws,
4880                offset: 0,
4881                length: 8,
4882                token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4883                    "12.02.18".to_string(),
4884                ))),
4885            },
4886            PositionalToken {
4887                source: uws,
4888                offset: 8,
4889                length: 1,
4890                token: Token::Special(Special::Separator(Separator::Space)),
4891            },
4892            PositionalToken {
4893                source: uws,
4894                offset: 9,
4895                length: 8,
4896                token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4897                    "31.28.34".to_string(),
4898                ))),
4899            },
4900            PositionalToken {
4901                source: uws,
4902                offset: 17,
4903                length: 1,
4904                token: Token::Special(Special::Separator(Separator::Space)),
4905            },
4906            PositionalToken {
4907                source: uws,
4908                offset: 18,
4909                length: 10,
4910                token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4911                    "23.11.2018".to_string(),
4912                ))),
4913            },
4914            PositionalToken {
4915                source: uws,
4916                offset: 28,
4917                length: 1,
4918                token: Token::Special(Special::Separator(Separator::Space)),
4919            },
4920            PositionalToken {
4921                source: uws,
4922                offset: 29,
4923                length: 19,
4924                token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4925                    "123.568.365.234.578".to_string(),
4926                ))),
4927            },
4928            PositionalToken {
4929                source: uws,
4930                offset: 48,
4931                length: 1,
4932                token: Token::Special(Special::Separator(Separator::Space)),
4933            },
4934            PositionalToken {
4935                source: uws,
4936                offset: 49,
4937                length: 9,
4938                token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4939                    "127.0.0.1".to_string(),
4940                ))),
4941            },
4942            PositionalToken {
4943                source: uws,
4944                offset: 58,
4945                length: 1,
4946                token: Token::Special(Special::Separator(Separator::Space)),
4947            },
4948            PositionalToken {
4949                source: uws,
4950                offset: 59,
4951                length: 3,
4952                token: Token::Word(Word::Numerical(Numerical::Measures("1st".to_string()))),
4953            },
4954            PositionalToken {
4955                source: uws,
4956                offset: 62,
4957                length: 1,
4958                token: Token::Special(Special::Separator(Separator::Space)),
4959            },
4960            PositionalToken {
4961                source: uws,
4962                offset: 63,
4963                length: 5,
4964                token: Token::Word(Word::Numerical(Numerical::Measures("1кг".to_string()))),
4965            },
4966            PositionalToken {
4967                source: uws,
4968                offset: 68,
4969                length: 1,
4970                token: Token::Special(Special::Separator(Separator::Space)),
4971            },
4972            PositionalToken {
4973                source: uws,
4974                offset: 69,
4975                length: 20,
4976                token: Token::Word(Word::Numerical(Numerical::Measures(
4977                    "123123афываыв".to_string(),
4978                ))),
4979            },
4980            PositionalToken {
4981                source: uws,
4982                offset: 89,
4983                length: 1,
4984                token: Token::Special(Special::Separator(Separator::Space)),
4985            },
4986            PositionalToken {
4987                source: uws,
4988                offset: 90,
4989                length: 34,
4990                token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
4991                    "12321фвафыов234выалфо".to_string(),
4992                ))),
4993            },
4994            PositionalToken {
4995                source: uws,
4996                offset: 124,
4997                length: 1,
4998                token: Token::Special(Special::Separator(Separator::Space)),
4999            },
5000            PositionalToken {
5001                source: uws,
5002                offset: 125,
5003                length: 20,
5004                token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
5005                    "12_123_343.4234_4234".to_string(),
5006                ))),
5007            },
5008        ];
5009        check_results(&result, &lib_res, uws);
5010    }
5011
5012    #[test]
5013    fn numerical_default() {
5014        let uws = "12.02.18 31.28.34 23.11.2018 123.568.365.234.578 127.0.0.1 1st 1кг 123123афываыв 12321фвафыов234выалфо 12_123_343.4234_4234";
5015        let lib_res = uws
5016            .into_tokenizer(TokenizerParams::v1())
5017            .collect::<Vec<_>>();
5018        //print_result(&lib_res); panic!("");
5019        let result = vec![
5020            PositionalToken {
5021                source: uws,
5022                offset: 0,
5023                length: 2,
5024                token: Token::Word(Word::Number(Number::Integer(12))),
5025            },
5026            PositionalToken {
5027                source: uws,
5028                offset: 2,
5029                length: 1,
5030                token: Token::Special(Special::Punctuation('.')),
5031            },
5032            PositionalToken {
5033                source: uws,
5034                offset: 3,
5035                length: 2,
5036                token: Token::Word(Word::Number(Number::ZeroInteger {
5037                    i: 2,
5038                    s: "02".to_string(),
5039                })),
5040            },
5041            PositionalToken {
5042                source: uws,
5043                offset: 5,
5044                length: 1,
5045                token: Token::Special(Special::Punctuation('.')),
5046            },
5047            PositionalToken {
5048                source: uws,
5049                offset: 6,
5050                length: 2,
5051                token: Token::Word(Word::Number(Number::Integer(18))),
5052            },
5053            PositionalToken {
5054                source: uws,
5055                offset: 8,
5056                length: 1,
5057                token: Token::Special(Special::Separator(Separator::Space)),
5058            },
5059            PositionalToken {
5060                source: uws,
5061                offset: 9,
5062                length: 2,
5063                token: Token::Word(Word::Number(Number::Integer(31))),
5064            },
5065            PositionalToken {
5066                source: uws,
5067                offset: 11,
5068                length: 1,
5069                token: Token::Special(Special::Punctuation('.')),
5070            },
5071            PositionalToken {
5072                source: uws,
5073                offset: 12,
5074                length: 2,
5075                token: Token::Word(Word::Number(Number::Integer(28))),
5076            },
5077            PositionalToken {
5078                source: uws,
5079                offset: 14,
5080                length: 1,
5081                token: Token::Special(Special::Punctuation('.')),
5082            },
5083            PositionalToken {
5084                source: uws,
5085                offset: 15,
5086                length: 2,
5087                token: Token::Word(Word::Number(Number::Integer(34))),
5088            },
5089            PositionalToken {
5090                source: uws,
5091                offset: 17,
5092                length: 1,
5093                token: Token::Special(Special::Separator(Separator::Space)),
5094            },
5095            PositionalToken {
5096                source: uws,
5097                offset: 18,
5098                length: 2,
5099                token: Token::Word(Word::Number(Number::Integer(23))),
5100            },
5101            PositionalToken {
5102                source: uws,
5103                offset: 20,
5104                length: 1,
5105                token: Token::Special(Special::Punctuation('.')),
5106            },
5107            PositionalToken {
5108                source: uws,
5109                offset: 21,
5110                length: 2,
5111                token: Token::Word(Word::Number(Number::Integer(11))),
5112            },
5113            PositionalToken {
5114                source: uws,
5115                offset: 23,
5116                length: 1,
5117                token: Token::Special(Special::Punctuation('.')),
5118            },
5119            PositionalToken {
5120                source: uws,
5121                offset: 24,
5122                length: 4,
5123                token: Token::Word(Word::Number(Number::Integer(2018))),
5124            },
5125            PositionalToken {
5126                source: uws,
5127                offset: 28,
5128                length: 1,
5129                token: Token::Special(Special::Separator(Separator::Space)),
5130            },
5131            PositionalToken {
5132                source: uws,
5133                offset: 29,
5134                length: 3,
5135                token: Token::Word(Word::Number(Number::Integer(123))),
5136            },
5137            PositionalToken {
5138                source: uws,
5139                offset: 32,
5140                length: 1,
5141                token: Token::Special(Special::Punctuation('.')),
5142            },
5143            PositionalToken {
5144                source: uws,
5145                offset: 33,
5146                length: 3,
5147                token: Token::Word(Word::Number(Number::Integer(568))),
5148            },
5149            PositionalToken {
5150                source: uws,
5151                offset: 36,
5152                length: 1,
5153                token: Token::Special(Special::Punctuation('.')),
5154            },
5155            PositionalToken {
5156                source: uws,
5157                offset: 37,
5158                length: 3,
5159                token: Token::Word(Word::Number(Number::Integer(365))),
5160            },
5161            PositionalToken {
5162                source: uws,
5163                offset: 40,
5164                length: 1,
5165                token: Token::Special(Special::Punctuation('.')),
5166            },
5167            PositionalToken {
5168                source: uws,
5169                offset: 41,
5170                length: 3,
5171                token: Token::Word(Word::Number(Number::Integer(234))),
5172            },
5173            PositionalToken {
5174                source: uws,
5175                offset: 44,
5176                length: 1,
5177                token: Token::Special(Special::Punctuation('.')),
5178            },
5179            PositionalToken {
5180                source: uws,
5181                offset: 45,
5182                length: 3,
5183                token: Token::Word(Word::Number(Number::Integer(578))),
5184            },
5185            PositionalToken {
5186                source: uws,
5187                offset: 48,
5188                length: 1,
5189                token: Token::Special(Special::Separator(Separator::Space)),
5190            },
5191            PositionalToken {
5192                source: uws,
5193                offset: 49,
5194                length: 3,
5195                token: Token::Word(Word::Number(Number::Integer(127))),
5196            },
5197            PositionalToken {
5198                source: uws,
5199                offset: 52,
5200                length: 1,
5201                token: Token::Special(Special::Punctuation('.')),
5202            },
5203            PositionalToken {
5204                source: uws,
5205                offset: 53,
5206                length: 1,
5207                token: Token::Word(Word::Number(Number::ZeroInteger {
5208                    i: 0,
5209                    s: "0".to_string(),
5210                })),
5211            },
5212            PositionalToken {
5213                source: uws,
5214                offset: 54,
5215                length: 1,
5216                token: Token::Special(Special::Punctuation('.')),
5217            },
5218            PositionalToken {
5219                source: uws,
5220                offset: 55,
5221                length: 1,
5222                token: Token::Word(Word::Number(Number::ZeroInteger {
5223                    i: 0,
5224                    s: "0".to_string(),
5225                })),
5226            },
5227            PositionalToken {
5228                source: uws,
5229                offset: 56,
5230                length: 1,
5231                token: Token::Special(Special::Punctuation('.')),
5232            },
5233            PositionalToken {
5234                source: uws,
5235                offset: 57,
5236                length: 1,
5237                token: Token::Word(Word::Number(Number::Integer(1))),
5238            },
5239            PositionalToken {
5240                source: uws,
5241                offset: 58,
5242                length: 1,
5243                token: Token::Special(Special::Separator(Separator::Space)),
5244            },
5245            PositionalToken {
5246                source: uws,
5247                offset: 59,
5248                length: 3,
5249                token: Token::Word(Word::Numerical(Numerical::Measures("1st".to_string()))),
5250            },
5251            PositionalToken {
5252                source: uws,
5253                offset: 62,
5254                length: 1,
5255                token: Token::Special(Special::Separator(Separator::Space)),
5256            },
5257            PositionalToken {
5258                source: uws,
5259                offset: 63,
5260                length: 5,
5261                token: Token::Word(Word::Numerical(Numerical::Measures("1кг".to_string()))),
5262            },
5263            PositionalToken {
5264                source: uws,
5265                offset: 68,
5266                length: 1,
5267                token: Token::Special(Special::Separator(Separator::Space)),
5268            },
5269            PositionalToken {
5270                source: uws,
5271                offset: 69,
5272                length: 20,
5273                token: Token::Word(Word::Numerical(Numerical::Measures(
5274                    "123123афываыв".to_string(),
5275                ))),
5276            },
5277            PositionalToken {
5278                source: uws,
5279                offset: 89,
5280                length: 1,
5281                token: Token::Special(Special::Separator(Separator::Space)),
5282            },
5283            PositionalToken {
5284                source: uws,
5285                offset: 90,
5286                length: 34,
5287                token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
5288                    "12321фвафыов234выалфо".to_string(),
5289                ))),
5290            },
5291            PositionalToken {
5292                source: uws,
5293                offset: 124,
5294                length: 1,
5295                token: Token::Special(Special::Separator(Separator::Space)),
5296            },
5297            PositionalToken {
5298                source: uws,
5299                offset: 125,
5300                length: 2,
5301                token: Token::Word(Word::Number(Number::Integer(12))),
5302            },
5303            PositionalToken {
5304                source: uws,
5305                offset: 127,
5306                length: 1,
5307                token: Token::Special(Special::Punctuation('_')),
5308            },
5309            PositionalToken {
5310                source: uws,
5311                offset: 128,
5312                length: 3,
5313                token: Token::Word(Word::Number(Number::Integer(123))),
5314            },
5315            PositionalToken {
5316                source: uws,
5317                offset: 131,
5318                length: 1,
5319                token: Token::Special(Special::Punctuation('_')),
5320            },
5321            PositionalToken {
5322                source: uws,
5323                offset: 132,
5324                length: 3,
5325                token: Token::Word(Word::Number(Number::Integer(343))),
5326            },
5327            PositionalToken {
5328                source: uws,
5329                offset: 135,
5330                length: 1,
5331                token: Token::Special(Special::Punctuation('.')),
5332            },
5333            PositionalToken {
5334                source: uws,
5335                offset: 136,
5336                length: 4,
5337                token: Token::Word(Word::Number(Number::Integer(4234))),
5338            },
5339            PositionalToken {
5340                source: uws,
5341                offset: 140,
5342                length: 1,
5343                token: Token::Special(Special::Punctuation('_')),
5344            },
5345            PositionalToken {
5346                source: uws,
5347                offset: 141,
5348                length: 4,
5349                token: Token::Word(Word::Number(Number::Integer(4234))),
5350            },
5351        ];
5352        check_results(&result, &lib_res, uws);
5353    }
5354
5355    /*#[test]
5356        fn new_test() {
5357            let uws = "";
5358            let lib_res = uws.into_tokenizer(TokenizerParams::v1()).unwrap().collect::<Vec<_>>();
5359            print_result(&lib_res); panic!("");
5360            let result = vec![];
5361            check_results(&result,&lib_res,uws);
5362
5363    }*/
5364
5365    /* Language tests */
5366
    /// Languages exercised by the script-specific tokenization tests below.
    /// Each variant selects one fixed sample text (and its expected token
    /// list) inside `get_lang_test`.
    enum Lang {
        Zho, // Chinese
        Jpn, // Japanese
        Kor, // Korean
        Ara, // Arabic script — NOTE(review): the sample text looks Persian; confirm
        Ell, // Modern Greek
    }
5374
5375    #[test]
5376    fn test_lang_zho() {
5377        let (uws, result) = get_lang_test(Lang::Zho);
5378        let lib_res = uws
5379            .into_tokenizer(TokenizerParams::v1())
5380            .collect::<Vec<_>>();
5381        check_results(&result, &lib_res, &uws);
5382    }
5383
5384    #[test]
5385    fn test_lang_jpn() {
5386        let (uws, result) = get_lang_test(Lang::Jpn);
5387        let lib_res = uws
5388            .into_tokenizer(TokenizerParams::v1())
5389            .collect::<Vec<_>>();
5390        check_results(&result, &lib_res, &uws);
5391    }
5392
5393    #[test]
5394    fn test_lang_kor() {
5395        let (uws, result) = get_lang_test(Lang::Kor);
5396        let lib_res = uws
5397            .into_tokenizer(TokenizerParams::v1())
5398            .collect::<Vec<_>>();
5399        check_results(&result, &lib_res, &uws);
5400    }
5401
5402    #[test]
5403    fn test_lang_ara() {
5404        let (uws, result) = get_lang_test(Lang::Ara);
5405        let lib_res = uws
5406            .into_tokenizer(TokenizerParams::v1())
5407            .collect::<Vec<_>>();
5408        check_results(&result, &lib_res, &uws);
5409    }
5410
5411    #[test]
5412    fn test_lang_ell() {
5413        let (uws, result) = get_lang_test(Lang::Ell);
5414        let lib_res = uws
5415            .into_tokenizer(TokenizerParams::v1())
5416            .collect::<Vec<_>>();
5417        check_results(&result, &lib_res, &uws);
5418    }
5419
5420    fn get_lang_test(lng: Lang) -> (String, Vec<PositionalToken>) {
5421        let uws = match lng {
5422            Lang::Zho => {
5423                "美国电视连续剧《超人前传》的第一集《试播集》于2001年10月16日在電視網首播,剧集主创人阿尔弗雷德·高夫和迈尔斯·米勒編劇,大卫·努特尔执导。这一试播首次向观众引荐了克拉克·肯特一角,他是位拥有超能力的外星孤儿,与家人和朋友一起在堪薩斯州虚构小镇斯莫维尔生活。在这一集里,肯特首度得知自己的来历,同时还需要阻止一位学生试图杀死镇上高中多名学生的报复之举。本集节目里引入了多个之后将贯穿全季甚至整部剧集的主题元素,例如几位主要角色之间的三角恋情。电视剧在加拿大溫哥華取景,旨在选用其“美国中产阶级”景观,主创人花了5个月的时间专门用于为主角物色合适的演员。试播集在所有演员选好4天后正式开拍。由于时间上的限制,剧组无法搭建好实体外景,因此只能使用计算机绘图技术将数字化的外景插入到镜头中。节目一经上映就打破了电视网的多项收视纪录,并且获得了评论员的普遍好评和多个奖项提名,并在其中两项上胜出"
5424            }
5425            Lang::Kor => {
5426                "플레이스테이션 은 소니 컴퓨터 엔터테인먼트가 개발한 세 번째 가정용 게임기이다. 마이크로소프트의 엑스박스 360, 닌텐도의 Wii와 경쟁하고 있다. 이전 제품에서 온라인 플레이 기능을 비디오 게임 개발사에 전적으로 의존하던 것과 달리 통합 온라인 게임 서비스인 플레이스테이션 네트워크 서비스를 발매와 함께 시작해 제공하고 있으며, 탄탄한 멀티미디어 재생 기능, 플레이스테이션 포터블과의 연결, 고화질 광학 디스크 포맷인 블루레이 디스크 재생 기능 등의 기능을 갖추고 있다. 2006년 11월 11일에 일본에서 처음으로 출시했으며, 11월 17일에는 북미 지역, 2007년 3월 23일에는 유럽과 오세아니아 지역에서, 대한민국의 경우 6월 5일부터 일주일간 예약판매를 실시해, 매일 준비한 수량이 동이 나는 등 많은 관심을 받았으며 6월 16일에 정식 출시 행사를 열었다"
5427            }
5428            Lang::Jpn => {
5429                "熊野三山本願所は、15世紀末以降における熊野三山(熊野本宮、熊野新宮、熊野那智)の造営・修造のための勧進を担った組織の総称。 熊野三山を含めて、日本における古代から中世前半にかけての寺社の造営は、寺社領経営のような恒常的財源、幕府や朝廷などからの一時的な造営料所の寄進、あるいは公権力からの臨時の保護によって行われていた。しかしながら、熊野三山では、これらの財源はすべて15世紀半ばまでに実効性を失った"
5430            }
5431            Lang::Ara => {
5432                "لشکرکشی‌های روس‌های وارنگی به دریای خزر مجموعه‌ای از حملات نظامی در بین سال‌های ۸۶۴ تا ۱۰۴۱ میلادی به سواحل دریای خزر بوده‌است. روس‌های وارنگی ابتدا در قرن نهم میلادی به عنوان بازرگانان پوست، عسل و برده در سرزمین‌های اسلامی(سرکلند) ظاهر شدند. این بازرگانان در مسیر تجاری ولگا به خرید و فروش می‌پرداختند. نخستین حملهٔ آنان در فاصله سال‌های ۸۶۴ تا ۸۸۴ میلادی در مقیاسی کوچک علیه علویان طبرستان رخ داد. نخستین یورش بزرگ روس‌ها در سال ۹۱۳ رخ داد و آنان با ۵۰۰ فروند درازکشتی شهر گرگان و اطراف آن را غارت کردند. آن‌ها در این حمله مقداری کالا و برده را به تاراج بردند و در راه بازگشتن به سمت شمال، در دلتای ولگا، مورد حملهٔ خزرهای مسلمان قرار گرفتند و بعضی از آنان موفق به فرار شدند، ولی در میانهٔ ولگا به قتل رسیدند. دومین هجوم بزرگ روس‌ها به دریای خزر در سال ۹۴۳ به وقوع پیوست. در این دوره ایگور یکم، حاکم روس کیف، رهبری روس‌ها را در دست داشت. روس‌ها پس از توافق با دولت خزرها برای عبور امن از منطقه، تا رود کورا و اعماق قفقاز پیش رفتند و در سال ۹۴۳ موفق شدند بندر بردعه، پایتخت اران (جمهوری آذربایجان کنونی)، را تصرف کنند. روس‌ها در آنجا به مدت چند ماه ماندند و بسیاری از ساکنان شهر را کشتند و از راه غارت‌گری اموالی را به تاراج بردند. تنها دلیل بازگشت آنان "
5433            }
5434            Lang::Ell => {
5435                "Το Πρόγραμμα υλοποιείται εξ ολοκλήρου από απόσταση και μπορεί να συμμετέχει κάθε εμπλεκόμενος στη ή/και ενδιαφερόμενος για τη διδασκαλία της Ελληνικής ως δεύτερης/ξένης γλώσσας στην Ελλάδα και στο εξωτερικό, αρκεί να είναι απόφοιτος ελληνικής φιλολογίας, ξένων φιλολογιών, παιδαγωγικών τμημάτων, θεολογικών σχολών ή άλλων πανεπιστημιακών τμημάτων ελληνικών ή ισότιμων ξένων πανεπιστημίων. Υπό όρους γίνονται δεκτοί υποψήφιοι που δεν έχουν ολοκληρώσει σπουδές τριτοβάθμιας εκπαίδευσης."
5436            }
5437        };
5438        let tokens = match lng {
5439            Lang::Zho => vec![
5440                PositionalToken {
5441                    source: uws,
5442                    offset: 0,
5443                    length: 3,
5444                    token: Token::Word(Word::Word("美".to_string())),
5445                },
5446                PositionalToken {
5447                    source: uws,
5448                    offset: 3,
5449                    length: 3,
5450                    token: Token::Word(Word::Word("国".to_string())),
5451                },
5452                PositionalToken {
5453                    source: uws,
5454                    offset: 6,
5455                    length: 3,
5456                    token: Token::Word(Word::Word("电".to_string())),
5457                },
5458                PositionalToken {
5459                    source: uws,
5460                    offset: 9,
5461                    length: 3,
5462                    token: Token::Word(Word::Word("视".to_string())),
5463                },
5464                PositionalToken {
5465                    source: uws,
5466                    offset: 12,
5467                    length: 3,
5468                    token: Token::Word(Word::Word("连".to_string())),
5469                },
5470                PositionalToken {
5471                    source: uws,
5472                    offset: 15,
5473                    length: 3,
5474                    token: Token::Word(Word::Word("续".to_string())),
5475                },
5476                PositionalToken {
5477                    source: uws,
5478                    offset: 18,
5479                    length: 3,
5480                    token: Token::Word(Word::Word("剧".to_string())),
5481                },
5482                PositionalToken {
5483                    source: uws,
5484                    offset: 21,
5485                    length: 3,
5486                    token: Token::Special(Special::Punctuation('《')),
5487                },
5488                PositionalToken {
5489                    source: uws,
5490                    offset: 24,
5491                    length: 3,
5492                    token: Token::Word(Word::Word("超".to_string())),
5493                },
5494                PositionalToken {
5495                    source: uws,
5496                    offset: 27,
5497                    length: 3,
5498                    token: Token::Word(Word::Word("人".to_string())),
5499                },
5500                PositionalToken {
5501                    source: uws,
5502                    offset: 30,
5503                    length: 3,
5504                    token: Token::Word(Word::Word("前".to_string())),
5505                },
5506                PositionalToken {
5507                    source: uws,
5508                    offset: 33,
5509                    length: 3,
5510                    token: Token::Word(Word::Word("传".to_string())),
5511                },
5512                PositionalToken {
5513                    source: uws,
5514                    offset: 36,
5515                    length: 3,
5516                    token: Token::Special(Special::Punctuation('》')),
5517                },
5518                PositionalToken {
5519                    source: uws,
5520                    offset: 39,
5521                    length: 3,
5522                    token: Token::Word(Word::Word("的".to_string())),
5523                },
5524                PositionalToken {
5525                    source: uws,
5526                    offset: 42,
5527                    length: 3,
5528                    token: Token::Word(Word::Word("第".to_string())),
5529                },
5530                PositionalToken {
5531                    source: uws,
5532                    offset: 45,
5533                    length: 3,
5534                    token: Token::Word(Word::Word("一".to_string())),
5535                },
5536                PositionalToken {
5537                    source: uws,
5538                    offset: 48,
5539                    length: 3,
5540                    token: Token::Word(Word::Word("集".to_string())),
5541                },
5542                PositionalToken {
5543                    source: uws,
5544                    offset: 51,
5545                    length: 3,
5546                    token: Token::Special(Special::Punctuation('《')),
5547                },
5548                PositionalToken {
5549                    source: uws,
5550                    offset: 54,
5551                    length: 3,
5552                    token: Token::Word(Word::Word("试".to_string())),
5553                },
5554                PositionalToken {
5555                    source: uws,
5556                    offset: 57,
5557                    length: 3,
5558                    token: Token::Word(Word::Word("播".to_string())),
5559                },
5560                PositionalToken {
5561                    source: uws,
5562                    offset: 60,
5563                    length: 3,
5564                    token: Token::Word(Word::Word("集".to_string())),
5565                },
5566                PositionalToken {
5567                    source: uws,
5568                    offset: 63,
5569                    length: 3,
5570                    token: Token::Special(Special::Punctuation('》')),
5571                },
5572                PositionalToken {
5573                    source: uws,
5574                    offset: 66,
5575                    length: 3,
5576                    token: Token::Word(Word::Word("于".to_string())),
5577                },
5578                PositionalToken {
5579                    source: uws,
5580                    offset: 69,
5581                    length: 4,
5582                    token: Token::Word(Word::Number(Number::Integer(2001))),
5583                },
5584                PositionalToken {
5585                    source: uws,
5586                    offset: 73,
5587                    length: 3,
5588                    token: Token::Word(Word::Word("年".to_string())),
5589                },
5590                PositionalToken {
5591                    source: uws,
5592                    offset: 76,
5593                    length: 2,
5594                    token: Token::Word(Word::Number(Number::Integer(10))),
5595                },
5596                PositionalToken {
5597                    source: uws,
5598                    offset: 78,
5599                    length: 3,
5600                    token: Token::Word(Word::Word("月".to_string())),
5601                },
5602                PositionalToken {
5603                    source: uws,
5604                    offset: 81,
5605                    length: 2,
5606                    token: Token::Word(Word::Number(Number::Integer(16))),
5607                },
5608                PositionalToken {
5609                    source: uws,
5610                    offset: 83,
5611                    length: 3,
5612                    token: Token::Word(Word::Word("日".to_string())),
5613                },
5614                PositionalToken {
5615                    source: uws,
5616                    offset: 86,
5617                    length: 3,
5618                    token: Token::Word(Word::Word("在".to_string())),
5619                },
5620                PositionalToken {
5621                    source: uws,
5622                    offset: 89,
5623                    length: 3,
5624                    token: Token::Word(Word::Word("電".to_string())),
5625                },
5626                PositionalToken {
5627                    source: uws,
5628                    offset: 92,
5629                    length: 3,
5630                    token: Token::Word(Word::Word("視".to_string())),
5631                },
5632                PositionalToken {
5633                    source: uws,
5634                    offset: 95,
5635                    length: 3,
5636                    token: Token::Word(Word::Word("網".to_string())),
5637                },
5638                PositionalToken {
5639                    source: uws,
5640                    offset: 98,
5641                    length: 3,
5642                    token: Token::Word(Word::Word("首".to_string())),
5643                },
5644                PositionalToken {
5645                    source: uws,
5646                    offset: 101,
5647                    length: 3,
5648                    token: Token::Word(Word::Word("播".to_string())),
5649                },
5650                PositionalToken {
5651                    source: uws,
5652                    offset: 104,
5653                    length: 3,
5654                    token: Token::Special(Special::Punctuation(',')),
5655                },
5656                PositionalToken {
5657                    source: uws,
5658                    offset: 107,
5659                    length: 3,
5660                    token: Token::Word(Word::Word("剧".to_string())),
5661                },
5662                PositionalToken {
5663                    source: uws,
5664                    offset: 110,
5665                    length: 3,
5666                    token: Token::Word(Word::Word("集".to_string())),
5667                },
5668                PositionalToken {
5669                    source: uws,
5670                    offset: 113,
5671                    length: 3,
5672                    token: Token::Word(Word::Word("主".to_string())),
5673                },
5674                PositionalToken {
5675                    source: uws,
5676                    offset: 116,
5677                    length: 3,
5678                    token: Token::Word(Word::Word("创".to_string())),
5679                },
5680                PositionalToken {
5681                    source: uws,
5682                    offset: 119,
5683                    length: 3,
5684                    token: Token::Word(Word::Word("人".to_string())),
5685                },
5686                PositionalToken {
5687                    source: uws,
5688                    offset: 122,
5689                    length: 3,
5690                    token: Token::Word(Word::Word("阿".to_string())),
5691                },
5692                PositionalToken {
5693                    source: uws,
5694                    offset: 125,
5695                    length: 3,
5696                    token: Token::Word(Word::Word("尔".to_string())),
5697                },
5698                PositionalToken {
5699                    source: uws,
5700                    offset: 128,
5701                    length: 3,
5702                    token: Token::Word(Word::Word("弗".to_string())),
5703                },
5704                PositionalToken {
5705                    source: uws,
5706                    offset: 131,
5707                    length: 3,
5708                    token: Token::Word(Word::Word("雷".to_string())),
5709                },
5710                PositionalToken {
5711                    source: uws,
5712                    offset: 134,
5713                    length: 3,
5714                    token: Token::Word(Word::Word("德".to_string())),
5715                },
5716                PositionalToken {
5717                    source: uws,
5718                    offset: 137,
5719                    length: 2,
5720                    token: Token::Special(Special::Punctuation('·')),
5721                },
5722                PositionalToken {
5723                    source: uws,
5724                    offset: 139,
5725                    length: 3,
5726                    token: Token::Word(Word::Word("高".to_string())),
5727                },
5728                PositionalToken {
5729                    source: uws,
5730                    offset: 142,
5731                    length: 3,
5732                    token: Token::Word(Word::Word("夫".to_string())),
5733                },
5734                PositionalToken {
5735                    source: uws,
5736                    offset: 145,
5737                    length: 3,
5738                    token: Token::Word(Word::Word("和".to_string())),
5739                },
5740                PositionalToken {
5741                    source: uws,
5742                    offset: 148,
5743                    length: 3,
5744                    token: Token::Word(Word::Word("迈".to_string())),
5745                },
5746                PositionalToken {
5747                    source: uws,
5748                    offset: 151,
5749                    length: 3,
5750                    token: Token::Word(Word::Word("尔".to_string())),
5751                },
5752                PositionalToken {
5753                    source: uws,
5754                    offset: 154,
5755                    length: 3,
5756                    token: Token::Word(Word::Word("斯".to_string())),
5757                },
5758                PositionalToken {
5759                    source: uws,
5760                    offset: 157,
5761                    length: 2,
5762                    token: Token::Special(Special::Punctuation('·')),
5763                },
5764                PositionalToken {
5765                    source: uws,
5766                    offset: 159,
5767                    length: 3,
5768                    token: Token::Word(Word::Word("米".to_string())),
5769                },
5770                PositionalToken {
5771                    source: uws,
5772                    offset: 162,
5773                    length: 3,
5774                    token: Token::Word(Word::Word("勒".to_string())),
5775                },
5776                PositionalToken {
5777                    source: uws,
5778                    offset: 165,
5779                    length: 3,
5780                    token: Token::Word(Word::Word("編".to_string())),
5781                },
5782                PositionalToken {
5783                    source: uws,
5784                    offset: 168,
5785                    length: 3,
5786                    token: Token::Word(Word::Word("劇".to_string())),
5787                },
5788                PositionalToken {
5789                    source: uws,
5790                    offset: 171,
5791                    length: 3,
5792                    token: Token::Special(Special::Punctuation(',')),
5793                },
5794                PositionalToken {
5795                    source: uws,
5796                    offset: 174,
5797                    length: 3,
5798                    token: Token::Word(Word::Word("大".to_string())),
5799                },
5800                PositionalToken {
5801                    source: uws,
5802                    offset: 177,
5803                    length: 3,
5804                    token: Token::Word(Word::Word("卫".to_string())),
5805                },
5806                PositionalToken {
5807                    source: uws,
5808                    offset: 180,
5809                    length: 2,
5810                    token: Token::Special(Special::Punctuation('·')),
5811                },
5812                PositionalToken {
5813                    source: uws,
5814                    offset: 182,
5815                    length: 3,
5816                    token: Token::Word(Word::Word("努".to_string())),
5817                },
5818                PositionalToken {
5819                    source: uws,
5820                    offset: 185,
5821                    length: 3,
5822                    token: Token::Word(Word::Word("特".to_string())),
5823                },
5824                PositionalToken {
5825                    source: uws,
5826                    offset: 188,
5827                    length: 3,
5828                    token: Token::Word(Word::Word("尔".to_string())),
5829                },
5830                PositionalToken {
5831                    source: uws,
5832                    offset: 191,
5833                    length: 3,
5834                    token: Token::Word(Word::Word("执".to_string())),
5835                },
5836                PositionalToken {
5837                    source: uws,
5838                    offset: 194,
5839                    length: 3,
5840                    token: Token::Word(Word::Word("导".to_string())),
5841                },
5842                PositionalToken {
5843                    source: uws,
5844                    offset: 197,
5845                    length: 3,
5846                    token: Token::Special(Special::Punctuation('。')),
5847                },
5848                PositionalToken {
5849                    source: uws,
5850                    offset: 200,
5851                    length: 3,
5852                    token: Token::Word(Word::Word("这".to_string())),
5853                },
5854                PositionalToken {
5855                    source: uws,
5856                    offset: 203,
5857                    length: 3,
5858                    token: Token::Word(Word::Word("一".to_string())),
5859                },
5860                PositionalToken {
5861                    source: uws,
5862                    offset: 206,
5863                    length: 3,
5864                    token: Token::Word(Word::Word("试".to_string())),
5865                },
5866                PositionalToken {
5867                    source: uws,
5868                    offset: 209,
5869                    length: 3,
5870                    token: Token::Word(Word::Word("播".to_string())),
5871                },
5872                PositionalToken {
5873                    source: uws,
5874                    offset: 212,
5875                    length: 3,
5876                    token: Token::Word(Word::Word("首".to_string())),
5877                },
5878                PositionalToken {
5879                    source: uws,
5880                    offset: 215,
5881                    length: 3,
5882                    token: Token::Word(Word::Word("次".to_string())),
5883                },
5884                PositionalToken {
5885                    source: uws,
5886                    offset: 218,
5887                    length: 3,
5888                    token: Token::Word(Word::Word("向".to_string())),
5889                },
5890                PositionalToken {
5891                    source: uws,
5892                    offset: 221,
5893                    length: 3,
5894                    token: Token::Word(Word::Word("观".to_string())),
5895                },
5896                PositionalToken {
5897                    source: uws,
5898                    offset: 224,
5899                    length: 3,
5900                    token: Token::Word(Word::Word("众".to_string())),
5901                },
5902                PositionalToken {
5903                    source: uws,
5904                    offset: 227,
5905                    length: 3,
5906                    token: Token::Word(Word::Word("引".to_string())),
5907                },
5908                PositionalToken {
5909                    source: uws,
5910                    offset: 230,
5911                    length: 3,
5912                    token: Token::Word(Word::Word("荐".to_string())),
5913                },
5914                PositionalToken {
5915                    source: uws,
5916                    offset: 233,
5917                    length: 3,
5918                    token: Token::Word(Word::Word("了".to_string())),
5919                },
5920                PositionalToken {
5921                    source: uws,
5922                    offset: 236,
5923                    length: 3,
5924                    token: Token::Word(Word::Word("克".to_string())),
5925                },
5926                PositionalToken {
5927                    source: uws,
5928                    offset: 239,
5929                    length: 3,
5930                    token: Token::Word(Word::Word("拉".to_string())),
5931                },
5932                PositionalToken {
5933                    source: uws,
5934                    offset: 242,
5935                    length: 3,
5936                    token: Token::Word(Word::Word("克".to_string())),
5937                },
5938                PositionalToken {
5939                    source: uws,
5940                    offset: 245,
5941                    length: 2,
5942                    token: Token::Special(Special::Punctuation('·')),
5943                },
5944                PositionalToken {
5945                    source: uws,
5946                    offset: 247,
5947                    length: 3,
5948                    token: Token::Word(Word::Word("肯".to_string())),
5949                },
5950                PositionalToken {
5951                    source: uws,
5952                    offset: 250,
5953                    length: 3,
5954                    token: Token::Word(Word::Word("特".to_string())),
5955                },
5956                PositionalToken {
5957                    source: uws,
5958                    offset: 253,
5959                    length: 3,
5960                    token: Token::Word(Word::Word("一".to_string())),
5961                },
5962                PositionalToken {
5963                    source: uws,
5964                    offset: 256,
5965                    length: 3,
5966                    token: Token::Word(Word::Word("角".to_string())),
5967                },
5968                PositionalToken {
5969                    source: uws,
5970                    offset: 259,
5971                    length: 3,
5972                    token: Token::Special(Special::Punctuation(',')),
5973                },
5974                PositionalToken {
5975                    source: uws,
5976                    offset: 262,
5977                    length: 3,
5978                    token: Token::Word(Word::Word("他".to_string())),
5979                },
5980                PositionalToken {
5981                    source: uws,
5982                    offset: 265,
5983                    length: 3,
5984                    token: Token::Word(Word::Word("是".to_string())),
5985                },
5986                PositionalToken {
5987                    source: uws,
5988                    offset: 268,
5989                    length: 3,
5990                    token: Token::Word(Word::Word("位".to_string())),
5991                },
5992                PositionalToken {
5993                    source: uws,
5994                    offset: 271,
5995                    length: 3,
5996                    token: Token::Word(Word::Word("拥".to_string())),
5997                },
5998                PositionalToken {
5999                    source: uws,
6000                    offset: 274,
6001                    length: 3,
6002                    token: Token::Word(Word::Word("有".to_string())),
6003                },
6004                PositionalToken {
6005                    source: uws,
6006                    offset: 277,
6007                    length: 3,
6008                    token: Token::Word(Word::Word("超".to_string())),
6009                },
6010            ],
6011            Lang::Jpn => vec![
6012                PositionalToken {
6013                    source: uws,
6014                    offset: 0,
6015                    length: 3,
6016                    token: Token::Word(Word::Word("熊".to_string())),
6017                },
6018                PositionalToken {
6019                    source: uws,
6020                    offset: 3,
6021                    length: 3,
6022                    token: Token::Word(Word::Word("野".to_string())),
6023                },
6024                PositionalToken {
6025                    source: uws,
6026                    offset: 6,
6027                    length: 3,
6028                    token: Token::Word(Word::Word("三".to_string())),
6029                },
6030                PositionalToken {
6031                    source: uws,
6032                    offset: 9,
6033                    length: 3,
6034                    token: Token::Word(Word::Word("山".to_string())),
6035                },
6036                PositionalToken {
6037                    source: uws,
6038                    offset: 12,
6039                    length: 3,
6040                    token: Token::Word(Word::Word("本".to_string())),
6041                },
6042                PositionalToken {
6043                    source: uws,
6044                    offset: 15,
6045                    length: 3,
6046                    token: Token::Word(Word::Word("願".to_string())),
6047                },
6048                PositionalToken {
6049                    source: uws,
6050                    offset: 18,
6051                    length: 3,
6052                    token: Token::Word(Word::Word("所".to_string())),
6053                },
6054                PositionalToken {
6055                    source: uws,
6056                    offset: 21,
6057                    length: 3,
6058                    token: Token::Word(Word::Word("は".to_string())),
6059                },
6060                PositionalToken {
6061                    source: uws,
6062                    offset: 24,
6063                    length: 3,
6064                    token: Token::Special(Special::Punctuation('、')),
6065                },
6066                PositionalToken {
6067                    source: uws,
6068                    offset: 27,
6069                    length: 2,
6070                    token: Token::Word(Word::Number(Number::Integer(15))),
6071                },
6072                PositionalToken {
6073                    source: uws,
6074                    offset: 29,
6075                    length: 3,
6076                    token: Token::Word(Word::Word("世".to_string())),
6077                },
6078                PositionalToken {
6079                    source: uws,
6080                    offset: 32,
6081                    length: 3,
6082                    token: Token::Word(Word::Word("紀".to_string())),
6083                },
6084                PositionalToken {
6085                    source: uws,
6086                    offset: 35,
6087                    length: 3,
6088                    token: Token::Word(Word::Word("末".to_string())),
6089                },
6090                PositionalToken {
6091                    source: uws,
6092                    offset: 38,
6093                    length: 3,
6094                    token: Token::Word(Word::Word("以".to_string())),
6095                },
6096                PositionalToken {
6097                    source: uws,
6098                    offset: 41,
6099                    length: 3,
6100                    token: Token::Word(Word::Word("降".to_string())),
6101                },
6102                PositionalToken {
6103                    source: uws,
6104                    offset: 44,
6105                    length: 3,
6106                    token: Token::Word(Word::Word("に".to_string())),
6107                },
6108                PositionalToken {
6109                    source: uws,
6110                    offset: 47,
6111                    length: 3,
6112                    token: Token::Word(Word::Word("お".to_string())),
6113                },
6114                PositionalToken {
6115                    source: uws,
6116                    offset: 50,
6117                    length: 3,
6118                    token: Token::Word(Word::Word("け".to_string())),
6119                },
6120                PositionalToken {
6121                    source: uws,
6122                    offset: 53,
6123                    length: 3,
6124                    token: Token::Word(Word::Word("る".to_string())),
6125                },
6126                PositionalToken {
6127                    source: uws,
6128                    offset: 56,
6129                    length: 3,
6130                    token: Token::Word(Word::Word("熊".to_string())),
6131                },
6132                PositionalToken {
6133                    source: uws,
6134                    offset: 59,
6135                    length: 3,
6136                    token: Token::Word(Word::Word("野".to_string())),
6137                },
6138                PositionalToken {
6139                    source: uws,
6140                    offset: 62,
6141                    length: 3,
6142                    token: Token::Word(Word::Word("三".to_string())),
6143                },
6144                PositionalToken {
6145                    source: uws,
6146                    offset: 65,
6147                    length: 3,
6148                    token: Token::Word(Word::Word("山".to_string())),
6149                },
6150                PositionalToken {
6151                    source: uws,
6152                    offset: 68,
6153                    length: 3,
6154                    token: Token::Special(Special::Punctuation('(')),
6155                },
6156                PositionalToken {
6157                    source: uws,
6158                    offset: 71,
6159                    length: 3,
6160                    token: Token::Word(Word::Word("熊".to_string())),
6161                },
6162                PositionalToken {
6163                    source: uws,
6164                    offset: 74,
6165                    length: 3,
6166                    token: Token::Word(Word::Word("野".to_string())),
6167                },
6168                PositionalToken {
6169                    source: uws,
6170                    offset: 77,
6171                    length: 3,
6172                    token: Token::Word(Word::Word("本".to_string())),
6173                },
6174                PositionalToken {
6175                    source: uws,
6176                    offset: 80,
6177                    length: 3,
6178                    token: Token::Word(Word::Word("宮".to_string())),
6179                },
6180                PositionalToken {
6181                    source: uws,
6182                    offset: 83,
6183                    length: 3,
6184                    token: Token::Special(Special::Punctuation('、')),
6185                },
6186                PositionalToken {
6187                    source: uws,
6188                    offset: 86,
6189                    length: 3,
6190                    token: Token::Word(Word::Word("熊".to_string())),
6191                },
6192                PositionalToken {
6193                    source: uws,
6194                    offset: 89,
6195                    length: 3,
6196                    token: Token::Word(Word::Word("野".to_string())),
6197                },
6198                PositionalToken {
6199                    source: uws,
6200                    offset: 92,
6201                    length: 3,
6202                    token: Token::Word(Word::Word("新".to_string())),
6203                },
6204                PositionalToken {
6205                    source: uws,
6206                    offset: 95,
6207                    length: 3,
6208                    token: Token::Word(Word::Word("宮".to_string())),
6209                },
6210                PositionalToken {
6211                    source: uws,
6212                    offset: 98,
6213                    length: 3,
6214                    token: Token::Special(Special::Punctuation('、')),
6215                },
6216                PositionalToken {
6217                    source: uws,
6218                    offset: 101,
6219                    length: 3,
6220                    token: Token::Word(Word::Word("熊".to_string())),
6221                },
6222                PositionalToken {
6223                    source: uws,
6224                    offset: 104,
6225                    length: 3,
6226                    token: Token::Word(Word::Word("野".to_string())),
6227                },
6228                PositionalToken {
6229                    source: uws,
6230                    offset: 107,
6231                    length: 3,
6232                    token: Token::Word(Word::Word("那".to_string())),
6233                },
6234                PositionalToken {
6235                    source: uws,
6236                    offset: 110,
6237                    length: 3,
6238                    token: Token::Word(Word::Word("智".to_string())),
6239                },
6240                PositionalToken {
6241                    source: uws,
6242                    offset: 113,
6243                    length: 3,
6244                    token: Token::Special(Special::Punctuation(')')),
6245                },
6246                PositionalToken {
6247                    source: uws,
6248                    offset: 116,
6249                    length: 3,
6250                    token: Token::Word(Word::Word("の".to_string())),
6251                },
6252                PositionalToken {
6253                    source: uws,
6254                    offset: 119,
6255                    length: 3,
6256                    token: Token::Word(Word::Word("造".to_string())),
6257                },
6258                PositionalToken {
6259                    source: uws,
6260                    offset: 122,
6261                    length: 3,
6262                    token: Token::Word(Word::Word("営".to_string())),
6263                },
6264                PositionalToken {
6265                    source: uws,
6266                    offset: 125,
6267                    length: 3,
6268                    token: Token::Special(Special::Punctuation('・')),
6269                },
6270                PositionalToken {
6271                    source: uws,
6272                    offset: 128,
6273                    length: 3,
6274                    token: Token::Word(Word::Word("修".to_string())),
6275                },
6276                PositionalToken {
6277                    source: uws,
6278                    offset: 131,
6279                    length: 3,
6280                    token: Token::Word(Word::Word("造".to_string())),
6281                },
6282                PositionalToken {
6283                    source: uws,
6284                    offset: 134,
6285                    length: 3,
6286                    token: Token::Word(Word::Word("の".to_string())),
6287                },
6288                PositionalToken {
6289                    source: uws,
6290                    offset: 137,
6291                    length: 3,
6292                    token: Token::Word(Word::Word("た".to_string())),
6293                },
6294                PositionalToken {
6295                    source: uws,
6296                    offset: 140,
6297                    length: 3,
6298                    token: Token::Word(Word::Word("め".to_string())),
6299                },
6300                PositionalToken {
6301                    source: uws,
6302                    offset: 143,
6303                    length: 3,
6304                    token: Token::Word(Word::Word("の".to_string())),
6305                },
6306                PositionalToken {
6307                    source: uws,
6308                    offset: 146,
6309                    length: 3,
6310                    token: Token::Word(Word::Word("勧".to_string())),
6311                },
6312                PositionalToken {
6313                    source: uws,
6314                    offset: 149,
6315                    length: 3,
6316                    token: Token::Word(Word::Word("進".to_string())),
6317                },
6318                PositionalToken {
6319                    source: uws,
6320                    offset: 152,
6321                    length: 3,
6322                    token: Token::Word(Word::Word("を".to_string())),
6323                },
6324                PositionalToken {
6325                    source: uws,
6326                    offset: 155,
6327                    length: 3,
6328                    token: Token::Word(Word::Word("担".to_string())),
6329                },
6330                PositionalToken {
6331                    source: uws,
6332                    offset: 158,
6333                    length: 3,
6334                    token: Token::Word(Word::Word("っ".to_string())),
6335                },
6336                PositionalToken {
6337                    source: uws,
6338                    offset: 161,
6339                    length: 3,
6340                    token: Token::Word(Word::Word("た".to_string())),
6341                },
6342                PositionalToken {
6343                    source: uws,
6344                    offset: 164,
6345                    length: 3,
6346                    token: Token::Word(Word::Word("組".to_string())),
6347                },
6348                PositionalToken {
6349                    source: uws,
6350                    offset: 167,
6351                    length: 3,
6352                    token: Token::Word(Word::Word("織".to_string())),
6353                },
6354                PositionalToken {
6355                    source: uws,
6356                    offset: 170,
6357                    length: 3,
6358                    token: Token::Word(Word::Word("の".to_string())),
6359                },
6360                PositionalToken {
6361                    source: uws,
6362                    offset: 173,
6363                    length: 3,
6364                    token: Token::Word(Word::Word("総".to_string())),
6365                },
6366                PositionalToken {
6367                    source: uws,
6368                    offset: 176,
6369                    length: 3,
6370                    token: Token::Word(Word::Word("称".to_string())),
6371                },
6372                PositionalToken {
6373                    source: uws,
6374                    offset: 179,
6375                    length: 3,
6376                    token: Token::Special(Special::Punctuation('。')),
6377                },
6378                PositionalToken {
6379                    source: uws,
6380                    offset: 182,
6381                    length: 1,
6382                    token: Token::Special(Special::Separator(Separator::Space)),
6383                },
6384                PositionalToken {
6385                    source: uws,
6386                    offset: 183,
6387                    length: 3,
6388                    token: Token::Word(Word::Word("熊".to_string())),
6389                },
6390                PositionalToken {
6391                    source: uws,
6392                    offset: 186,
6393                    length: 3,
6394                    token: Token::Word(Word::Word("野".to_string())),
6395                },
6396                PositionalToken {
6397                    source: uws,
6398                    offset: 189,
6399                    length: 3,
6400                    token: Token::Word(Word::Word("三".to_string())),
6401                },
6402                PositionalToken {
6403                    source: uws,
6404                    offset: 192,
6405                    length: 3,
6406                    token: Token::Word(Word::Word("山".to_string())),
6407                },
6408                PositionalToken {
6409                    source: uws,
6410                    offset: 195,
6411                    length: 3,
6412                    token: Token::Word(Word::Word("を".to_string())),
6413                },
6414                PositionalToken {
6415                    source: uws,
6416                    offset: 198,
6417                    length: 3,
6418                    token: Token::Word(Word::Word("含".to_string())),
6419                },
6420                PositionalToken {
6421                    source: uws,
6422                    offset: 201,
6423                    length: 3,
6424                    token: Token::Word(Word::Word("め".to_string())),
6425                },
6426                PositionalToken {
6427                    source: uws,
6428                    offset: 204,
6429                    length: 3,
6430                    token: Token::Word(Word::Word("て".to_string())),
6431                },
6432                PositionalToken {
6433                    source: uws,
6434                    offset: 207,
6435                    length: 3,
6436                    token: Token::Special(Special::Punctuation('、')),
6437                },
6438                PositionalToken {
6439                    source: uws,
6440                    offset: 210,
6441                    length: 3,
6442                    token: Token::Word(Word::Word("日".to_string())),
6443                },
6444                PositionalToken {
6445                    source: uws,
6446                    offset: 213,
6447                    length: 3,
6448                    token: Token::Word(Word::Word("本".to_string())),
6449                },
6450                PositionalToken {
6451                    source: uws,
6452                    offset: 216,
6453                    length: 3,
6454                    token: Token::Word(Word::Word("に".to_string())),
6455                },
6456                PositionalToken {
6457                    source: uws,
6458                    offset: 219,
6459                    length: 3,
6460                    token: Token::Word(Word::Word("お".to_string())),
6461                },
6462                PositionalToken {
6463                    source: uws,
6464                    offset: 222,
6465                    length: 3,
6466                    token: Token::Word(Word::Word("け".to_string())),
6467                },
6468                PositionalToken {
6469                    source: uws,
6470                    offset: 225,
6471                    length: 3,
6472                    token: Token::Word(Word::Word("る".to_string())),
6473                },
6474                PositionalToken {
6475                    source: uws,
6476                    offset: 228,
6477                    length: 3,
6478                    token: Token::Word(Word::Word("古".to_string())),
6479                },
6480                PositionalToken {
6481                    source: uws,
6482                    offset: 231,
6483                    length: 3,
6484                    token: Token::Word(Word::Word("代".to_string())),
6485                },
6486                PositionalToken {
6487                    source: uws,
6488                    offset: 234,
6489                    length: 3,
6490                    token: Token::Word(Word::Word("か".to_string())),
6491                },
6492                PositionalToken {
6493                    source: uws,
6494                    offset: 237,
6495                    length: 3,
6496                    token: Token::Word(Word::Word("ら".to_string())),
6497                },
6498                PositionalToken {
6499                    source: uws,
6500                    offset: 240,
6501                    length: 3,
6502                    token: Token::Word(Word::Word("中".to_string())),
6503                },
6504                PositionalToken {
6505                    source: uws,
6506                    offset: 243,
6507                    length: 3,
6508                    token: Token::Word(Word::Word("世".to_string())),
6509                },
6510                PositionalToken {
6511                    source: uws,
6512                    offset: 246,
6513                    length: 3,
6514                    token: Token::Word(Word::Word("前".to_string())),
6515                },
6516                PositionalToken {
6517                    source: uws,
6518                    offset: 249,
6519                    length: 3,
6520                    token: Token::Word(Word::Word("半".to_string())),
6521                },
6522                PositionalToken {
6523                    source: uws,
6524                    offset: 252,
6525                    length: 3,
6526                    token: Token::Word(Word::Word("に".to_string())),
6527                },
6528                PositionalToken {
6529                    source: uws,
6530                    offset: 255,
6531                    length: 3,
6532                    token: Token::Word(Word::Word("か".to_string())),
6533                },
6534                PositionalToken {
6535                    source: uws,
6536                    offset: 258,
6537                    length: 3,
6538                    token: Token::Word(Word::Word("け".to_string())),
6539                },
6540                PositionalToken {
6541                    source: uws,
6542                    offset: 261,
6543                    length: 3,
6544                    token: Token::Word(Word::Word("て".to_string())),
6545                },
6546                PositionalToken {
6547                    source: uws,
6548                    offset: 264,
6549                    length: 3,
6550                    token: Token::Word(Word::Word("の".to_string())),
6551                },
6552                PositionalToken {
6553                    source: uws,
6554                    offset: 267,
6555                    length: 3,
6556                    token: Token::Word(Word::Word("寺".to_string())),
6557                },
6558                PositionalToken {
6559                    source: uws,
6560                    offset: 270,
6561                    length: 3,
6562                    token: Token::Word(Word::Word("社".to_string())),
6563                },
6564                PositionalToken {
6565                    source: uws,
6566                    offset: 273,
6567                    length: 3,
6568                    token: Token::Word(Word::Word("の".to_string())),
6569                },
6570                PositionalToken {
6571                    source: uws,
6572                    offset: 276,
6573                    length: 3,
6574                    token: Token::Word(Word::Word("造".to_string())),
6575                },
6576                PositionalToken {
6577                    source: uws,
6578                    offset: 279,
6579                    length: 3,
6580                    token: Token::Word(Word::Word("営".to_string())),
6581                },
6582                PositionalToken {
6583                    source: uws,
6584                    offset: 282,
6585                    length: 3,
6586                    token: Token::Word(Word::Word("は".to_string())),
6587                },
6588                PositionalToken {
6589                    source: uws,
6590                    offset: 285,
6591                    length: 3,
6592                    token: Token::Special(Special::Punctuation('、')),
6593                },
6594                PositionalToken {
6595                    source: uws,
6596                    offset: 288,
6597                    length: 3,
6598                    token: Token::Word(Word::Word("寺".to_string())),
6599                },
6600                PositionalToken {
6601                    source: uws,
6602                    offset: 291,
6603                    length: 3,
6604                    token: Token::Word(Word::Word("社".to_string())),
6605                },
6606            ],
6607            Lang::Kor => vec![
6608                PositionalToken {
6609                    source: uws,
6610                    offset: 0,
6611                    length: 21,
6612                    token: Token::Word(Word::Word("플레이스테이션".to_string())),
6613                },
6614                PositionalToken {
6615                    source: uws,
6616                    offset: 21,
6617                    length: 1,
6618                    token: Token::Special(Special::Separator(Separator::Space)),
6619                },
6620                PositionalToken {
6621                    source: uws,
6622                    offset: 22,
6623                    length: 3,
6624                    token: Token::Word(Word::Word("은".to_string())),
6625                },
6626                PositionalToken {
6627                    source: uws,
6628                    offset: 25,
6629                    length: 1,
6630                    token: Token::Special(Special::Separator(Separator::Space)),
6631                },
6632                PositionalToken {
6633                    source: uws,
6634                    offset: 26,
6635                    length: 6,
6636                    token: Token::Word(Word::Word("소니".to_string())),
6637                },
6638                PositionalToken {
6639                    source: uws,
6640                    offset: 32,
6641                    length: 1,
6642                    token: Token::Special(Special::Separator(Separator::Space)),
6643                },
6644                PositionalToken {
6645                    source: uws,
6646                    offset: 33,
6647                    length: 9,
6648                    token: Token::Word(Word::Word("컴퓨터".to_string())),
6649                },
6650                PositionalToken {
6651                    source: uws,
6652                    offset: 42,
6653                    length: 1,
6654                    token: Token::Special(Special::Separator(Separator::Space)),
6655                },
6656                PositionalToken {
6657                    source: uws,
6658                    offset: 43,
6659                    length: 21,
6660                    token: Token::Word(Word::Word("엔터테인먼트가".to_string())),
6661                },
6662                PositionalToken {
6663                    source: uws,
6664                    offset: 64,
6665                    length: 1,
6666                    token: Token::Special(Special::Separator(Separator::Space)),
6667                },
6668                PositionalToken {
6669                    source: uws,
6670                    offset: 65,
6671                    length: 9,
6672                    token: Token::Word(Word::Word("개발한".to_string())),
6673                },
6674                PositionalToken {
6675                    source: uws,
6676                    offset: 74,
6677                    length: 1,
6678                    token: Token::Special(Special::Separator(Separator::Space)),
6679                },
6680                PositionalToken {
6681                    source: uws,
6682                    offset: 75,
6683                    length: 3,
6684                    token: Token::Word(Word::Word("세".to_string())),
6685                },
6686                PositionalToken {
6687                    source: uws,
6688                    offset: 78,
6689                    length: 1,
6690                    token: Token::Special(Special::Separator(Separator::Space)),
6691                },
6692                PositionalToken {
6693                    source: uws,
6694                    offset: 79,
6695                    length: 6,
6696                    token: Token::Word(Word::Word("번째".to_string())),
6697                },
6698                PositionalToken {
6699                    source: uws,
6700                    offset: 85,
6701                    length: 1,
6702                    token: Token::Special(Special::Separator(Separator::Space)),
6703                },
6704                PositionalToken {
6705                    source: uws,
6706                    offset: 86,
6707                    length: 9,
6708                    token: Token::Word(Word::Word("가정용".to_string())),
6709                },
6710                PositionalToken {
6711                    source: uws,
6712                    offset: 95,
6713                    length: 1,
6714                    token: Token::Special(Special::Separator(Separator::Space)),
6715                },
6716                PositionalToken {
6717                    source: uws,
6718                    offset: 96,
6719                    length: 15,
6720                    token: Token::Word(Word::Word("게임기이다".to_string())),
6721                },
6722                PositionalToken {
6723                    source: uws,
6724                    offset: 111,
6725                    length: 1,
6726                    token: Token::Special(Special::Punctuation('.')),
6727                },
6728                PositionalToken {
6729                    source: uws,
6730                    offset: 112,
6731                    length: 1,
6732                    token: Token::Special(Special::Separator(Separator::Space)),
6733                },
6734                PositionalToken {
6735                    source: uws,
6736                    offset: 113,
6737                    length: 24,
6738                    token: Token::Word(Word::Word("마이크로소프트의".to_string())),
6739                },
6740                PositionalToken {
6741                    source: uws,
6742                    offset: 137,
6743                    length: 1,
6744                    token: Token::Special(Special::Separator(Separator::Space)),
6745                },
6746                PositionalToken {
6747                    source: uws,
6748                    offset: 138,
6749                    length: 12,
6750                    token: Token::Word(Word::Word("엑스박스".to_string())),
6751                },
6752                PositionalToken {
6753                    source: uws,
6754                    offset: 150,
6755                    length: 1,
6756                    token: Token::Special(Special::Separator(Separator::Space)),
6757                },
6758                PositionalToken {
6759                    source: uws,
6760                    offset: 151,
6761                    length: 3,
6762                    token: Token::Word(Word::Number(Number::Integer(360))),
6763                },
6764                PositionalToken {
6765                    source: uws,
6766                    offset: 154,
6767                    length: 1,
6768                    token: Token::Special(Special::Punctuation(',')),
6769                },
6770                PositionalToken {
6771                    source: uws,
6772                    offset: 155,
6773                    length: 1,
6774                    token: Token::Special(Special::Separator(Separator::Space)),
6775                },
6776                PositionalToken {
6777                    source: uws,
6778                    offset: 156,
6779                    length: 12,
6780                    token: Token::Word(Word::Word("닌텐도의".to_string())),
6781                },
6782                PositionalToken {
6783                    source: uws,
6784                    offset: 168,
6785                    length: 1,
6786                    token: Token::Special(Special::Separator(Separator::Space)),
6787                },
6788                PositionalToken {
6789                    source: uws,
6790                    offset: 169,
6791                    length: 6,
6792                    token: Token::Word(Word::Word("Wii와".to_string())),
6793                },
6794                PositionalToken {
6795                    source: uws,
6796                    offset: 175,
6797                    length: 1,
6798                    token: Token::Special(Special::Separator(Separator::Space)),
6799                },
6800                PositionalToken {
6801                    source: uws,
6802                    offset: 176,
6803                    length: 12,
6804                    token: Token::Word(Word::Word("경쟁하고".to_string())),
6805                },
6806                PositionalToken {
6807                    source: uws,
6808                    offset: 188,
6809                    length: 1,
6810                    token: Token::Special(Special::Separator(Separator::Space)),
6811                },
6812                PositionalToken {
6813                    source: uws,
6814                    offset: 189,
6815                    length: 6,
6816                    token: Token::Word(Word::Word("있다".to_string())),
6817                },
6818                PositionalToken {
6819                    source: uws,
6820                    offset: 195,
6821                    length: 1,
6822                    token: Token::Special(Special::Punctuation('.')),
6823                },
6824                PositionalToken {
6825                    source: uws,
6826                    offset: 196,
6827                    length: 1,
6828                    token: Token::Special(Special::Separator(Separator::Space)),
6829                },
6830                PositionalToken {
6831                    source: uws,
6832                    offset: 197,
6833                    length: 6,
6834                    token: Token::Word(Word::Word("이전".to_string())),
6835                },
6836                PositionalToken {
6837                    source: uws,
6838                    offset: 203,
6839                    length: 1,
6840                    token: Token::Special(Special::Separator(Separator::Space)),
6841                },
6842                PositionalToken {
6843                    source: uws,
6844                    offset: 204,
6845                    length: 12,
6846                    token: Token::Word(Word::Word("제품에서".to_string())),
6847                },
6848                PositionalToken {
6849                    source: uws,
6850                    offset: 216,
6851                    length: 1,
6852                    token: Token::Special(Special::Separator(Separator::Space)),
6853                },
6854                PositionalToken {
6855                    source: uws,
6856                    offset: 217,
6857                    length: 9,
6858                    token: Token::Word(Word::Word("온라인".to_string())),
6859                },
6860                PositionalToken {
6861                    source: uws,
6862                    offset: 226,
6863                    length: 1,
6864                    token: Token::Special(Special::Separator(Separator::Space)),
6865                },
6866                PositionalToken {
6867                    source: uws,
6868                    offset: 227,
6869                    length: 9,
6870                    token: Token::Word(Word::Word("플레이".to_string())),
6871                },
6872                PositionalToken {
6873                    source: uws,
6874                    offset: 236,
6875                    length: 1,
6876                    token: Token::Special(Special::Separator(Separator::Space)),
6877                },
6878                PositionalToken {
6879                    source: uws,
6880                    offset: 237,
6881                    length: 3,
6882                    token: Token::Word(Word::Word("기".to_string())),
6883                },
6884            ],
6885            Lang::Ara => vec![
6886                PositionalToken {
6887                    source: uws,
6888                    offset: 0,
6889                    length: 14,
6890                    token: Token::Word(Word::Word("لشکرکشی".to_string())),
6891                },
6892                PositionalToken {
6893                    source: uws,
6894                    offset: 14,
6895                    length: 3,
6896                    token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6897                },
6898                PositionalToken {
6899                    source: uws,
6900                    offset: 17,
6901                    length: 6,
6902                    token: Token::Word(Word::Word("های".to_string())),
6903                },
6904                PositionalToken {
6905                    source: uws,
6906                    offset: 23,
6907                    length: 1,
6908                    token: Token::Special(Special::Separator(Separator::Space)),
6909                },
6910                PositionalToken {
6911                    source: uws,
6912                    offset: 24,
6913                    length: 6,
6914                    token: Token::Word(Word::Word("روس".to_string())),
6915                },
6916                PositionalToken {
6917                    source: uws,
6918                    offset: 30,
6919                    length: 3,
6920                    token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6921                },
6922                PositionalToken {
6923                    source: uws,
6924                    offset: 33,
6925                    length: 6,
6926                    token: Token::Word(Word::Word("های".to_string())),
6927                },
6928                PositionalToken {
6929                    source: uws,
6930                    offset: 39,
6931                    length: 1,
6932                    token: Token::Special(Special::Separator(Separator::Space)),
6933                },
6934                PositionalToken {
6935                    source: uws,
6936                    offset: 40,
6937                    length: 12,
6938                    token: Token::Word(Word::Word("وارنگی".to_string())),
6939                },
6940                PositionalToken {
6941                    source: uws,
6942                    offset: 52,
6943                    length: 1,
6944                    token: Token::Special(Special::Separator(Separator::Space)),
6945                },
6946                PositionalToken {
6947                    source: uws,
6948                    offset: 53,
6949                    length: 4,
6950                    token: Token::Word(Word::Word("به".to_string())),
6951                },
6952                PositionalToken {
6953                    source: uws,
6954                    offset: 57,
6955                    length: 1,
6956                    token: Token::Special(Special::Separator(Separator::Space)),
6957                },
6958                PositionalToken {
6959                    source: uws,
6960                    offset: 58,
6961                    length: 10,
6962                    token: Token::Word(Word::Word("دریای".to_string())),
6963                },
6964                PositionalToken {
6965                    source: uws,
6966                    offset: 68,
6967                    length: 1,
6968                    token: Token::Special(Special::Separator(Separator::Space)),
6969                },
6970                PositionalToken {
6971                    source: uws,
6972                    offset: 69,
6973                    length: 6,
6974                    token: Token::Word(Word::Word("خزر".to_string())),
6975                },
6976                PositionalToken {
6977                    source: uws,
6978                    offset: 75,
6979                    length: 1,
6980                    token: Token::Special(Special::Separator(Separator::Space)),
6981                },
6982                PositionalToken {
6983                    source: uws,
6984                    offset: 76,
6985                    length: 12,
6986                    token: Token::Word(Word::Word("مجموعه".to_string())),
6987                },
6988                PositionalToken {
6989                    source: uws,
6990                    offset: 88,
6991                    length: 3,
6992                    token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6993                },
6994                PositionalToken {
6995                    source: uws,
6996                    offset: 91,
6997                    length: 4,
6998                    token: Token::Word(Word::Word("ای".to_string())),
6999                },
7000                PositionalToken {
7001                    source: uws,
7002                    offset: 95,
7003                    length: 1,
7004                    token: Token::Special(Special::Separator(Separator::Space)),
7005                },
7006                PositionalToken {
7007                    source: uws,
7008                    offset: 96,
7009                    length: 4,
7010                    token: Token::Word(Word::Word("از".to_string())),
7011                },
7012                PositionalToken {
7013                    source: uws,
7014                    offset: 100,
7015                    length: 1,
7016                    token: Token::Special(Special::Separator(Separator::Space)),
7017                },
7018                PositionalToken {
7019                    source: uws,
7020                    offset: 101,
7021                    length: 10,
7022                    token: Token::Word(Word::Word("حملات".to_string())),
7023                },
7024                PositionalToken {
7025                    source: uws,
7026                    offset: 111,
7027                    length: 1,
7028                    token: Token::Special(Special::Separator(Separator::Space)),
7029                },
7030                PositionalToken {
7031                    source: uws,
7032                    offset: 112,
7033                    length: 10,
7034                    token: Token::Word(Word::Word("نظامی".to_string())),
7035                },
7036                PositionalToken {
7037                    source: uws,
7038                    offset: 122,
7039                    length: 1,
7040                    token: Token::Special(Special::Separator(Separator::Space)),
7041                },
7042                PositionalToken {
7043                    source: uws,
7044                    offset: 123,
7045                    length: 4,
7046                    token: Token::Word(Word::Word("در".to_string())),
7047                },
7048                PositionalToken {
7049                    source: uws,
7050                    offset: 127,
7051                    length: 1,
7052                    token: Token::Special(Special::Separator(Separator::Space)),
7053                },
7054                PositionalToken {
7055                    source: uws,
7056                    offset: 128,
7057                    length: 6,
7058                    token: Token::Word(Word::Word("بین".to_string())),
7059                },
7060                PositionalToken {
7061                    source: uws,
7062                    offset: 134,
7063                    length: 1,
7064                    token: Token::Special(Special::Separator(Separator::Space)),
7065                },
7066                PositionalToken {
7067                    source: uws,
7068                    offset: 135,
7069                    length: 6,
7070                    token: Token::Word(Word::Word("سال".to_string())),
7071                },
7072                PositionalToken {
7073                    source: uws,
7074                    offset: 141,
7075                    length: 3,
7076                    token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
7077                },
7078                PositionalToken {
7079                    source: uws,
7080                    offset: 144,
7081                    length: 6,
7082                    token: Token::Word(Word::Word("های".to_string())),
7083                },
7084                PositionalToken {
7085                    source: uws,
7086                    offset: 150,
7087                    length: 1,
7088                    token: Token::Special(Special::Separator(Separator::Space)),
7089                },
7090                PositionalToken {
7091                    source: uws,
7092                    offset: 151,
7093                    length: 6,
7094                    token: Token::Word(Word::StrangeWord("۸۶۴".to_string())),
7095                },
7096                PositionalToken {
7097                    source: uws,
7098                    offset: 157,
7099                    length: 1,
7100                    token: Token::Special(Special::Separator(Separator::Space)),
7101                },
7102                PositionalToken {
7103                    source: uws,
7104                    offset: 158,
7105                    length: 4,
7106                    token: Token::Word(Word::Word("تا".to_string())),
7107                },
7108                PositionalToken {
7109                    source: uws,
7110                    offset: 162,
7111                    length: 1,
7112                    token: Token::Special(Special::Separator(Separator::Space)),
7113                },
7114                PositionalToken {
7115                    source: uws,
7116                    offset: 163,
7117                    length: 8,
7118                    token: Token::Word(Word::StrangeWord("۱۰۴۱".to_string())),
7119                },
7120                PositionalToken {
7121                    source: uws,
7122                    offset: 171,
7123                    length: 1,
7124                    token: Token::Special(Special::Separator(Separator::Space)),
7125                },
7126                PositionalToken {
7127                    source: uws,
7128                    offset: 172,
7129                    length: 12,
7130                    token: Token::Word(Word::Word("میلادی".to_string())),
7131                },
7132                PositionalToken {
7133                    source: uws,
7134                    offset: 184,
7135                    length: 1,
7136                    token: Token::Special(Special::Separator(Separator::Space)),
7137                },
7138                PositionalToken {
7139                    source: uws,
7140                    offset: 185,
7141                    length: 2,
7142                    token: Token::Word(Word::Word("ب".to_string())),
7143                },
7144            ],
7145            Lang::Ell => vec![
7146                PositionalToken {
7147                    source: uws,
7148                    offset: 0,
7149                    length: 4,
7150                    token: Token::Word(Word::Word("Το".to_string())),
7151                },
7152                PositionalToken {
7153                    source: uws,
7154                    offset: 4,
7155                    length: 1,
7156                    token: Token::Special(Special::Separator(Separator::Space)),
7157                },
7158                PositionalToken {
7159                    source: uws,
7160                    offset: 5,
7161                    length: 18,
7162                    token: Token::Word(Word::Word("Πρόγραμμα".to_string())),
7163                },
7164                PositionalToken {
7165                    source: uws,
7166                    offset: 23,
7167                    length: 1,
7168                    token: Token::Special(Special::Separator(Separator::Space)),
7169                },
7170                PositionalToken {
7171                    source: uws,
7172                    offset: 24,
7173                    length: 22,
7174                    token: Token::Word(Word::Word("υλοποιείται".to_string())),
7175                },
7176                PositionalToken {
7177                    source: uws,
7178                    offset: 46,
7179                    length: 1,
7180                    token: Token::Special(Special::Separator(Separator::Space)),
7181                },
7182                PositionalToken {
7183                    source: uws,
7184                    offset: 47,
7185                    length: 4,
7186                    token: Token::Word(Word::Word("εξ".to_string())),
7187                },
7188                PositionalToken {
7189                    source: uws,
7190                    offset: 51,
7191                    length: 1,
7192                    token: Token::Special(Special::Separator(Separator::Space)),
7193                },
7194                PositionalToken {
7195                    source: uws,
7196                    offset: 52,
7197                    length: 18,
7198                    token: Token::Word(Word::Word("ολοκλήρου".to_string())),
7199                },
7200                PositionalToken {
7201                    source: uws,
7202                    offset: 70,
7203                    length: 1,
7204                    token: Token::Special(Special::Separator(Separator::Space)),
7205                },
7206                PositionalToken {
7207                    source: uws,
7208                    offset: 71,
7209                    length: 6,
7210                    token: Token::Word(Word::Word("από".to_string())),
7211                },
7212                PositionalToken {
7213                    source: uws,
7214                    offset: 77,
7215                    length: 1,
7216                    token: Token::Special(Special::Separator(Separator::Space)),
7217                },
7218                PositionalToken {
7219                    source: uws,
7220                    offset: 78,
7221                    length: 16,
7222                    token: Token::Word(Word::Word("απόσταση".to_string())),
7223                },
7224                PositionalToken {
7225                    source: uws,
7226                    offset: 94,
7227                    length: 1,
7228                    token: Token::Special(Special::Separator(Separator::Space)),
7229                },
7230                PositionalToken {
7231                    source: uws,
7232                    offset: 95,
7233                    length: 6,
7234                    token: Token::Word(Word::Word("και".to_string())),
7235                },
7236                PositionalToken {
7237                    source: uws,
7238                    offset: 101,
7239                    length: 1,
7240                    token: Token::Special(Special::Separator(Separator::Space)),
7241                },
7242                PositionalToken {
7243                    source: uws,
7244                    offset: 102,
7245                    length: 12,
7246                    token: Token::Word(Word::Word("μπορεί".to_string())),
7247                },
7248                PositionalToken {
7249                    source: uws,
7250                    offset: 114,
7251                    length: 1,
7252                    token: Token::Special(Special::Separator(Separator::Space)),
7253                },
7254                PositionalToken {
7255                    source: uws,
7256                    offset: 115,
7257                    length: 4,
7258                    token: Token::Word(Word::Word("να".to_string())),
7259                },
7260                PositionalToken {
7261                    source: uws,
7262                    offset: 119,
7263                    length: 1,
7264                    token: Token::Special(Special::Separator(Separator::Space)),
7265                },
7266                PositionalToken {
7267                    source: uws,
7268                    offset: 120,
7269                    length: 20,
7270                    token: Token::Word(Word::Word("συμμετέχει".to_string())),
7271                },
7272                PositionalToken {
7273                    source: uws,
7274                    offset: 140,
7275                    length: 1,
7276                    token: Token::Special(Special::Separator(Separator::Space)),
7277                },
7278                PositionalToken {
7279                    source: uws,
7280                    offset: 141,
7281                    length: 8,
7282                    token: Token::Word(Word::Word("κάθε".to_string())),
7283                },
7284                PositionalToken {
7285                    source: uws,
7286                    offset: 149,
7287                    length: 1,
7288                    token: Token::Special(Special::Separator(Separator::Space)),
7289                },
7290                PositionalToken {
7291                    source: uws,
7292                    offset: 150,
7293                    length: 24,
7294                    token: Token::Word(Word::Word("εμπλεκόμενος".to_string())),
7295                },
7296                PositionalToken {
7297                    source: uws,
7298                    offset: 174,
7299                    length: 1,
7300                    token: Token::Special(Special::Separator(Separator::Space)),
7301                },
7302                PositionalToken {
7303                    source: uws,
7304                    offset: 175,
7305                    length: 6,
7306                    token: Token::Word(Word::Word("στη".to_string())),
7307                },
7308                PositionalToken {
7309                    source: uws,
7310                    offset: 181,
7311                    length: 1,
7312                    token: Token::Special(Special::Separator(Separator::Space)),
7313                },
7314                PositionalToken {
7315                    source: uws,
7316                    offset: 182,
7317                    length: 2,
7318                    token: Token::Word(Word::Word("ή".to_string())),
7319                },
7320                PositionalToken {
7321                    source: uws,
7322                    offset: 184,
7323                    length: 1,
7324                    token: Token::Special(Special::Punctuation('/')),
7325                },
7326            ],
7327        };
7328        (
7329            uws.chars()
7330                .take(100)
7331                .fold(String::new(), |acc, c| acc + &format!("{}", c)),
7332            tokens,
7333        )
7334    }
7335}