1use std::{fmt, sync::Arc};
2use text_parsing::{Breaker, IntoSource, Local, Localize, Snip, Source, SourceEvent};
3
4mod emoji;
5pub use emoji::EMOJIMAP;
6
7mod breakers;
8pub use breakers::{SentenceBreaker, UnicodeSentenceBreaker};
9
10mod numbers;
11pub use numbers::NumberNotation;
12
13mod wordbreaker;
14
15mod options;
16pub use options::{IntoTokenizer, TokenizerOptions, TokenizerParams};
17
18mod tokens;
19pub use tokens::Tokens;
20
21mod text_tokens;
22use text_tokens::InnerBound;
23pub use text_tokens::TextTokens;
24
25#[cfg(test)]
26mod test {
27    mod numbers_ru_en;
28}
29
/// Crate-level error type.
#[derive(Debug)]
pub enum Error {
    /// Wraps an error bubbled up from the `text_parsing` crate.
    TextParser(text_parsing::Error),
}
34
35pub const EPS: f64 = 1e-8;
36
/// A parsed numeric token value (`strings` feature: keeps source text where needed).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd)]
pub enum Number {
    /// Plain signed integer.
    Integer(i64),
    /// Floating-point value.
    Float(f64),
    /// Integer whose source spelling `s` is preserved — presumably for
    /// leading-zero forms; TODO confirm against the `numbers` module.
    ZeroInteger { i: i64, s: String },
}
45
/// A parsed numeric token value (no `strings` feature: no source text kept).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
pub enum Number {
    /// Plain signed integer.
    Integer(i64),
    /// Floating-point value.
    Float(f64),
    /// Integer that had a distinguished spelling in the source
    /// (see the `strings` variant, which also keeps the text).
    ZeroInteger { i: i64 },
}
53
54impl Number {
55    pub fn as_f64(&self) -> f64 {
56        match self {
57            Number::Integer(i) => *i as f64,
58            Number::Float(f) => *f,
59            Number::ZeroInteger { i, .. } => *i as f64,
60        }
61    }
62}
63impl Ord for Number {
64    fn cmp(&self, other: &Number) -> std::cmp::Ordering {
65        let s = self.as_f64();
66        let o = other.as_f64();
67        let d = s - o;
68        match d.abs() < EPS {
69            true => std::cmp::Ordering::Equal,
70            false => {
71                if d > 0.0 {
72                    return std::cmp::Ordering::Greater;
73                }
74                if d < 0.0 {
75                    return std::cmp::Ordering::Less;
76                }
77                std::cmp::Ordering::Equal
78            }
79        }
80    }
81}
82impl Eq for Number {}
83
/// Whitespace-like separator tokens.
///
/// Variant order is significant: the derived `Ord` follows declaration
/// order (`Space < Tab < Newline < Char(_)`).
#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
pub enum Separator {
    /// A run-of-the-mill space character.
    Space,
    /// Tab separator.
    Tab,
    /// Line break separator.
    Newline,
    /// Any other separator character, kept verbatim.
    Char(char),
}
91
/// Unicode formatting code points carried through tokenization.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
pub enum Formatter {
    /// A formatting character, kept verbatim.
    Char(char),
    /// Joiner control — presumably U+200D zero-width joiner; confirm.
    Joiner,
}
97
/// Single-character tokens with a special role.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Ord, PartialOrd)]
pub enum Special {
    /// Currency sign, e.g. '$' or '₽'; the character is kept.
    Currency(char),
    /// Punctuation mark, e.g. '(' or '%'.
    Punctuation(char),
    /// Other symbol character.
    Symbol(char),
    /// Whitespace-like separator (see `Separator`).
    Separator(Separator),
}
105
/// Word-level token payloads; with the `strings` feature the matched
/// text is carried inline.
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Word {
    /// A regular word.
    Word(String),
    /// A word the tokenizer considered unusual — TODO confirm the exact
    /// criteria in the `wordbreaker` module.
    StrangeWord(String),
    /// Mixed numeric/alphabetic form (see `Numerical`).
    Numerical(Numerical),
    /// A parsed number.
    Number(Number),
    /// Recognized emoji, as a static name — presumably keyed via `EMOJIMAP`.
    Emoji(&'static str),
}
115
/// Sub-kinds of numeric-looking words (`strings`: text kept inline).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub enum Numerical {
    /// Dot-separated digit groups — presumably version-like "1.2.3"; confirm.
    DotSeparated(String),
    /// Number fused with a measure/unit suffix — TODO confirm semantics.
    Measures(String),
    /// Mixed letters and digits.
    Alphanumeric(String),
}
126
/// Social-media structural tokens (`strings`: text kept inline).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Struct {
    /// Hashtag token — whether the payload includes the '#' marker is
    /// not visible here; TODO confirm.
    Hashtag(String),
    /// Mention token (e.g. "@user") — same caveat about the marker.
    Mention(String),
    }
134
/// Tokens for unclassified unicode runs and formatting characters.
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Unicode {
    /// An otherwise-unclassified unicode run, kept as text.
    String(String),
    /// A formatting code point (see `Formatter`).
    Formatter(Formatter),
}
141
/// Word-level token payloads; without the `strings` feature no matched
/// text is stored, only the classification.
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Word {
    /// A regular word.
    Word,
    /// A word the tokenizer considered unusual.
    StrangeWord,
    /// Mixed numeric/alphabetic form (see `Numerical`).
    Numerical(Numerical),
    /// A parsed number (values are still kept).
    Number(Number),
    /// Recognized emoji, as a static name — presumably keyed via `EMOJIMAP`.
    Emoji(&'static str),
}
151
/// Sub-kinds of numeric-looking words (no `strings`: classification only).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Numerical {
    /// Dot-separated digit groups — presumably version-like "1.2.3"; confirm.
    DotSeparated,
    /// Number fused with a measure/unit suffix — TODO confirm semantics.
    Measures,
    /// Mixed letters and digits.
    Alphanumeric,
}
162
/// Social-media structural tokens (no `strings`: classification only).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Struct {
    /// Hashtag token.
    Hashtag,
    /// Mention token.
    Mention,
    }
170
/// Tokens for unclassified unicode runs and formatting characters
/// (no `strings`: the run's text is not stored).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Unicode {
    /// An otherwise-unclassified unicode run.
    String,
    /// A formatting code point (see `Formatter`).
    Formatter(Formatter),
}
177
/// Public token type produced by the tokenizer. `Token2` is the variant
/// that additionally carries `Bound` markers.
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),
}
186
/// Public token type produced by the tokenizer (no `strings`: `Copy`).
/// `Token2` is the variant that additionally carries `Bound` markers.
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),
}
195
/// Analysis of a borrowed `&str`: like `Text`, but the buffer is the
/// original input slice itself rather than an owned, normalized copy.
#[derive(Debug)]
pub struct TextStr<'s> {
    // The original input slice, reused as the buffer.
    buffer: &'s str,
    // Per-char (buffer position, original position) pairs, Arc-shared.
    localities: Arc<Vec<TextLocality>>,
    // Word/sentence/paragraph/section bounds recorded during parsing.
    breakers: Arc<Vec<InnerBound>>,
}
215impl<'s> TextStr<'s> {
216    pub fn new<'a>(s: &'a str) -> Result<TextStr<'a>, Error> {
217        let text = inner_new(s.into_source(), false)?;
218        Ok(TextStr {
219            buffer: s,
220            localities: text.localities,
221            breakers: text.breakers,
222        })
223    }
224}
225
/// Shared worker behind `Text::new`, `TextStr::new` and the `TryFrom`
/// impls: streams `source` char by char, normalizes each event into a
/// single `char`, and records for every emitted char both its position
/// in the normalized buffer and its position in the original input.
///
/// When `with_buffer` is `false` the normalized string itself is not
/// collected (callers supply their own buffer); byte lengths and
/// localities are tracked either way.
fn inner_new<S: Source>(mut source: S, with_buffer: bool) -> Result<Text, Error> {
    let mut buffer = String::new();
    let mut localities = Vec::new();
    let mut breakers = Vec::new();
    // Running byte length of the normalized buffer.
    let mut buffer_len = 0;

    while let Some(local_se) = source.next_char().map_err(Error::TextParser)? {
        let (local, se) = local_se.into_inner();
        let c = match se {
            SourceEvent::Char(c) => match c {
                // Normalize backtick (U+0060) to apostrophe (U+0027).
                '\u{0060}' => '\u{0027}',
                _ => c,
            },
            SourceEvent::Breaker(b) => {
                // Map each breaker onto a placeholder char; `Some(b)`
                // marks the ones also recorded as an `InnerBound`
                // (Word → zero-width space; Sentence/Paragraph/Section
                // → newline). `Breaker::None` produces nothing at all.
                let (c, opt_b) = match b {
                    Breaker::None => continue,
                    Breaker::Space => (' ', None),
                    Breaker::Line => ('\n', None),
                    Breaker::Word => ('\u{200B}', Some(b)), Breaker::Sentence | Breaker::Paragraph | Breaker::Section => ('\n', Some(b)),
                };
                if let Some(b) = opt_b {
                    // Bound spans exactly the placeholder char, in both
                    // byte and char coordinates of the buffer.
                    let br = InnerBound {
                        bytes: Snip {
                            offset: buffer_len,
                            length: c.len_utf8(),
                        },
                        chars: Snip {
                            offset: localities.len(),
                            length: 1,
                        },
                        breaker: b,
                        original: Some(local),
                    };
                    breakers.push(br);
                }
                c
            }
        };

        // Position of `c` inside the normalized buffer (chars, then bytes).
        let buf_local = ().localize(
            Snip {
                offset: localities.len(),
                length: 1,
            },
            Snip {
                offset: buffer_len,
                length: c.len_utf8(),
            },
        );
        if with_buffer {
            buffer.push(c);
        }
        buffer_len += c.len_utf8();
        localities.push(TextLocality {
            buffer: buf_local,
            original: local,
        });
    }
    Ok(Text {
        buffer: Arc::new(buffer),
        localities: Arc::new(localities),
        breakers: Arc::new(breakers),
    })
}
294
/// Position of a single char, both in the normalized buffer and in the
/// original source.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub struct TextLocality {
    /// Location within the normalized buffer (char + byte snips).
    pub buffer: Local<()>,
    /// Location within the original input.
    pub original: Local<()>,
}
300
/// Parsed text: normalized buffer plus per-char localities and breaker
/// bounds. All fields are `Arc`-shared, so `shared_text` clones are cheap.
#[derive(Debug)]
pub struct Text {
    // Normalized text (may be empty when built with `with_buffer = false`).
    buffer: Arc<String>,
    // Per-char (buffer position, original position) pairs.
    localities: Arc<Vec<TextLocality>>,
    // Word/sentence/paragraph/section bounds recorded during parsing.
    breakers: Arc<Vec<InnerBound>>,
}
307impl Text {
308    pub fn new<S: Source>(source: S) -> Result<Text, Error> {
309        inner_new(source, true)
310    }
311    pub fn token_text<'s>(&'s self, token: &TextToken) -> &'s str {
312        let Snip {
313            offset: begin,
314            length: len,
315        } = token.locality.bytes();
316        let end = begin + len;
317        &self.buffer[begin..end]
318    }
319    pub fn text(&self) -> &str {
320        self.buffer.as_ref()
321    }
322    pub fn original_locality(&self, idx: usize) -> Option<Local<()>> {
323        self.localities.get(idx).map(|tl| tl.original)
324    }
325    pub fn localities(&self) -> &Vec<TextLocality> {
326        self.localities.as_ref()
327    }
328    pub fn shared_text(&self) -> Text {
329        Text {
330            buffer: self.buffer.clone(),
331            localities: self.localities.clone(),
332            breakers: self.breakers.clone(),
333        }
334    }
335}
336
337impl TryFrom<String> for Text {
338    type Error = Error;
339
340    fn try_from(s: String) -> Result<Text, Error> {
341        let mut text = inner_new((&s).into_source(), false)?;
342        text.buffer = Arc::new(s);
343        Ok(text)
344    }
345}
346
347impl TryFrom<&str> for Text {
348    type Error = Error;
349
350    fn try_from(s: &str) -> Result<Text, Error> {
351        Text::new(s.into_source())
352    }
353}
354
/// Sentence-level boundary markers emitted between tokens.
///
/// Variant order fixes the derived `Ord`:
/// `Sentence < Paragraph < Section`.
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Bound {
    Sentence,
    Paragraph,
    Section,
}
361
/// A token plus where it sits in the normalized buffer (`locality`)
/// and, when known, in the original input (`original`).
#[cfg(feature = "strings")]
#[derive(Clone, PartialEq, PartialOrd, Eq, Ord)]
pub struct TextToken {
    // Span in the normalized buffer.
    locality: Local<()>,
    // Span in the original source, when one exists.
    original: Option<Local<()>>,
    pub token: Token2,
}
369
/// A token plus where it sits in the normalized buffer (`locality`)
/// and, when known, in the original input (`original`).
/// Without the `strings` feature the whole token is `Copy`.
#[cfg(not(feature = "strings"))]
#[derive(Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub struct TextToken {
    // Span in the normalized buffer.
    locality: Local<()>,
    // Span in the original source, when one exists.
    original: Option<Local<()>>,
    pub token: Token2,
}
377
378impl fmt::Debug for TextToken {
379    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
380        write!(
381            f,
382            "TextToken {{ local: {:?} [{:?}] }}, ",
383            self.locality.bytes(),
384            self.locality.chars()
385        )?;
386        match &self.original {
387            Some(orig) => write!(f, "orig: {:?} [{:?}], ", orig.bytes(), orig.chars())?,
388            None => {}
389        }
390        write!(f, "token: {:?} }}", self.token)
391    }
392}
393
#[cfg(test)]
impl TextToken {
    /// Pairs the original span with the plain token; `None` when either
    /// there is no original span or the token is a bound marker.
    fn into_original_token_1(self) -> Option<Local<Token>> {
        let original = self.original?;
        self.token.into_token().map(|t| original.local(t))
    }
}
403
impl TextToken {
    /// Span of this token in the normalized buffer.
    pub fn local(&self) -> Local<()> {
        self.locality
    }
    /// Span of this token in the original input, when one exists.
    pub fn original(&self) -> Option<Local<()>> {
        self.original
    }
    /// Converts both localities via `Local::into_position`.
    pub fn into_position(mut self) -> TextToken {
        self.locality = self.locality.into_position();
        self.original = self.original.map(|or| or.into_position());
        self
    }
    /// Plain token on `Ok`, or the bound marker on `Err` (see `Token2`).
    pub fn try_as_token(&self) -> Result<Token, Bound> {
        self.token.try_as_token()
    }
    /// Borrowed token tagged with the original span, if any.
    pub fn as_original_token(&self) -> Option<Local<&Token2>> {
        self.original.map(|original| original.local(&self.token))
    }
    /// Owned token tagged with the original span, if any.
    pub fn into_original_token(self) -> Option<Local<Token2>> {
        self.original.map(|original| original.local(self.token))
    }
    /// Slices the token's text out of the ORIGINAL input string.
    /// Fails when the token has no original span, or when the span does
    /// not fall on char boundaries of `original` (e.g. a different string
    /// was passed than the one this token was produced from).
    pub fn original_str<'s>(&self, original: &'s str) -> Result<&'s str, OriginalError> {
        match self.original {
            Some(local) => {
                let Snip {
                    offset: begin,
                    length: len,
                } = local.bytes();
                let end = begin + len;
                match original.get(begin..end) {
                    Some(s) => Ok(s),
                    None => Err(OriginalError::InvalidSnip),
                }
            }
            None => Err(OriginalError::NoOriginal),
        }
    }

    // Cheap copy of the payload; with `strings` the token owns Strings,
    // so a real clone is needed.
    #[cfg(feature = "strings")]
    fn token_clone(&self) -> Token2 {
        self.token.clone()
    }

    // Without `strings`, `Token2` is `Copy`.
    #[cfg(not(feature = "strings"))]
    fn token_clone(&self) -> Token2 {
        self.token
    }

    /// Merges two adjacent tokens into one spanning both, with payload
    /// `new_token` (or a copy of `self.token` when `None`).
    ///
    /// `ok` tracks whether `self` and `other` are consistently ordered:
    /// the byte order of the buffer localities is taken as the reference,
    /// and the char order plus (when both exist) the byte/char order of
    /// the original localities must all agree with it. On disagreement
    /// the merged token is still built but returned as `Err`.
    pub fn merge_tokens(
        &self,
        other: &TextToken,
        new_token: Option<Token2>,
    ) -> Result<TextToken, TextToken> {
        let (local, left_lb, left_lc) = add_local(&self.locality, &other.locality);
        let must_be_left = left_lb;
        let mut ok = must_be_left == left_lc;
        let orig = match (&self.original, &other.original) {
            (None, None) => None,
            (Some(o), None) | (None, Some(o)) => Some(*o),
            (Some(s), Some(o)) => {
                let (orig, lb, lc) = add_local(s, o);
                ok &= must_be_left == lb;
                ok &= must_be_left == lc;
                Some(orig)
            }
        };
        let token = TextToken {
            locality: local,
            original: orig,
            token: match new_token {
                Some(t) => t,
                None => self.token_clone(),
            },
        };
        match ok {
            true => Ok(token),
            false => Err(token),
        }
    }
}
484
485fn add_local(slf: &Local<()>, other: &Local<()>) -> (Local<()>, bool, bool) {
486    let b1 = slf.bytes();
488    let b2 = other.bytes();
489    let c1 = slf.chars();
490    let c2 = other.chars();
491    let (bytes, slf_is_left_by_bytes) = match b1.offset < b2.offset {
492        true => (
493            Snip {
494                offset: b1.offset,
495                length: (b2.offset + b2.length) - b1.offset,
496            },
497            true,
498        ),
499        false => (
500            Snip {
501                offset: b2.offset,
502                length: (b1.offset + b1.length) - b2.offset,
503            },
504            false,
505        ),
506    };
507    let (chars, slf_is_left_by_chars) = match c1.offset < c2.offset {
508        true => (
509            Snip {
510                offset: c1.offset,
511                length: (c2.offset + c2.length) - c1.offset,
512            },
513            true,
514        ),
515        false => (
516            Snip {
517                offset: c2.offset,
518                length: (c1.offset + c1.length) - c2.offset,
519            },
520            false,
521        ),
522    };
523    (
524        ().localize(chars, bytes),
525        slf_is_left_by_bytes,
526        slf_is_left_by_chars,
527    )
528}
529
530impl TextToken {
531    pub fn test_token(lt: Local<Token2>) -> TextToken {
532        let (local, token) = lt.into_inner();
533        TextToken {
534            locality: local,
535            original: Some(local.local(())),
536            token,
537        }
538    }
539    pub fn test_new(token: Token2, local: Local<()>, original: Option<Local<()>>) -> TextToken {
540        TextToken {
541            locality: local,
542            original,
543            token,
544        }
545    }
546}
547
/// Failure modes of `TextToken::original_str`.
#[derive(Debug)]
pub enum OriginalError {
    /// The token carries no original locality.
    NoOriginal,
    /// The original snip does not address a valid slice of the given string.
    InvalidSnip,
}
579
/// Tokenizer output item: either a plain `Token` payload or a `Bound`
/// marker between sentences/paragraphs/sections.
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token2 {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),

    /// Sentence-level boundary, not a text-bearing token.
    Bound(Bound),
}
/// Tokenizer output item (no `strings`: `Copy`): either a plain `Token`
/// payload or a `Bound` marker between sentences/paragraphs/sections.
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token2 {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),

    /// Sentence-level boundary, not a text-bearing token.
    Bound(Bound),
}
607impl From<Token> for Token2 {
608    fn from(t: Token) -> Token2 {
609        match t {
610            Token::Word(w) => Token2::Word(w),
611            Token::Struct(s) => Token2::Struct(s),
612            Token::Special(s) => Token2::Special(s),
613            Token::Unicode(u) => Token2::Unicode(u),
614        }
615    }
616}
617impl Token2 {
618    #[cfg(not(feature = "strings"))]
619    fn try_as_token(&self) -> Result<Token, Bound> {
620        (*self).try_into_token()
621    }
622
623    #[cfg(feature = "strings")]
624    fn try_as_token(&self) -> Result<Token, Bound> {
625        self.clone().try_into_token()
626    }
627
628    fn try_into_token(self) -> Result<Token, Bound> {
629        match self {
630            Token2::Word(w) => Ok(Token::Word(w)),
631            Token2::Struct(s) => Ok(Token::Struct(s)),
632            Token2::Special(s) => Ok(Token::Special(s)),
633            Token2::Unicode(u) => Ok(Token::Unicode(u)),
634            Token2::Bound(b) => Err(b),
635        }
636    }
637}
#[cfg(test)]
impl Token2 {
    /// Like `try_into_token`, but silently drops bound markers instead
    /// of reporting them.
    fn into_token(self) -> Option<Token> {
        self.try_into_token().ok()
    }
}
650
#[cfg(test)]
#[cfg(not(feature = "strings"))]
mod test_no_strings {
    use super::*;
    use text_parsing::{
        IntoPipeParser, IntoSource, Localize, ParserExt, SourceExt, entities, tagger,
    };

    /// Compares a hand-built token list against the library output,
    /// element by element, after asserting equal lengths.
    fn check_results(result: &Vec<Local<Token>>, lib_res: &Vec<Local<Token>>, _uws: &str) {
        assert_eq!(result.len(), lib_res.len());
        for i in 0..result.len() {
            let res: Local<Token> = result[i].clone().into();
            assert_eq!(res, lib_res[i]);
        }
    }

    /// Debug harness: prints the tokenization of a sample string, then
    /// panics so the output is visible.
    /// NOTE(review): not annotated with `#[test]`, so it never runs —
    /// confirm whether that is intentional.
    fn symbols() {
        let uws = "Сибирь Арене 17 30 от 2560₽ 😀";
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        for t in lib_res {
            println!("{:?}", t);
        }
        panic!()
    }
}
682
#[cfg(test)]
mod test_v0_5 {
    use super::*;
    use text_parsing::{IntoPipeParser, IntoSource, ParserExt, SourceExt, entities, tagger};

    /// Debug harness: runs the full pipeline (HTML tag breaking + entity
    /// decoding) over a mixed EN/RU sample, prints each token with its
    /// char/byte spans and original slice, then panics so the output is
    /// visible.
    /// NOTE(review): not annotated with `#[test]`, so it never runs —
    /// confirm whether that is intentional.
    fn basic() {
        let uws = "<p>Oxana Putan shared the quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc.</p><p> qeq U.S.A  asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово</p>";
        let text = Text::new({
            uws.into_source()
                .pipe(tagger::Builder::new().create().into_breaker())
                .pipe(entities::Builder::new().create().into_piped())
                .into_separator()
        })
        .unwrap();
        let lib_res = text
            .into_tokenizer({
                TokenizerParams::default()
                    .add_option(TokenizerOptions::SplitDot)
                    .add_option(TokenizerOptions::SplitUnderscore)
                    .add_option(TokenizerOptions::SplitColon)
                    .with_default_sentences()
            })
            .collect::<Vec<_>>();

        for tok in lib_res {
            println!(
                "C{:?}, B{:?}, {:?} -> {:?}",
                tok.original.map(|loc| loc.chars()),
                tok.original.map(|loc| loc.bytes()),
                tok.token,
                tok.original_str(uws)
            );
        }

        panic!()
    }
}
735
736#[cfg(test)]
737#[cfg(feature = "strings")]
738mod test_strings {
739    use super::*;
740    use text_parsing::{
741        IntoPipeParser, IntoSource, Localize, ParserExt, SourceExt, entities, tagger,
742    };
743
    /// Debug helper: dumps each localized token to stdout.
    #[allow(dead_code)]
    fn print_result(lib_res: &Vec<Local<Token>>) {
        for lt in lib_res {
            println!("{:?}", lt);
        }
    }
772    #[derive(Debug, Clone)]
801    struct CharToken {
802        byte_offset: usize,
803        byte_length: usize,
804        char_offset: usize,
805        char_length: usize,
806        token: Token,
807    }
808    impl Into<Local<Token>> for CharToken {
809        fn into(self) -> Local<Token> {
810            self.token.localize(
811                Snip {
812                    offset: self.char_offset,
813                    length: self.char_length,
814                },
815                Snip {
816                    offset: self.byte_offset,
817                    length: self.byte_length,
818                },
819            )
820        }
821    }
822
823    #[derive(Debug, Clone)]
824    struct PositionalToken {
825        source: &'static str,
826        offset: usize,
827        length: usize,
828        token: Token,
829    }
830    impl Into<Local<Token>> for PositionalToken {
831        fn into(self) -> Local<Token> {
832            self.token.localize(
833                Snip {
834                    offset: self.source[..self.offset].chars().count(),
835                    length: self.source[self.offset..self.offset + self.length]
836                        .chars()
837                        .count(),
838                },
839                Snip {
840                    offset: self.offset,
841                    length: self.length,
842                },
843            )
844        }
845    }
846
847    fn check_results(result: &Vec<PositionalToken>, lib_res: &Vec<Local<Token>>, _uws: &str) {
848        assert_eq!(result.len(), lib_res.len());
849        for i in 0..result.len() {
850            let res: Local<Token> = result[i].clone().into();
851            assert_eq!(res, lib_res[i]);
852        }
853    }
854
855    fn check_cresults(result: &Vec<CharToken>, lib_res: &Vec<Local<Token>>, _uws: &str) {
856        assert_eq!(result.len(), lib_res.len());
857        for i in 0..result.len() {
858            let res: Local<Token> = result[i].clone().into();
859            assert_eq!(res, lib_res[i]);
860        }
861    }
862
    /// Diff-style comparison helper: walks both sequences in lockstep,
    /// collecting LIB/TEST mismatch lines (length mismatches show up as
    /// `----` on the shorter side), prints the diff and panics with the
    /// mismatch count. Passes silently when the sequences are equal.
    fn check<T: Clone + std::fmt::Debug + Into<Local<Token>>>(
        res: &Vec<T>,
        lib: &Vec<Local<Token>>,
        _uws: &str,
    ) {
        let mut lib = lib.iter();
        let mut res = res.iter().map(|r| {
            let res: Local<Token> = r.clone().into();
            res
        });
        let mut diff = Vec::new();
        loop {
            match (lib.next(), res.next()) {
                (Some(lw), Some(rw)) => {
                    if *lw != rw {
                        diff.push(format!("LIB:  {:?}", lw));
                        diff.push(format!("TEST: {:?}", rw));
                        diff.push("".to_string())
                    }
                }
                (Some(lw), None) => {
                    diff.push(format!("LIB:  {:?}", lw));
                    diff.push("TEST: ----".to_string());
                    diff.push("".to_string())
                }
                (None, Some(rw)) => {
                    diff.push("LIB:  ----".to_string());
                    diff.push(format!("TEST: {:?}", rw));
                    diff.push("".to_string())
                }
                (None, None) => break,
            }
        }
        if diff.len() > 0 {
            for ln in &diff {
                println!("{}", ln);
            }
            // Each mismatch contributed three lines (LIB, TEST, blank).
            panic!("Diff count: {}", diff.len() / 3);
        }
    }
903
    /// v1 defaults: "115,7" parses as 115.7 (comma as decimal mark) while
    /// "123,398,398" parses as grouped integer 123398398; "2,123.45" mixes
    /// both; "0,05" is a decimal and '%' stays separate punctuation.
    #[test]
    #[rustfmt::skip]
    fn custom_numbers() {
        let uws = "115,7 123,398,398 2,123.45 0,05%";
        let result = vec![
            PositionalToken { source: uws, offset: 0, length: 5, token: Token::Word(Word::Number(Number::Float(115.7))) },
            PositionalToken { source: uws, offset: 5, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
            PositionalToken { source: uws, offset: 6, length: 11, token: Token::Word(Word::Number(Number::Integer(123398398))) },
            PositionalToken { source: uws, offset: 17, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
            PositionalToken { source: uws, offset: 18, length: 8, token: Token::Word(Word::Number(Number::Float(2123.45))) },
            PositionalToken { source: uws, offset: 26, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
            PositionalToken { source: uws, offset: 27, length: 4, token: Token::Word(Word::Number(Number::Float(0.05))) },
            PositionalToken { source: uws, offset: 31, length: 1, token: Token::Special(Special::Punctuation('%')) },
        ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
924
    /// A float with an all-zero fractional part collapses to an integer:
    /// "10.0000" tokenizes as Integer(10).
    #[test]
    #[rustfmt::skip]
    fn custom_numbers_ftoi() {
        let uws = "1.1 10.0000";
        let result = vec![
            PositionalToken { source: uws, offset: 0, length: 3, token: Token::Word(Word::Number(Number::Float(1.1))) },
            PositionalToken { source: uws, offset: 3, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
            PositionalToken { source: uws, offset: 4, length: 7, token: Token::Word(Word::Number(Number::Integer(10))) },
        ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
940
    /// EN-style input: "10,000" (groups of three) parses as integer 10000
    /// under v1 defaults.
    #[test]
    #[rustfmt::skip]
    fn custom_numbers_en_1() {
        let uws = "1.1 10,000";
        let result = vec![
            PositionalToken { source: uws, offset: 0, length: 3, token: Token::Word(Word::Number(Number::Float(1.1))) },
            PositionalToken { source: uws, offset: 3, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
            PositionalToken { source: uws, offset: 4, length: 6, token: Token::Word(Word::Number(Number::Integer(10000))) },
        ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
956
    /// EN-style input: "1,000.1" (grouping comma plus decimal dot) parses
    /// as 1000.1 under v1 defaults.
    #[test]
    #[rustfmt::skip]
    fn custom_numbers_en_2() {
        let uws = "1,000.1 10,000";
        let result = vec![
            PositionalToken { source: uws, offset: 0, length: 7, token: Token::Word(Word::Number(Number::Float(1000.1))) },
            PositionalToken { source: uws, offset: 7, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
            PositionalToken { source: uws, offset: 8, length: 6, token: Token::Word(Word::Number(Number::Integer(10000))) },
        ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
972
    /// With `NumberUnknownComaAsDot`, the ambiguous "10,001" still parses
    /// as grouped integer 10001 (grouping wins over comma-as-dot here).
    #[test]
    #[rustfmt::skip]
    fn custom_numbers_ru_1() {
        let uws = "1.1 10,001";
        let result = vec![
            PositionalToken { source: uws, offset: 0, length: 3, token: Token::Word(Word::Number(Number::Float(1.1))) },
            PositionalToken { source: uws, offset: 3, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
            PositionalToken { source: uws, offset: 4, length: 6, token: Token::Word(Word::Number(Number::Integer(10001))) }, ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1().add_option(TokenizerOptions::NumberUnknownComaAsDot))
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
988
    /// RU-style input under v1 defaults: "1,1" is a decimal (1.1) while
    /// "10,001" is a grouped integer (10001).
    #[test]
    #[rustfmt::skip]
    fn custom_numbers_ru_2() {
        let uws = "1,1 10,001";
        let result = vec![
            PositionalToken { source: uws, offset: 0, length: 3, token: Token::Word(Word::Number(Number::Float(1.1))) },
            PositionalToken { source: uws, offset: 3, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
            PositionalToken { source: uws, offset: 4, length: 6, token: Token::Word(Word::Number(Number::Integer(10001))) }, ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
1004
    /// "10000,1" cannot be a grouped integer (5 digits before the comma),
    /// so the comma is a decimal mark: parses as 10000.1.
    #[test]
    #[rustfmt::skip]
    fn custom_numbers_ru_3() {
        let uws = "10000,1 10,001";
        let result = vec![
            PositionalToken { source: uws, offset: 0, length: 7, token: Token::Word(Word::Number(Number::Float(10000.1))) },
            PositionalToken { source: uws, offset: 7, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
            PositionalToken { source: uws, offset: 8, length: 6, token: Token::Word(Word::Number(Number::Integer(10001))) }, ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
1020
    /// Currency signs become `Special::Currency`; note the multi-byte
    /// lengths (₽ and € are 3 bytes, ¥ is 2) in the expected spans.
    #[test]
    #[rustfmt::skip]
    fn currency() {
        let uws = "$ ₽ € ¥";
        let result = vec![
            PositionalToken { source: uws, offset: 0, length: 1, token: Token::Special(Special::Currency('$')) },
            PositionalToken { source: uws, offset: 1, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
            PositionalToken { source: uws, offset: 2, length: 3, token: Token::Special(Special::Currency('₽')) },
            PositionalToken { source: uws, offset: 5, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
            PositionalToken { source: uws, offset: 6, length: 3, token: Token::Special(Special::Currency('€')) },
            PositionalToken { source: uws, offset: 9, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
            PositionalToken { source: uws, offset: 10, length: 2, token: Token::Special(Special::Currency('¥')) },
        ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
        }
1041
    /// Runs of consecutive spaces collapse into single Separator::Space
    /// tokens covering the whole run (lengths 4/3 below), including
    /// leading and trailing runs.
    #[test]
    fn spaces() {
        let uws = "    spaces    too   many   apces   ";
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 4,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 4,
                length: 6,
                token: Token::Word(Word::Word("spaces".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 10,
                length: 4,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 14,
                length: 3,
                token: Token::Word(Word::Word("too".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 17,
                length: 3,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 20,
                length: 4,
                token: Token::Word(Word::Word("many".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 24,
                length: 3,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 27,
                length: 5,
                token: Token::Word(Word::Word("apces".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 32,
                length: 3,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
        ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
        }
1107
1108    #[test]
1109    fn numbers() {
1110        let uws = "(() -2\n()  -2";
1111        let result = vec![
1112            PositionalToken {
1113                source: uws,
1114                offset: 0,
1115                length: 1,
1116                token: Token::Special(Special::Punctuation('(')),
1117            },
1118            PositionalToken {
1119                source: uws,
1120                offset: 1,
1121                length: 1,
1122                token: Token::Special(Special::Punctuation('(')),
1123            },
1124            PositionalToken {
1125                source: uws,
1126                offset: 2,
1127                length: 1,
1128                token: Token::Special(Special::Punctuation(')')),
1129            },
1130            PositionalToken {
1131                source: uws,
1132                offset: 3,
1133                length: 1,
1134                token: Token::Special(Special::Separator(Separator::Space)),
1135            },
1136            PositionalToken {
1137                source: uws,
1138                offset: 4,
1139                length: 2,
1140                token: Token::Word(Word::Number(Number::Integer(-2))),
1141            },
1142            PositionalToken {
1143                source: uws,
1144                offset: 6,
1145                length: 1,
1146                token: Token::Special(Special::Separator(Separator::Newline)),
1147            },
1148            PositionalToken {
1149                source: uws,
1150                offset: 7,
1151                length: 1,
1152                token: Token::Special(Special::Punctuation('(')),
1153            },
1154            PositionalToken {
1155                source: uws,
1156                offset: 8,
1157                length: 1,
1158                token: Token::Special(Special::Punctuation(')')),
1159            },
1160            PositionalToken {
1161                source: uws,
1162                offset: 9,
1163                length: 2,
1164                token: Token::Special(Special::Separator(Separator::Space)),
1165            },
1166            PositionalToken {
1167                source: uws,
1168                offset: 11,
1169                length: 2,
1170                token: Token::Word(Word::Number(Number::Integer(-2))),
1171            },
1172        ];
1173        let lib_res = uws
1174            .into_tokenizer({
1175                TokenizerParams::default()
1176                    .add_option(TokenizerOptions::SplitDot)
1177                    .add_option(TokenizerOptions::SplitUnderscore)
1178                    .add_option(TokenizerOptions::SplitColon)
1179                    .add_option(TokenizerOptions::MergeWhites)
1180            })
1181            .collect::<Vec<_>>();
1182        check_results(&result, &lib_res, uws);
1183    }
1184
    #[test]
    fn word_with_inner_hyphens() {
        // NOTE(review): the expected byte lengths (14 and 28) are larger than
        // the UTF-8 size of the visible Cyrillic letters ("Опросы" = 12 bytes,
        // "показывают" = 20 bytes), so this literal almost certainly contains
        // invisible soft hyphens (U+00AD, 2 bytes each) inside the words —
        // presumably the "inner hyphens" of the test name, which also explains
        // why the tokens come back as StrangeWord rather than Word. Do NOT
        // retype this string: re-entering it would silently drop the invisible
        // characters. TODO confirm against the repository's raw bytes.
        let uws = "Опросы показывают";
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 14,
                token: Token::Word(Word::StrangeWord("Опросы".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 14,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 15,
                length: 28,
                token: Token::Word(Word::StrangeWord("показывают".to_string())),
            },
        ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
1213
1214    #[test]
1215    fn mixed_but_word() {
1216        let uws = "L’Oreal";
1217        let result = vec![PositionalToken {
1218            source: uws,
1219            offset: 0,
1220            length: 9,
1221            token: Token::Word(Word::StrangeWord("L’Oreal".to_string())),
1222        }];
1223        let lib_res = uws
1224            .into_tokenizer(TokenizerParams::v1())
1225            .collect::<Vec<_>>();
1226        check_results(&result, &lib_res, uws);
1227    }
1228
1229    #[test]
1230    fn hashtags() {
1231        let uws = "#hashtag#hashtag2";
1232        let result = vec![
1233            PositionalToken {
1234                source: uws,
1235                offset: 0,
1236                length: 8,
1237                token: Token::Struct(Struct::Hashtag("hashtag".to_string())),
1238            },
1239            PositionalToken {
1240                source: uws,
1241                offset: 8,
1242                length: 9,
1243                token: Token::Struct(Struct::Hashtag("hashtag2".to_string())),
1244            },
1245            ];
1272        let lib_res = uws
1273            .into_tokenizer(TokenizerParams::v1())
1274            .collect::<Vec<_>>();
1275        check_results(&result, &lib_res, uws);
1276    }
1277
1278    #[test]
1279    fn hashtags2() {
1280        let uws = "#hashtag#hashtag2 #hash_tag";
1281        let result = vec![
1282            PositionalToken {
1283                source: uws,
1284                offset: 0,
1285                length: 8,
1286                token: Token::Struct(Struct::Hashtag("hashtag".to_string())),
1287            },
1288            PositionalToken {
1289                source: uws,
1290                offset: 8,
1291                length: 9,
1292                token: Token::Struct(Struct::Hashtag("hashtag2".to_string())),
1293            },
1294            PositionalToken {
1295                source: uws,
1296                offset: 17,
1297                length: 1,
1298                token: Token::Special(Special::Separator(Separator::Space)),
1299            },
1300            PositionalToken {
1301                source: uws,
1302                offset: 18,
1303                length: 9,
1304                token: Token::Struct(Struct::Hashtag("hash_tag".to_string())),
1305            },
1306        ];
1307        let lib_res = uws
1308            .into_tokenizer(TokenizerParams::v1().add_option(TokenizerOptions::StructTokens))
1309            .collect::<Vec<_>>();
1310        check_results(&result, &lib_res, uws);
1311    }
1312
1313    #[test]
1314    fn mention2() {
1315        let uws = "@hashtag@hashtag2 @hash_tag";
1316        let result = vec![
1317            PositionalToken {
1318                source: uws,
1319                offset: 0,
1320                length: 8,
1321                token: Token::Struct(Struct::Mention("hashtag".to_string())),
1322            },
1323            PositionalToken {
1324                source: uws,
1325                offset: 8,
1326                length: 9,
1327                token: Token::Struct(Struct::Mention("hashtag2".to_string())),
1328            },
1329            PositionalToken {
1330                source: uws,
1331                offset: 17,
1332                length: 1,
1333                token: Token::Special(Special::Separator(Separator::Space)),
1334            },
1335            PositionalToken {
1336                source: uws,
1337                offset: 18,
1338                length: 9,
1339                token: Token::Struct(Struct::Mention("hash_tag".to_string())),
1340            },
1341        ];
1342        let lib_res = uws
1343            .into_tokenizer(TokenizerParams::v1().add_option(TokenizerOptions::StructTokens))
1344            .collect::<Vec<_>>();
1345        check_results(&result, &lib_res, uws);
1346    }
1347
1348    #[test]
1349    fn apostrophe() {
1350        let uws = "l'oreal; l\u{0060}oreal";
1351        let result = vec![
1352            PositionalToken {
1353                source: uws,
1354                offset: 0,
1355                length: 7,
1356                token: Token::Word(Word::Word("l'oreal".to_string())),
1357            },
1358            PositionalToken {
1359                source: uws,
1360                offset: 7,
1361                length: 1,
1362                token: Token::Special(Special::Punctuation(';')),
1363            },
1364            PositionalToken {
1365                source: uws,
1366                offset: 8,
1367                length: 1,
1368                token: Token::Special(Special::Separator(Separator::Space)),
1369            },
1370            PositionalToken {
1371                source: uws,
1372                offset: 9,
1373                length: 7,
1374                token: Token::Word(Word::Word("l'oreal".to_string())),
1375            },
1376        ];
1377        let text = Text::new(uws.into_source()).unwrap();
1378        let lib_res = text
1379            .into_tokenizer(TokenizerParams::v1())
1380            .filter_map(|tt| tt.into_original_token_1())
1381            .collect::<Vec<_>>();
1382        check_results(&result, &lib_res, uws);
1383    }
1384
1385    #[test]
1386    fn char_tokens() {
1387        let uws = "[Oxana Putan|1712640565] shared the quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A  asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово";
1388        let result = vec![
1389            CharToken {
1390                byte_offset: 0,
1391                byte_length: 1,
1392                char_offset: 0,
1393                char_length: 1,
1394                token: Token::Special(Special::Punctuation('[')),
1395            },
1396            CharToken {
1397                byte_offset: 1,
1398                byte_length: 5,
1399                char_offset: 1,
1400                char_length: 5,
1401                token: Token::Word(Word::Word("Oxana".to_string())),
1402            },
1403            CharToken {
1404                byte_offset: 6,
1405                byte_length: 1,
1406                char_offset: 6,
1407                char_length: 1,
1408                token: Token::Special(Special::Separator(Separator::Space)),
1409            },
1410            CharToken {
1411                byte_offset: 7,
1412                byte_length: 5,
1413                char_offset: 7,
1414                char_length: 5,
1415                token: Token::Word(Word::Word("Putan".to_string())),
1416            },
1417            CharToken {
1418                byte_offset: 12,
1419                byte_length: 1,
1420                char_offset: 12,
1421                char_length: 1,
1422                token: Token::Special(Special::Punctuation('|')),
1423            },
1424            CharToken {
1425                byte_offset: 13,
1426                byte_length: 10,
1427                char_offset: 13,
1428                char_length: 10,
1429                token: Token::Word(Word::Number(Number::Integer(1712640565))),
1430            },
1431            CharToken {
1432                byte_offset: 23,
1433                byte_length: 1,
1434                char_offset: 23,
1435                char_length: 1,
1436                token: Token::Special(Special::Punctuation(']')),
1437            },
1438            CharToken {
1446                byte_offset: 24,
1447                byte_length: 1,
1448                char_offset: 24,
1449                char_length: 1,
1450                token: Token::Special(Special::Separator(Separator::Space)),
1451            },
1452            CharToken {
1453                byte_offset: 25,
1454                byte_length: 6,
1455                char_offset: 25,
1456                char_length: 6,
1457                token: Token::Word(Word::Word("shared".to_string())),
1458            },
1459            CharToken {
1460                byte_offset: 31,
1461                byte_length: 1,
1462                char_offset: 31,
1463                char_length: 1,
1464                token: Token::Special(Special::Separator(Separator::Space)),
1465            },
1466            CharToken {
1467                byte_offset: 32,
1468                byte_length: 3,
1469                char_offset: 32,
1470                char_length: 3,
1471                token: Token::Word(Word::Word("the".to_string())),
1472            },
1473            CharToken {
1474                byte_offset: 35,
1475                byte_length: 1,
1476                char_offset: 35,
1477                char_length: 1,
1478                token: Token::Special(Special::Separator(Separator::Space)),
1479            },
1480            CharToken {
1481                byte_offset: 36,
1482                byte_length: 5,
1483                char_offset: 36,
1484                char_length: 5,
1485                token: Token::Word(Word::Word("quick".to_string())),
1486            },
1487            CharToken {
1488                byte_offset: 41,
1489                byte_length: 1,
1490                char_offset: 41,
1491                char_length: 1,
1492                token: Token::Special(Special::Separator(Separator::Space)),
1493            },
1494            CharToken {
1495                byte_offset: 42,
1496                byte_length: 1,
1497                char_offset: 42,
1498                char_length: 1,
1499                token: Token::Special(Special::Punctuation('(')),
1500            },
1501            CharToken {
1502                byte_offset: 43,
1503                byte_length: 1,
1504                char_offset: 43,
1505                char_length: 1,
1506                token: Token::Special(Special::Punctuation('"')),
1507            },
1508            CharToken {
1509                byte_offset: 44,
1510                byte_length: 5,
1511                char_offset: 44,
1512                char_length: 5,
1513                token: Token::Word(Word::Word("brown".to_string())),
1514            },
1515            CharToken {
1516                byte_offset: 49,
1517                byte_length: 1,
1518                char_offset: 49,
1519                char_length: 1,
1520                token: Token::Special(Special::Punctuation('"')),
1521            },
1522            CharToken {
1523                byte_offset: 50,
1524                byte_length: 1,
1525                char_offset: 50,
1526                char_length: 1,
1527                token: Token::Special(Special::Punctuation(')')),
1528            },
1529            CharToken {
1530                byte_offset: 51,
1531                byte_length: 1,
1532                char_offset: 51,
1533                char_length: 1,
1534                token: Token::Special(Special::Separator(Separator::Space)),
1535            },
1536            CharToken {
1537                byte_offset: 52,
1538                byte_length: 3,
1539                char_offset: 52,
1540                char_length: 3,
1541                token: Token::Word(Word::Word("fox".to_string())),
1542            },
1543            CharToken {
1544                byte_offset: 55,
1545                byte_length: 1,
1546                char_offset: 55,
1547                char_length: 1,
1548                token: Token::Special(Special::Separator(Separator::Space)),
1549            },
1550            CharToken {
1551                byte_offset: 56,
1552                byte_length: 5,
1553                char_offset: 56,
1554                char_length: 5,
1555                token: Token::Word(Word::Word("can\'t".to_string())),
1556            },
1557            CharToken {
1558                byte_offset: 61,
1559                byte_length: 1,
1560                char_offset: 61,
1561                char_length: 1,
1562                token: Token::Special(Special::Separator(Separator::Space)),
1563            },
1564            CharToken {
1565                byte_offset: 62,
1566                byte_length: 4,
1567                char_offset: 62,
1568                char_length: 4,
1569                token: Token::Word(Word::Word("jump".to_string())),
1570            },
1571            CharToken {
1572                byte_offset: 66,
1573                byte_length: 1,
1574                char_offset: 66,
1575                char_length: 1,
1576                token: Token::Special(Special::Separator(Separator::Space)),
1577            },
1578            CharToken {
1579                byte_offset: 67,
1580                byte_length: 4,
1581                char_offset: 67,
1582                char_length: 4,
1583                token: Token::Word(Word::Number(Number::Float(32.3))),
1584            },
1585            CharToken {
1586                byte_offset: 71,
1587                byte_length: 1,
1588                char_offset: 71,
1589                char_length: 1,
1590                token: Token::Special(Special::Separator(Separator::Space)),
1591            },
1592            CharToken {
1593                byte_offset: 72,
1594                byte_length: 4,
1595                char_offset: 72,
1596                char_length: 4,
1597                token: Token::Word(Word::Word("feet".to_string())),
1598            },
1599            CharToken {
1600                byte_offset: 76,
1601                byte_length: 1,
1602                char_offset: 76,
1603                char_length: 1,
1604                token: Token::Special(Special::Punctuation(',')),
1605            },
1606            CharToken {
1607                byte_offset: 77,
1608                byte_length: 1,
1609                char_offset: 77,
1610                char_length: 1,
1611                token: Token::Special(Special::Separator(Separator::Space)),
1612            },
1613            CharToken {
1614                byte_offset: 78,
1615                byte_length: 5,
1616                char_offset: 78,
1617                char_length: 5,
1618                token: Token::Word(Word::Word("right".to_string())),
1619            },
1620            CharToken {
1621                byte_offset: 83,
1622                byte_length: 1,
1623                char_offset: 83,
1624                char_length: 1,
1625                token: Token::Special(Special::Punctuation('?')),
1626            },
1627            CharToken {
1628                byte_offset: 84,
1629                byte_length: 1,
1630                char_offset: 84,
1631                char_length: 1,
1632                token: Token::Special(Special::Separator(Separator::Space)),
1633            },
1634            CharToken {
1635                byte_offset: 85,
1636                byte_length: 4,
1637                char_offset: 85,
1638                char_length: 4,
1639                token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
1640            },
1641            CharToken {
1642                byte_offset: 89,
1643                byte_length: 1,
1644                char_offset: 89,
1645                char_length: 1,
1646                token: Token::Special(Special::Separator(Separator::Space)),
1647            },
1648            CharToken {
1649                byte_offset: 90,
1650                byte_length: 3,
1651                char_offset: 90,
1652                char_length: 3,
1653                token: Token::Word(Word::Word("etc".to_string())),
1654            },
1655            CharToken {
1656                byte_offset: 93,
1657                byte_length: 1,
1658                char_offset: 93,
1659                char_length: 1,
1660                token: Token::Special(Special::Punctuation('.')),
1661            },
1662            CharToken {
1663                byte_offset: 94,
1664                byte_length: 1,
1665                char_offset: 94,
1666                char_length: 1,
1667                token: Token::Special(Special::Separator(Separator::Space)),
1668            },
1669            CharToken {
1670                byte_offset: 95,
1671                byte_length: 3,
1672                char_offset: 95,
1673                char_length: 3,
1674                token: Token::Word(Word::Word("qeq".to_string())),
1675            },
1676            CharToken {
1677                byte_offset: 98,
1678                byte_length: 1,
1679                char_offset: 98,
1680                char_length: 1,
1681                token: Token::Special(Special::Separator(Separator::Space)),
1682            },
1683            CharToken {
1684                byte_offset: 99,
1685                byte_length: 5,
1686                char_offset: 99,
1687                char_length: 5,
1688                token: Token::Word(Word::Word("U.S.A".to_string())),
1689            },
1690            CharToken {
1691                byte_offset: 104,
1692                byte_length: 2,
1693                char_offset: 104,
1694                char_length: 2,
1695                token: Token::Special(Special::Separator(Separator::Space)),
1696            },
1697            CharToken {
1698                byte_offset: 106,
1699                byte_length: 3,
1700                char_offset: 106,
1701                char_length: 3,
1702                token: Token::Word(Word::Word("asd".to_string())),
1703            },
1704            CharToken {
1705                byte_offset: 109,
1706                byte_length: 3,
1707                char_offset: 109,
1708                char_length: 3,
1709                token: Token::Special(Special::Separator(Separator::Newline)),
1710            },
1711            CharToken {
1712                byte_offset: 112,
1713                byte_length: 3,
1714                char_offset: 112,
1715                char_length: 3,
1716                token: Token::Word(Word::Word("Brr".to_string())),
1717            },
1718            CharToken {
1719                byte_offset: 115,
1720                byte_length: 1,
1721                char_offset: 115,
1722                char_length: 1,
1723                token: Token::Special(Special::Punctuation(',')),
1724            },
1725            CharToken {
1726                byte_offset: 116,
1727                byte_length: 1,
1728                char_offset: 116,
1729                char_length: 1,
1730                token: Token::Special(Special::Separator(Separator::Space)),
1731            },
1732            CharToken {
1733                byte_offset: 117,
1734                byte_length: 4,
1735                char_offset: 117,
1736                char_length: 4,
1737                token: Token::Word(Word::Word("it\'s".to_string())),
1738            },
1739            CharToken {
1740                byte_offset: 121,
1741                byte_length: 1,
1742                char_offset: 121,
1743                char_length: 1,
1744                token: Token::Special(Special::Separator(Separator::Space)),
1745            },
1746            CharToken {
1747                byte_offset: 122,
1748                byte_length: 4,
1749                char_offset: 122,
1750                char_length: 4,
1751                token: Token::Word(Word::Number(Number::Float(29.3))),
1752            },
1753            CharToken {
1754                byte_offset: 126,
1755                byte_length: 2,
1756                char_offset: 126,
1757                char_length: 1,
1758                token: Token::Special(Special::Symbol('°')),
1759            },
1760            CharToken {
1761                byte_offset: 128,
1762                byte_length: 1,
1763                char_offset: 127,
1764                char_length: 1,
1765                token: Token::Word(Word::Word("F".to_string())),
1766            },
1767            CharToken {
1768                byte_offset: 129,
1769                byte_length: 1,
1770                char_offset: 128,
1771                char_length: 1,
1772                token: Token::Special(Special::Punctuation('!')),
1773            },
1774            CharToken {
1775                byte_offset: 130,
1776                byte_length: 1,
1777                char_offset: 129,
1778                char_length: 1,
1779                token: Token::Special(Special::Separator(Separator::Newline)),
1780            },
1781            CharToken {
1782                byte_offset: 131,
1783                byte_length: 1,
1784                char_offset: 130,
1785                char_length: 1,
1786                token: Token::Special(Special::Separator(Separator::Space)),
1787            },
1788            CharToken {
1789                byte_offset: 132,
1790                byte_length: 14,
1791                char_offset: 131,
1792                char_length: 7,
1793                token: Token::Word(Word::Word("Русское".to_string())),
1794            },
1795            CharToken {
1796                byte_offset: 146,
1797                byte_length: 1,
1798                char_offset: 138,
1799                char_length: 1,
1800                token: Token::Special(Special::Separator(Separator::Space)),
1801            },
1802            CharToken {
1803                byte_offset: 147,
1804                byte_length: 22,
1805                char_offset: 139,
1806                char_length: 11,
1807                token: Token::Word(Word::Word("предложение".to_string())),
1808            },
1809            CharToken {
1810                byte_offset: 169,
1811                byte_length: 1,
1812                char_offset: 150,
1813                char_length: 1,
1814                token: Token::Special(Special::Separator(Separator::Space)),
1815            },
1816            CharToken {
1817                byte_offset: 170,
1818                byte_length: 5,
1819                char_offset: 151,
1820                char_length: 5,
1821                token: Token::Struct(Struct::Hashtag("36.6".to_string())),
1822            },
1823            CharToken {
1824                byte_offset: 175,
1825                byte_length: 1,
1826                char_offset: 156,
1827                char_length: 1,
1828                token: Token::Special(Special::Separator(Separator::Space)),
1829            },
1830            CharToken {
1831                byte_offset: 176,
1832                byte_length: 6,
1833                char_offset: 157,
1834                char_length: 3,
1835                token: Token::Word(Word::Word("для".to_string())),
1836            },
1837            CharToken {
1838                byte_offset: 182,
1839                byte_length: 1,
1840                char_offset: 160,
1841                char_length: 1,
1842                token: Token::Special(Special::Separator(Separator::Space)),
1843            },
1844            CharToken {
1845                byte_offset: 183,
1846                byte_length: 24,
1847                char_offset: 161,
1848                char_length: 12,
1849                token: Token::Word(Word::Word("тестирования".to_string())),
1850            },
1851            CharToken {
1852                byte_offset: 207,
1853                byte_length: 1,
1854                char_offset: 173,
1855                char_length: 1,
1856                token: Token::Special(Special::Separator(Separator::Space)),
1857            },
1858            CharToken {
1859                byte_offset: 208,
1860                byte_length: 14,
1861                char_offset: 174,
1862                char_length: 7,
1863                token: Token::Word(Word::Word("деления".to_string())),
1864            },
1865            CharToken {
1866                byte_offset: 222,
1867                byte_length: 1,
1868                char_offset: 181,
1869                char_length: 1,
1870                token: Token::Special(Special::Separator(Separator::Space)),
1871            },
1872            CharToken {
1873                byte_offset: 223,
1874                byte_length: 4,
1875                char_offset: 182,
1876                char_length: 2,
1877                token: Token::Word(Word::Word("по".to_string())),
1878            },
1879            CharToken {
1880                byte_offset: 227,
1881                byte_length: 1,
1882                char_offset: 184,
1883                char_length: 1,
1884                token: Token::Special(Special::Separator(Separator::Space)),
1885            },
1886            CharToken {
1887                byte_offset: 228,
1888                byte_length: 12,
1889                char_offset: 185,
1890                char_length: 6,
1891                token: Token::Word(Word::Word("юникод".to_string())),
1892            },
1893            CharToken {
1894                byte_offset: 240,
1895                byte_length: 1,
1896                char_offset: 191,
1897                char_length: 1,
1898                token: Token::Special(Special::Punctuation('-')),
1899            },
1900            CharToken {
1901                byte_offset: 241,
1902                byte_length: 12,
1903                char_offset: 192,
1904                char_length: 6,
1905                token: Token::Word(Word::Word("словам".to_string())),
1906            },
1907            CharToken {
1908                byte_offset: 253,
1909                byte_length: 3,
1910                char_offset: 198,
1911                char_length: 3,
1912                token: Token::Special(Special::Punctuation('.')),
1913            },
1914            CharToken {
1915                byte_offset: 256,
1916                byte_length: 1,
1917                char_offset: 201,
1918                char_length: 1,
1919                token: Token::Special(Special::Separator(Separator::Newline)),
1920            },
1921            CharToken {
1922                byte_offset: 257,
1923                byte_length: 8,
1924                char_offset: 202,
1925                char_length: 2,
1926                token: Token::Word(Word::Emoji("russia")),
1927            },
1928            CharToken {
1929                byte_offset: 265,
1930                byte_length: 1,
1931                char_offset: 204,
1932                char_length: 1,
1933                token: Token::Special(Special::Separator(Separator::Space)),
1934            },
1935            CharToken {
1936                byte_offset: 266,
1937                byte_length: 8,
1938                char_offset: 205,
1939                char_length: 2,
1940                token: Token::Word(Word::Emoji("sao_tome_and_principe")),
1941            },
1942            CharToken {
1943                byte_offset: 274,
1944                byte_length: 1,
1945                char_offset: 207,
1946                char_length: 1,
1947                token: Token::Special(Special::Separator(Separator::Newline)),
1948            },
1949            CharToken {
1950                byte_offset: 275,
1951                byte_length: 8,
1952                char_offset: 208,
1953                char_length: 2,
1954                token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
1955            },
1956            CharToken {
1957                byte_offset: 283,
1958                byte_length: 8,
1959                char_offset: 210,
1960                char_length: 2,
1961                token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
1962            },
1963            CharToken {
1964                byte_offset: 291,
1965                byte_length: 8,
1966                char_offset: 212,
1967                char_length: 2,
1968                token: Token::Word(Word::Emoji("man_medium_skin_tone")),
1969            },
1970            CharToken {
1971                byte_offset: 299,
1972                byte_length: 1,
1973                char_offset: 214,
1974                char_length: 1,
1975                token: Token::Special(Special::Separator(Separator::Newline)),
1976            },
1977            CharToken {
1978                byte_offset: 300,
1979                byte_length: 1,
1980                char_offset: 215,
1981                char_length: 1,
1982                token: Token::Special(Special::Punctuation('+')),
1983            },
1984            CharToken {
1985                byte_offset: 301,
1986                byte_length: 4,
1987                char_offset: 216,
1988                char_length: 4,
1989                token: Token::Word(Word::Word("Done".to_string())),
1990            },
1991            CharToken {
1992                byte_offset: 305,
1993                byte_length: 1,
1994                char_offset: 220,
1995                char_length: 1,
1996                token: Token::Special(Special::Punctuation('!')),
1997            },
1998            CharToken {
1999                byte_offset: 306,
2000                byte_length: 1,
2001                char_offset: 221,
2002                char_length: 1,
2003                token: Token::Special(Special::Separator(Separator::Space)),
2004            },
2005            CharToken {
2006                byte_offset: 307,
2007                byte_length: 12,
2008                char_offset: 222,
2009                char_length: 6,
2010                token: Token::Word(Word::Word("Готово".to_string())),
2011            },
2012        ];
2013
2014        let lib_res = uws
2015            .into_tokenizer(TokenizerParams::complex())
2016            .collect::<Vec<_>>();
2017
2018        check_cresults(&result, &lib_res, uws);
2020    }
2021
    // End-to-end fixture test for the default (`v1`) tokenizer parameters on a
    // mixed English/Russian sentence with punctuation, floats, a measure-like
    // token ("4pda"), a hashtag and multi-byte UTF-8 words.
    //
    // All `offset`/`length` values are in BYTES, not chars: Cyrillic letters
    // and '°' occupy 2 bytes each in UTF-8 (e.g. "Русское" has length 14).
    //
    // Behaviors pinned by this fixture that are specific to `v1` params
    // (contrast with the `general_no_split` test below):
    //   * the run "\n\n\n" is merged into a single Newline separator (len 3);
    //   * the double space after "U.S.A" is one Space separator (len 2);
    //   * "U.S.A" is split into letter words and '.' punctuation tokens;
    //   * "#36.6" is recognized as a Hashtag struct token;
    //   * the trailing "..." is one Punctuation('.') token of length 3.
    #[test]
    fn general_default() {
        let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A  asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 3,
                token: Token::Word(Word::Word("The".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 3,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 4,
                length: 5,
                token: Token::Word(Word::Word("quick".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 9,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 10,
                length: 1,
                token: Token::Special(Special::Punctuation('(')),
            },
            PositionalToken {
                source: uws,
                offset: 11,
                length: 1,
                token: Token::Special(Special::Punctuation('"')),
            },
            PositionalToken {
                source: uws,
                offset: 12,
                length: 5,
                token: Token::Word(Word::Word("brown".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 17,
                length: 1,
                token: Token::Special(Special::Punctuation('"')),
            },
            PositionalToken {
                source: uws,
                offset: 18,
                length: 1,
                token: Token::Special(Special::Punctuation(')')),
            },
            PositionalToken {
                source: uws,
                offset: 19,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 20,
                length: 3,
                token: Token::Word(Word::Word("fox".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 23,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // apostrophe is kept inside the word: "can't" is one token
            PositionalToken {
                source: uws,
                offset: 24,
                length: 5,
                token: Token::Word(Word::Word("can\'t".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 29,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 30,
                length: 4,
                token: Token::Word(Word::Word("jump".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 34,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // "32.3" parses as a float number token, not word + '.' + word
            PositionalToken {
                source: uws,
                offset: 35,
                length: 4,
                token: Token::Word(Word::Number(Number::Float(32.3))),
            },
            PositionalToken {
                source: uws,
                offset: 39,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 40,
                length: 4,
                token: Token::Word(Word::Word("feet".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 44,
                length: 1,
                token: Token::Special(Special::Punctuation(',')),
            },
            PositionalToken {
                source: uws,
                offset: 45,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 46,
                length: 5,
                token: Token::Word(Word::Word("right".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 51,
                length: 1,
                token: Token::Special(Special::Punctuation('?')),
            },
            PositionalToken {
                source: uws,
                offset: 52,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // digits+letters ("4pda") classify as a Numerical::Measures token
            PositionalToken {
                source: uws,
                offset: 53,
                length: 4,
                token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
            }, PositionalToken {
                source: uws,
                offset: 57,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 58,
                length: 3,
                token: Token::Word(Word::Word("etc".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 61,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 62,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 63,
                length: 3,
                token: Token::Word(Word::Word("qeq".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 66,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // "U.S.A" splits into letter words and '.' punctuation under v1
            PositionalToken {
                source: uws,
                offset: 67,
                length: 1,
                token: Token::Word(Word::Word("U".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 68,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 69,
                length: 1,
                token: Token::Word(Word::Word("S".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 70,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 71,
                length: 1,
                token: Token::Word(Word::Word("A".to_string())),
            },
            // the double space is merged into one Space separator (length 2)
            PositionalToken {
                source: uws,
                offset: 72,
                length: 2,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 74,
                length: 3,
                token: Token::Word(Word::Word("asd".to_string())),
            },
            // "\n\n\n" is merged into one Newline separator (length 3)
            PositionalToken {
                source: uws,
                offset: 77,
                length: 3,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
            PositionalToken {
                source: uws,
                offset: 80,
                length: 3,
                token: Token::Word(Word::Word("Brr".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 83,
                length: 1,
                token: Token::Special(Special::Punctuation(',')),
            },
            PositionalToken {
                source: uws,
                offset: 84,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 85,
                length: 4,
                token: Token::Word(Word::Word("it\'s".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 89,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 90,
                length: 4,
                token: Token::Word(Word::Number(Number::Float(29.3))),
            },
            // '°' is 2 bytes in UTF-8, classified as a Symbol
            PositionalToken {
                source: uws,
                offset: 94,
                length: 2,
                token: Token::Special(Special::Symbol('°')),
            },
            PositionalToken {
                source: uws,
                offset: 96,
                length: 1,
                token: Token::Word(Word::Word("F".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 97,
                length: 1,
                token: Token::Special(Special::Punctuation('!')),
            },
            PositionalToken {
                source: uws,
                offset: 98,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
            PositionalToken {
                source: uws,
                offset: 99,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // Cyrillic words: byte length = 2 × char count ("Русское" = 7 chars)
            PositionalToken {
                source: uws,
                offset: 100,
                length: 14,
                token: Token::Word(Word::Word("Русское".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 114,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 115,
                length: 22,
                token: Token::Word(Word::Word("предложение".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 137,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // "#36.6" is recognized as a Hashtag struct token (5 bytes incl. '#')
            PositionalToken {
                source: uws,
                offset: 138,
                length: 5,
                token: Token::Struct(Struct::Hashtag("36.6".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 143,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 144,
                length: 6,
                token: Token::Word(Word::Word("для".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 150,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 151,
                length: 24,
                token: Token::Word(Word::Word("тестирования".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 175,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 176,
                length: 14,
                token: Token::Word(Word::Word("деления".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 190,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 191,
                length: 4,
                token: Token::Word(Word::Word("по".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 195,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // hyphenated compound splits into word '-' word
            PositionalToken {
                source: uws,
                offset: 196,
                length: 12,
                token: Token::Word(Word::Word("юникод".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 208,
                length: 1,
                token: Token::Special(Special::Punctuation('-')),
            },
            PositionalToken {
                source: uws,
                offset: 209,
                length: 12,
                token: Token::Word(Word::Word("словам".to_string())),
            },
            // trailing "..." merges into one Punctuation('.') of length 3
            PositionalToken {
                source: uws,
                offset: 221,
                length: 3,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 224,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
        ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
2458
2459    #[test]
2460    fn general_no_split() {
2461        let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A  asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
2462        let result = vec![
2463            PositionalToken {
2464                source: uws,
2465                offset: 0,
2466                length: 3,
2467                token: Token::Word(Word::Word("The".to_string())),
2468            },
2469            PositionalToken {
2470                source: uws,
2471                offset: 3,
2472                length: 1,
2473                token: Token::Special(Special::Separator(Separator::Space)),
2474            },
2475            PositionalToken {
2476                source: uws,
2477                offset: 4,
2478                length: 5,
2479                token: Token::Word(Word::Word("quick".to_string())),
2480            },
2481            PositionalToken {
2482                source: uws,
2483                offset: 9,
2484                length: 1,
2485                token: Token::Special(Special::Separator(Separator::Space)),
2486            },
2487            PositionalToken {
2488                source: uws,
2489                offset: 10,
2490                length: 1,
2491                token: Token::Special(Special::Punctuation('(')),
2492            },
2493            PositionalToken {
2494                source: uws,
2495                offset: 11,
2496                length: 1,
2497                token: Token::Special(Special::Punctuation('"')),
2498            },
2499            PositionalToken {
2500                source: uws,
2501                offset: 12,
2502                length: 5,
2503                token: Token::Word(Word::Word("brown".to_string())),
2504            },
2505            PositionalToken {
2506                source: uws,
2507                offset: 17,
2508                length: 1,
2509                token: Token::Special(Special::Punctuation('"')),
2510            },
2511            PositionalToken {
2512                source: uws,
2513                offset: 18,
2514                length: 1,
2515                token: Token::Special(Special::Punctuation(')')),
2516            },
2517            PositionalToken {
2518                source: uws,
2519                offset: 19,
2520                length: 1,
2521                token: Token::Special(Special::Separator(Separator::Space)),
2522            },
2523            PositionalToken {
2524                source: uws,
2525                offset: 20,
2526                length: 3,
2527                token: Token::Word(Word::Word("fox".to_string())),
2528            },
2529            PositionalToken {
2530                source: uws,
2531                offset: 23,
2532                length: 1,
2533                token: Token::Special(Special::Separator(Separator::Space)),
2534            },
2535            PositionalToken {
2536                source: uws,
2537                offset: 24,
2538                length: 5,
2539                token: Token::Word(Word::Word("can\'t".to_string())),
2540            },
2541            PositionalToken {
2542                source: uws,
2543                offset: 29,
2544                length: 1,
2545                token: Token::Special(Special::Separator(Separator::Space)),
2546            },
2547            PositionalToken {
2548                source: uws,
2549                offset: 30,
2550                length: 4,
2551                token: Token::Word(Word::Word("jump".to_string())),
2552            },
2553            PositionalToken {
2554                source: uws,
2555                offset: 34,
2556                length: 1,
2557                token: Token::Special(Special::Separator(Separator::Space)),
2558            },
2559            PositionalToken {
2560                source: uws,
2561                offset: 35,
2562                length: 4,
2563                token: Token::Word(Word::Number(Number::Float(32.3))),
2564            },
2565            PositionalToken {
2566                source: uws,
2567                offset: 39,
2568                length: 1,
2569                token: Token::Special(Special::Separator(Separator::Space)),
2570            },
2571            PositionalToken {
2572                source: uws,
2573                offset: 40,
2574                length: 4,
2575                token: Token::Word(Word::Word("feet".to_string())),
2576            },
2577            PositionalToken {
2578                source: uws,
2579                offset: 44,
2580                length: 1,
2581                token: Token::Special(Special::Punctuation(',')),
2582            },
2583            PositionalToken {
2584                source: uws,
2585                offset: 45,
2586                length: 1,
2587                token: Token::Special(Special::Separator(Separator::Space)),
2588            },
2589            PositionalToken {
2590                source: uws,
2591                offset: 46,
2592                length: 5,
2593                token: Token::Word(Word::Word("right".to_string())),
2594            },
2595            PositionalToken {
2596                source: uws,
2597                offset: 51,
2598                length: 1,
2599                token: Token::Special(Special::Punctuation('?')),
2600            },
2601            PositionalToken {
2602                source: uws,
2603                offset: 52,
2604                length: 1,
2605                token: Token::Special(Special::Separator(Separator::Space)),
2606            },
2607            PositionalToken {
2608                source: uws,
2609                offset: 53,
2610                length: 4,
2611                token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
2612            }, PositionalToken {
2614                source: uws,
2615                offset: 57,
2616                length: 1,
2617                token: Token::Special(Special::Separator(Separator::Space)),
2618            },
2619            PositionalToken {
2620                source: uws,
2621                offset: 58,
2622                length: 3,
2623                token: Token::Word(Word::Word("etc".to_string())),
2624            },
2625            PositionalToken {
2626                source: uws,
2627                offset: 61,
2628                length: 1,
2629                token: Token::Special(Special::Punctuation('.')),
2630            },
2631            PositionalToken {
2632                source: uws,
2633                offset: 62,
2634                length: 1,
2635                token: Token::Special(Special::Separator(Separator::Space)),
2636            },
2637            PositionalToken {
2638                source: uws,
2639                offset: 63,
2640                length: 3,
2641                token: Token::Word(Word::Word("qeq".to_string())),
2642            },
2643            PositionalToken {
2644                source: uws,
2645                offset: 66,
2646                length: 1,
2647                token: Token::Special(Special::Separator(Separator::Space)),
2648            },
2649            PositionalToken {
2650                source: uws,
2651                offset: 67,
2652                length: 5,
2653                token: Token::Word(Word::Word("U.S.A".to_string())),
2654            },
2655            PositionalToken {
2656                source: uws,
2657                offset: 72,
2658                length: 1,
2659                token: Token::Special(Special::Separator(Separator::Space)),
2660            },
2661            PositionalToken {
2662                source: uws,
2663                offset: 73,
2664                length: 1,
2665                token: Token::Special(Special::Separator(Separator::Space)),
2666            },
2667            PositionalToken {
2668                source: uws,
2669                offset: 74,
2670                length: 3,
2671                token: Token::Word(Word::Word("asd".to_string())),
2672            },
2673            PositionalToken {
2674                source: uws,
2675                offset: 77,
2676                length: 1,
2677                token: Token::Special(Special::Separator(Separator::Newline)),
2678            },
2679            PositionalToken {
2680                source: uws,
2681                offset: 78,
2682                length: 1,
2683                token: Token::Special(Special::Separator(Separator::Newline)),
2684            },
2685            PositionalToken {
2686                source: uws,
2687                offset: 79,
2688                length: 1,
2689                token: Token::Special(Special::Separator(Separator::Newline)),
2690            },
2691            PositionalToken {
2692                source: uws,
2693                offset: 80,
2694                length: 3,
2695                token: Token::Word(Word::Word("Brr".to_string())),
2696            },
2697            PositionalToken {
2698                source: uws,
2699                offset: 83,
2700                length: 1,
2701                token: Token::Special(Special::Punctuation(',')),
2702            },
2703            PositionalToken {
2704                source: uws,
2705                offset: 84,
2706                length: 1,
2707                token: Token::Special(Special::Separator(Separator::Space)),
2708            },
2709            PositionalToken {
2710                source: uws,
2711                offset: 85,
2712                length: 4,
2713                token: Token::Word(Word::Word("it\'s".to_string())),
2714            },
2715            PositionalToken {
2716                source: uws,
2717                offset: 89,
2718                length: 1,
2719                token: Token::Special(Special::Separator(Separator::Space)),
2720            },
2721            PositionalToken {
2722                source: uws,
2723                offset: 90,
2724                length: 4,
2725                token: Token::Word(Word::Number(Number::Float(29.3))),
2726            },
2727            PositionalToken {
2728                source: uws,
2729                offset: 94,
2730                length: 2,
2731                token: Token::Special(Special::Symbol('°')),
2732            },
2733            PositionalToken {
2734                source: uws,
2735                offset: 96,
2736                length: 1,
2737                token: Token::Word(Word::Word("F".to_string())),
2738            },
2739            PositionalToken {
2740                source: uws,
2741                offset: 97,
2742                length: 1,
2743                token: Token::Special(Special::Punctuation('!')),
2744            },
2745            PositionalToken {
2746                source: uws,
2747                offset: 98,
2748                length: 1,
2749                token: Token::Special(Special::Separator(Separator::Newline)),
2750            },
2751            PositionalToken {
2752                source: uws,
2753                offset: 99,
2754                length: 1,
2755                token: Token::Special(Special::Separator(Separator::Space)),
2756            },
2757            PositionalToken {
2758                source: uws,
2759                offset: 100,
2760                length: 14,
2761                token: Token::Word(Word::Word("Русское".to_string())),
2762            },
2763            PositionalToken {
2764                source: uws,
2765                offset: 114,
2766                length: 1,
2767                token: Token::Special(Special::Separator(Separator::Space)),
2768            },
2769            PositionalToken {
2770                source: uws,
2771                offset: 115,
2772                length: 22,
2773                token: Token::Word(Word::Word("предложение".to_string())),
2774            },
2775            PositionalToken {
2776                source: uws,
2777                offset: 137,
2778                length: 1,
2779                token: Token::Special(Special::Separator(Separator::Space)),
2780            },
2781            PositionalToken {
2782                source: uws,
2783                offset: 138,
2784                length: 1,
2785                token: Token::Special(Special::Punctuation('#')),
2786            },
2787            PositionalToken {
2788                source: uws,
2789                offset: 139,
2790                length: 4,
2791                token: Token::Word(Word::Number(Number::Float(36.6))),
2792            },
2793            PositionalToken {
2794                source: uws,
2795                offset: 143,
2796                length: 1,
2797                token: Token::Special(Special::Separator(Separator::Space)),
2798            },
2799            PositionalToken {
2800                source: uws,
2801                offset: 144,
2802                length: 6,
2803                token: Token::Word(Word::Word("для".to_string())),
2804            },
2805            PositionalToken {
2806                source: uws,
2807                offset: 150,
2808                length: 1,
2809                token: Token::Special(Special::Separator(Separator::Space)),
2810            },
2811            PositionalToken {
2812                source: uws,
2813                offset: 151,
2814                length: 24,
2815                token: Token::Word(Word::Word("тестирования".to_string())),
2816            },
2817            PositionalToken {
2818                source: uws,
2819                offset: 175,
2820                length: 1,
2821                token: Token::Special(Special::Separator(Separator::Space)),
2822            },
2823            PositionalToken {
2824                source: uws,
2825                offset: 176,
2826                length: 14,
2827                token: Token::Word(Word::Word("деления".to_string())),
2828            },
2829            PositionalToken {
2830                source: uws,
2831                offset: 190,
2832                length: 1,
2833                token: Token::Special(Special::Separator(Separator::Space)),
2834            },
2835            PositionalToken {
2836                source: uws,
2837                offset: 191,
2838                length: 4,
2839                token: Token::Word(Word::Word("по".to_string())),
2840            },
2841            PositionalToken {
2842                source: uws,
2843                offset: 195,
2844                length: 1,
2845                token: Token::Special(Special::Separator(Separator::Space)),
2846            },
2847            PositionalToken {
2848                source: uws,
2849                offset: 196,
2850                length: 12,
2851                token: Token::Word(Word::Word("юникод".to_string())),
2852            },
2853            PositionalToken {
2854                source: uws,
2855                offset: 208,
2856                length: 1,
2857                token: Token::Special(Special::Punctuation('-')),
2858            },
2859            PositionalToken {
2860                source: uws,
2861                offset: 209,
2862                length: 12,
2863                token: Token::Word(Word::Word("словам".to_string())),
2864            },
2865            PositionalToken {
2866                source: uws,
2867                offset: 221,
2868                length: 1,
2869                token: Token::Special(Special::Punctuation('.')),
2870            },
2871            PositionalToken {
2872                source: uws,
2873                offset: 222,
2874                length: 1,
2875                token: Token::Special(Special::Punctuation('.')),
2876            },
2877            PositionalToken {
2878                source: uws,
2879                offset: 223,
2880                length: 1,
2881                token: Token::Special(Special::Punctuation('.')),
2882            },
2883            PositionalToken {
2884                source: uws,
2885                offset: 224,
2886                length: 1,
2887                token: Token::Special(Special::Separator(Separator::Newline)),
2888            },
2889        ];
2890        let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
2891        check_results(&result, &lib_res, uws);
2892    }
2893
2894    #[test]
2895    fn general_complex() {
2896        let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A  asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
2897        let result = vec![
2898            PositionalToken {
2899                source: uws,
2900                offset: 0,
2901                length: 3,
2902                token: Token::Word(Word::Word("The".to_string())),
2903            },
2904            PositionalToken {
2905                source: uws,
2906                offset: 3,
2907                length: 1,
2908                token: Token::Special(Special::Separator(Separator::Space)),
2909            },
2910            PositionalToken {
2911                source: uws,
2912                offset: 4,
2913                length: 5,
2914                token: Token::Word(Word::Word("quick".to_string())),
2915            },
2916            PositionalToken {
2917                source: uws,
2918                offset: 9,
2919                length: 1,
2920                token: Token::Special(Special::Separator(Separator::Space)),
2921            },
2922            PositionalToken {
2923                source: uws,
2924                offset: 10,
2925                length: 1,
2926                token: Token::Special(Special::Punctuation('(')),
2927            },
2928            PositionalToken {
2929                source: uws,
2930                offset: 11,
2931                length: 1,
2932                token: Token::Special(Special::Punctuation('"')),
2933            },
2934            PositionalToken {
2935                source: uws,
2936                offset: 12,
2937                length: 5,
2938                token: Token::Word(Word::Word("brown".to_string())),
2939            },
2940            PositionalToken {
2941                source: uws,
2942                offset: 17,
2943                length: 1,
2944                token: Token::Special(Special::Punctuation('"')),
2945            },
2946            PositionalToken {
2947                source: uws,
2948                offset: 18,
2949                length: 1,
2950                token: Token::Special(Special::Punctuation(')')),
2951            },
2952            PositionalToken {
2953                source: uws,
2954                offset: 19,
2955                length: 1,
2956                token: Token::Special(Special::Separator(Separator::Space)),
2957            },
2958            PositionalToken {
2959                source: uws,
2960                offset: 20,
2961                length: 3,
2962                token: Token::Word(Word::Word("fox".to_string())),
2963            },
2964            PositionalToken {
2965                source: uws,
2966                offset: 23,
2967                length: 1,
2968                token: Token::Special(Special::Separator(Separator::Space)),
2969            },
2970            PositionalToken {
2971                source: uws,
2972                offset: 24,
2973                length: 5,
2974                token: Token::Word(Word::Word("can\'t".to_string())),
2975            },
2976            PositionalToken {
2977                source: uws,
2978                offset: 29,
2979                length: 1,
2980                token: Token::Special(Special::Separator(Separator::Space)),
2981            },
2982            PositionalToken {
2983                source: uws,
2984                offset: 30,
2985                length: 4,
2986                token: Token::Word(Word::Word("jump".to_string())),
2987            },
2988            PositionalToken {
2989                source: uws,
2990                offset: 34,
2991                length: 1,
2992                token: Token::Special(Special::Separator(Separator::Space)),
2993            },
2994            PositionalToken {
2995                source: uws,
2996                offset: 35,
2997                length: 4,
2998                token: Token::Word(Word::Number(Number::Float(32.3))),
2999            },
3000            PositionalToken {
3001                source: uws,
3002                offset: 39,
3003                length: 1,
3004                token: Token::Special(Special::Separator(Separator::Space)),
3005            },
3006            PositionalToken {
3007                source: uws,
3008                offset: 40,
3009                length: 4,
3010                token: Token::Word(Word::Word("feet".to_string())),
3011            },
3012            PositionalToken {
3013                source: uws,
3014                offset: 44,
3015                length: 1,
3016                token: Token::Special(Special::Punctuation(',')),
3017            },
3018            PositionalToken {
3019                source: uws,
3020                offset: 45,
3021                length: 1,
3022                token: Token::Special(Special::Separator(Separator::Space)),
3023            },
3024            PositionalToken {
3025                source: uws,
3026                offset: 46,
3027                length: 5,
3028                token: Token::Word(Word::Word("right".to_string())),
3029            },
3030            PositionalToken {
3031                source: uws,
3032                offset: 51,
3033                length: 1,
3034                token: Token::Special(Special::Punctuation('?')),
3035            },
3036            PositionalToken {
3037                source: uws,
3038                offset: 52,
3039                length: 1,
3040                token: Token::Special(Special::Separator(Separator::Space)),
3041            },
3042            PositionalToken {
3043                source: uws,
3044                offset: 53,
3045                length: 4,
3046                token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
3047            }, PositionalToken {
3049                source: uws,
3050                offset: 57,
3051                length: 1,
3052                token: Token::Special(Special::Separator(Separator::Space)),
3053            },
3054            PositionalToken {
3055                source: uws,
3056                offset: 58,
3057                length: 3,
3058                token: Token::Word(Word::Word("etc".to_string())),
3059            },
3060            PositionalToken {
3061                source: uws,
3062                offset: 61,
3063                length: 1,
3064                token: Token::Special(Special::Punctuation('.')),
3065            },
3066            PositionalToken {
3067                source: uws,
3068                offset: 62,
3069                length: 1,
3070                token: Token::Special(Special::Separator(Separator::Space)),
3071            },
3072            PositionalToken {
3073                source: uws,
3074                offset: 63,
3075                length: 3,
3076                token: Token::Word(Word::Word("qeq".to_string())),
3077            },
3078            PositionalToken {
3079                source: uws,
3080                offset: 66,
3081                length: 1,
3082                token: Token::Special(Special::Separator(Separator::Space)),
3083            },
3084            PositionalToken {
3085                source: uws,
3086                offset: 67,
3087                length: 5,
3088                token: Token::Word(Word::Word("U.S.A".to_string())),
3089            },
3090            PositionalToken {
3091                source: uws,
3092                offset: 72,
3093                length: 2,
3094                token: Token::Special(Special::Separator(Separator::Space)),
3095            },
3096            PositionalToken {
3097                source: uws,
3098                offset: 74,
3099                length: 3,
3100                token: Token::Word(Word::Word("asd".to_string())),
3101            },
3102            PositionalToken {
3103                source: uws,
3104                offset: 77,
3105                length: 3,
3106                token: Token::Special(Special::Separator(Separator::Newline)),
3107            },
3108            PositionalToken {
3109                source: uws,
3110                offset: 80,
3111                length: 3,
3112                token: Token::Word(Word::Word("Brr".to_string())),
3113            },
3114            PositionalToken {
3115                source: uws,
3116                offset: 83,
3117                length: 1,
3118                token: Token::Special(Special::Punctuation(',')),
3119            },
3120            PositionalToken {
3121                source: uws,
3122                offset: 84,
3123                length: 1,
3124                token: Token::Special(Special::Separator(Separator::Space)),
3125            },
3126            PositionalToken {
3127                source: uws,
3128                offset: 85,
3129                length: 4,
3130                token: Token::Word(Word::Word("it\'s".to_string())),
3131            },
3132            PositionalToken {
3133                source: uws,
3134                offset: 89,
3135                length: 1,
3136                token: Token::Special(Special::Separator(Separator::Space)),
3137            },
3138            PositionalToken {
3139                source: uws,
3140                offset: 90,
3141                length: 4,
3142                token: Token::Word(Word::Number(Number::Float(29.3))),
3143            },
3144            PositionalToken {
3145                source: uws,
3146                offset: 94,
3147                length: 2,
3148                token: Token::Special(Special::Symbol('°')),
3149            },
3150            PositionalToken {
3151                source: uws,
3152                offset: 96,
3153                length: 1,
3154                token: Token::Word(Word::Word("F".to_string())),
3155            },
3156            PositionalToken {
3157                source: uws,
3158                offset: 97,
3159                length: 1,
3160                token: Token::Special(Special::Punctuation('!')),
3161            },
3162            PositionalToken {
3163                source: uws,
3164                offset: 98,
3165                length: 1,
3166                token: Token::Special(Special::Separator(Separator::Newline)),
3167            },
3168            PositionalToken {
3169                source: uws,
3170                offset: 99,
3171                length: 1,
3172                token: Token::Special(Special::Separator(Separator::Space)),
3173            },
3174            PositionalToken {
3175                source: uws,
3176                offset: 100,
3177                length: 14,
3178                token: Token::Word(Word::Word("Русское".to_string())),
3179            },
3180            PositionalToken {
3181                source: uws,
3182                offset: 114,
3183                length: 1,
3184                token: Token::Special(Special::Separator(Separator::Space)),
3185            },
3186            PositionalToken {
3187                source: uws,
3188                offset: 115,
3189                length: 22,
3190                token: Token::Word(Word::Word("предложение".to_string())),
3191            },
3192            PositionalToken {
3193                source: uws,
3194                offset: 137,
3195                length: 1,
3196                token: Token::Special(Special::Separator(Separator::Space)),
3197            },
3198            PositionalToken {
3199                source: uws,
3200                offset: 138,
3201                length: 5,
3202                token: Token::Struct(Struct::Hashtag("36.6".to_string())),
3203            },
3204            PositionalToken {
3205                source: uws,
3206                offset: 143,
3207                length: 1,
3208                token: Token::Special(Special::Separator(Separator::Space)),
3209            },
3210            PositionalToken {
3211                source: uws,
3212                offset: 144,
3213                length: 6,
3214                token: Token::Word(Word::Word("для".to_string())),
3215            },
3216            PositionalToken {
3217                source: uws,
3218                offset: 150,
3219                length: 1,
3220                token: Token::Special(Special::Separator(Separator::Space)),
3221            },
3222            PositionalToken {
3223                source: uws,
3224                offset: 151,
3225                length: 24,
3226                token: Token::Word(Word::Word("тестирования".to_string())),
3227            },
3228            PositionalToken {
3229                source: uws,
3230                offset: 175,
3231                length: 1,
3232                token: Token::Special(Special::Separator(Separator::Space)),
3233            },
3234            PositionalToken {
3235                source: uws,
3236                offset: 176,
3237                length: 14,
3238                token: Token::Word(Word::Word("деления".to_string())),
3239            },
3240            PositionalToken {
3241                source: uws,
3242                offset: 190,
3243                length: 1,
3244                token: Token::Special(Special::Separator(Separator::Space)),
3245            },
3246            PositionalToken {
3247                source: uws,
3248                offset: 191,
3249                length: 4,
3250                token: Token::Word(Word::Word("по".to_string())),
3251            },
3252            PositionalToken {
3253                source: uws,
3254                offset: 195,
3255                length: 1,
3256                token: Token::Special(Special::Separator(Separator::Space)),
3257            },
3258            PositionalToken {
3259                source: uws,
3260                offset: 196,
3261                length: 12,
3262                token: Token::Word(Word::Word("юникод".to_string())),
3263            },
3264            PositionalToken {
3265                source: uws,
3266                offset: 208,
3267                length: 1,
3268                token: Token::Special(Special::Punctuation('-')),
3269            },
3270            PositionalToken {
3271                source: uws,
3272                offset: 209,
3273                length: 12,
3274                token: Token::Word(Word::Word("словам".to_string())),
3275            },
3276            PositionalToken {
3277                source: uws,
3278                offset: 221,
3279                length: 3,
3280                token: Token::Special(Special::Punctuation('.')),
3281            },
3282            PositionalToken {
3283                source: uws,
3284                offset: 224,
3285                length: 1,
3286                token: Token::Special(Special::Separator(Separator::Newline)),
3287            },
3288        ];
3289        let lib_res = uws
3290            .into_tokenizer(TokenizerParams::complex())
3291            .collect::<Vec<_>>();
3292        check_results(&result, &lib_res, uws);
3293    }
3294
3295    #[test]
3296    fn plus_minus() {
3297        let uws = "+23 -4.5 -34 +25.7 - 2 + 5.6";
3298        let result = vec![
3299            PositionalToken {
3300                source: uws,
3301                offset: 0,
3302                length: 3,
3303                token: Token::Word(Word::Number(Number::Integer(23))),
3304            },
3305            PositionalToken {
3306                source: uws,
3307                offset: 3,
3308                length: 1,
3309                token: Token::Special(Special::Separator(Separator::Space)),
3310            },
3311            PositionalToken {
3312                source: uws,
3313                offset: 4,
3314                length: 4,
3315                token: Token::Word(Word::Number(Number::Float(-4.5))),
3316            },
3317            PositionalToken {
3318                source: uws,
3319                offset: 8,
3320                length: 1,
3321                token: Token::Special(Special::Separator(Separator::Space)),
3322            },
3323            PositionalToken {
3324                source: uws,
3325                offset: 9,
3326                length: 3,
3327                token: Token::Word(Word::Number(Number::Integer(-34))),
3328            },
3329            PositionalToken {
3330                source: uws,
3331                offset: 12,
3332                length: 1,
3333                token: Token::Special(Special::Separator(Separator::Space)),
3334            },
3335            PositionalToken {
3336                source: uws,
3337                offset: 13,
3338                length: 5,
3339                token: Token::Word(Word::Number(Number::Float(25.7))),
3340            },
3341            PositionalToken {
3342                source: uws,
3343                offset: 18,
3344                length: 1,
3345                token: Token::Special(Special::Separator(Separator::Space)),
3346            },
3347            PositionalToken {
3348                source: uws,
3349                offset: 19,
3350                length: 1,
3351                token: Token::Special(Special::Punctuation('-')),
3352            },
3353            PositionalToken {
3354                source: uws,
3355                offset: 20,
3356                length: 1,
3357                token: Token::Special(Special::Separator(Separator::Space)),
3358            },
3359            PositionalToken {
3360                source: uws,
3361                offset: 21,
3362                length: 1,
3363                token: Token::Word(Word::Number(Number::Integer(2))),
3364            },
3365            PositionalToken {
3366                source: uws,
3367                offset: 22,
3368                length: 1,
3369                token: Token::Special(Special::Separator(Separator::Space)),
3370            },
3371            PositionalToken {
3372                source: uws,
3373                offset: 23,
3374                length: 1,
3375                token: Token::Special(Special::Punctuation('+')),
3376            },
3377            PositionalToken {
3378                source: uws,
3379                offset: 24,
3380                length: 1,
3381                token: Token::Special(Special::Separator(Separator::Space)),
3382            },
3383            PositionalToken {
3384                source: uws,
3385                offset: 25,
3386                length: 3,
3387                token: Token::Word(Word::Number(Number::Float(5.6))),
3388            },
3389        ];
3390        let lib_res = uws
3391            .into_tokenizer(TokenizerParams::v1())
3392            .collect::<Vec<_>>();
3393        check(&result, &lib_res, uws);
3394        }
3396
3397    #[test]
3398    #[ignore]
3399    fn woman_bouncing_ball() {
3400        let uws = "\u{26f9}\u{200d}\u{2640}";
3401        let result = vec![PositionalToken {
3402            source: uws,
3403            offset: 0,
3404            length: 9,
3405            token: Token::Word(Word::Emoji("woman_bouncing_ball")),
3406        }];
3407        let lib_res = uws
3408            .into_tokenizer(TokenizerParams::v1())
3409            .collect::<Vec<_>>();
3410        check_results(&result, &lib_res, uws);
3411        }
3413
    #[test]
    fn emoji_and_rusabbr_default() {
        // Byte-level tokenization of emoji sequences mixed with a dotted
        // Cyrillic abbreviation. Expected lengths are UTF-8 byte counts:
        //  - flag emoji (two regional-indicator codepoints)   -> 8 bytes,
        //  - person emoji + skin-tone modifier                -> 8 bytes,
        //  - ZWJ family sequence (4 x 4-byte emoji + 3 ZWJ)   -> 25 bytes,
        //  - a single Cyrillic letter                         -> 2 bytes.
        // With these options "С.С.С.Р." is split into alternating
        // letter/dot tokens (contrast with emoji_and_rusabbr_no_split).
        // NOTE(review): the test name says "default" but it runs with
        // TokenizerParams::v1() while the no_split variant uses
        // Default::default() — confirm the naming is intentional.
        let uws = "🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n👱\nС.С.С.Р.\n👨👩👦👦\n🧠\n";
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 8,
                token: Token::Word(Word::Emoji("russia")),
            },
            PositionalToken {
                source: uws,
                offset: 8,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 9,
                length: 8,
                token: Token::Word(Word::Emoji("sao_tome_and_principe")),
            },
            PositionalToken {
                source: uws,
                offset: 17,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
            PositionalToken {
                source: uws,
                offset: 18,
                length: 8,
                token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
            },
            PositionalToken {
                source: uws,
                offset: 26,
                length: 8,
                token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
            },
            PositionalToken {
                source: uws,
                offset: 34,
                length: 8,
                token: Token::Word(Word::Emoji("man_medium_skin_tone")),
            },
            PositionalToken {
                source: uws,
                offset: 42,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
            // Unmodified base emoji: a single 4-byte codepoint.
            PositionalToken {
                source: uws,
                offset: 43,
                length: 4,
                token: Token::Word(Word::Emoji("blond_haired_person")),
            },
            PositionalToken {
                source: uws,
                offset: 47,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
            // "С.С.С.Р." decomposed: 2-byte letters alternating with dots.
            PositionalToken {
                source: uws,
                offset: 48,
                length: 2,
                token: Token::Word(Word::Word("С".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 50,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 51,
                length: 2,
                token: Token::Word(Word::Word("С".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 53,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 54,
                length: 2,
                token: Token::Word(Word::Word("С".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 56,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 57,
                length: 2,
                token: Token::Word(Word::Word("Р".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 59,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 60,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
            // ZWJ family sequence: one token covering all 25 bytes.
            PositionalToken {
                source: uws,
                offset: 61,
                length: 25,
                token: Token::Word(Word::Emoji("family_man_woman_boy_boy")),
            },
            PositionalToken {
                source: uws,
                offset: 86,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
            PositionalToken {
                source: uws,
                offset: 87,
                length: 4,
                token: Token::Word(Word::Emoji("brain")),
            },
            PositionalToken {
                source: uws,
                offset: 91,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
        ];

        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
        }
3564
    #[test]
    fn emoji_and_rusabbr_no_split() {
        // Same input as emoji_and_rusabbr_default, but tokenized with
        // Default::default() options: the emoji expectations are identical,
        // while the dotted abbreviation "С.С.С.Р" is kept as one 11-byte
        // word token (4 letters x 2 bytes + 3 inner dots) and only the
        // trailing dot is emitted as punctuation.
        let uws = "🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n👱\nС.С.С.Р.\n👨👩👦👦\n🧠\n";
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 8,
                token: Token::Word(Word::Emoji("russia")),
            },
            PositionalToken {
                source: uws,
                offset: 8,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 9,
                length: 8,
                token: Token::Word(Word::Emoji("sao_tome_and_principe")),
            },
            PositionalToken {
                source: uws,
                offset: 17,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
            PositionalToken {
                source: uws,
                offset: 18,
                length: 8,
                token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
            },
            PositionalToken {
                source: uws,
                offset: 26,
                length: 8,
                token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
            },
            PositionalToken {
                source: uws,
                offset: 34,
                length: 8,
                token: Token::Word(Word::Emoji("man_medium_skin_tone")),
            },
            PositionalToken {
                source: uws,
                offset: 42,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
            PositionalToken {
                source: uws,
                offset: 43,
                length: 4,
                token: Token::Word(Word::Emoji("blond_haired_person")),
            },
            PositionalToken {
                source: uws,
                offset: 47,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
            // Abbreviation kept whole: inner dots stay inside the word token.
            PositionalToken {
                source: uws,
                offset: 48,
                length: 11,
                token: Token::Word(Word::Word("С.С.С.Р".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 59,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 60,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
            PositionalToken {
                source: uws,
                offset: 61,
                length: 25,
                token: Token::Word(Word::Emoji("family_man_woman_boy_boy")),
            },
            PositionalToken {
                source: uws,
                offset: 86,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
            PositionalToken {
                source: uws,
                offset: 87,
                length: 4,
                token: Token::Word(Word::Emoji("brain")),
            },
            PositionalToken {
                source: uws,
                offset: 91,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
        ];

        let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
        }
3677
3678    #[test]
3902    fn html() {
3903        let uws = "<div class=\"article article_view \" id=\"article_view_-113039156_9551\" data-article-url=\"/@chaibuket-o-chem-ne-zabyt-25-noyabrya\" data-audio-context=\"article:-113039156_9551\"><h1  class=\"article_decoration_first article_decoration_last\" >День Мамы </h1><p  class=\"article_decoration_first article_decoration_last\" >День, когда поздравляют мам, бабушек, сестер и жён — это всемирный праздник, называемый «День Мамы». В настоящее время его отмечают почти в каждой стране, просто везде разные даты и способы празднования. </p><h3  class=\"article_decoration_first article_decoration_last\" ><span class='article_anchor_title'>\n  <span class='article_anchor_button' id='pochemu-my-ego-prazdnuem'></span>\n  <span class='article_anchor_fsymbol'>П</span>\n</span>ПОЧЕМУ МЫ ЕГО ПРАЗДНУЕМ</h3><p  class=\"article_decoration_first article_decoration_last article_decoration_before\" >В 1987 году комитет госдумы по делам женщин, семьи и молодежи выступил с предложением учредить «День мамы», а сам приказ был подписан уже 30 января 1988 года Борисом Ельциным. Было решено, что ежегодно в России празднество дня мамы будет выпадать на последнее воскресенье ноября. 
</p><figure data-type=\"101\" data-mode=\"\"  class=\"article_decoration_first article_decoration_last\" >\n  <div class=\"article_figure_content\" style=\"width: 1125px\">\n    <div class=\"article_figure_sizer_content\"><div class=\"article_object_sizer_wrap\" data-sizes=\"[{"s":["https://pp.userapi.com/c849128/v849128704/c0ffd/pcNJaBH3NDo.jpg",75,50],"m":["https://pp.userapi.com/c849128/v849128704/c0ffe/ozCLs2kHtRY.jpg",130,87],"x":["https://pp.userapi.com/c849128/v849128704/c0fff/E4KtTNDydzE.jpg",604,403],"y":["https://pp.userapi.com/c849128/v849128704/c1000/1nLxpYKavzU.jpg",807,538],"z":["https://pp.userapi.com/c849128/v849128704/c1001/IgEODe90yEk.jpg",1125,750],"o":["https://pp.userapi.com/c849128/v849128704/c1002/01faNwVZ2_E.jpg",130,87],"p":["https://pp.userapi.com/c849128/v849128704/c1003/baDFzbdRP2s.jpg",200,133],"q":["https://pp.userapi.com/c849128/v849128704/c1004/CY4khI6KJKA.jpg",320,213],"r":["https://pp.userapi.com/c849128/v849128704/c1005/NOvAJ6-VltY.jpg",510,340]}]\">\n  <img class=\"article_object_sizer_inner article_object_photo__image_blur\" src=\"https://pp.userapi.com/c849128/v849128704/c0ffd/pcNJaBH3NDo.jpg\" data-baseurl=\"\"/>\n  \n</div></div>\n    <div class=\"article_figure_sizer\" style=\"padding-bottom: 66.666666666667%\"></div>";
3904        let result = vec![
3905            PositionalToken {
3906                source: uws,
3907                offset: 236,
3908                length: 8,
3909                token: Token::Word(Word::Word("День".to_string())),
3910            },
3911            PositionalToken {
3912                source: uws,
3913                offset: 244,
3914                length: 1,
3915                token: Token::Special(Special::Separator(Separator::Space)),
3916            },
3917            PositionalToken {
3918                source: uws,
3919                offset: 245,
3920                length: 8,
3921                token: Token::Word(Word::Word("Мамы".to_string())),
3922            },
3923            PositionalToken {
3924                source: uws,
3925                offset: 253,
3926                length: 1,
3927                token: Token::Special(Special::Separator(Separator::Space)),
3928            },
3929            PositionalToken {
3930                source: uws,
3931                offset: 321,
3932                length: 8,
3933                token: Token::Word(Word::Word("День".to_string())),
3934            },
3935            PositionalToken {
3936                source: uws,
3937                offset: 329,
3938                length: 1,
3939                token: Token::Special(Special::Punctuation(',')),
3940            },
3941            PositionalToken {
3942                source: uws,
3943                offset: 330,
3944                length: 1,
3945                token: Token::Special(Special::Separator(Separator::Space)),
3946            },
3947            PositionalToken {
3948                source: uws,
3949                offset: 331,
3950                length: 10,
3951                token: Token::Word(Word::Word("когда".to_string())),
3952            },
3953            PositionalToken {
3954                source: uws,
3955                offset: 341,
3956                length: 1,
3957                token: Token::Special(Special::Separator(Separator::Space)),
3958            },
3959            PositionalToken {
3960                source: uws,
3961                offset: 342,
3962                length: 22,
3963                token: Token::Word(Word::Word("поздравляют".to_string())),
3964            },
3965            PositionalToken {
3966                source: uws,
3967                offset: 364,
3968                length: 1,
3969                token: Token::Special(Special::Separator(Separator::Space)),
3970            },
3971            PositionalToken {
3972                source: uws,
3973                offset: 365,
3974                length: 6,
3975                token: Token::Word(Word::Word("мам".to_string())),
3976            },
3977            PositionalToken {
3978                source: uws,
3979                offset: 371,
3980                length: 1,
3981                token: Token::Special(Special::Punctuation(',')),
3982            },
3983            PositionalToken {
3984                source: uws,
3985                offset: 372,
3986                length: 1,
3987                token: Token::Special(Special::Separator(Separator::Space)),
3988            },
3989            PositionalToken {
3990                source: uws,
3991                offset: 373,
3992                length: 14,
3993                token: Token::Word(Word::Word("бабушек".to_string())),
3994            },
3995            PositionalToken {
3996                source: uws,
3997                offset: 387,
3998                length: 1,
3999                token: Token::Special(Special::Punctuation(',')),
4000            },
4001            PositionalToken {
4002                source: uws,
4003                offset: 388,
4004                length: 1,
4005                token: Token::Special(Special::Separator(Separator::Space)),
4006            },
4007            PositionalToken {
4008                source: uws,
4009                offset: 389,
4010                length: 12,
4011                token: Token::Word(Word::Word("сестер".to_string())),
4012            },
4013            PositionalToken {
4014                source: uws,
4015                offset: 401,
4016                length: 1,
4017                token: Token::Special(Special::Separator(Separator::Space)),
4018            },
4019            PositionalToken {
4020                source: uws,
4021                offset: 402,
4022                length: 2,
4023                token: Token::Word(Word::Word("и".to_string())),
4024            },
4025            PositionalToken {
4026                source: uws,
4027                offset: 404,
4028                length: 1,
4029                token: Token::Special(Special::Separator(Separator::Space)),
4030            },
4031            PositionalToken {
4032                source: uws,
4033                offset: 405,
4034                length: 6,
4035                token: Token::Word(Word::Word("жён".to_string())),
4036            },
4037            PositionalToken {
4038                source: uws,
4039                offset: 411,
4040                length: 1,
4041                token: Token::Special(Special::Separator(Separator::Space)),
4042            },
4043            PositionalToken {
4044                source: uws,
4045                offset: 412,
4046                length: 3,
4047                token: Token::Special(Special::Punctuation('—')),
4048            },
4049            PositionalToken {
4050                source: uws,
4051                offset: 415,
4052                length: 1,
4053                token: Token::Special(Special::Separator(Separator::Space)),
4054            },
4055            PositionalToken {
4056                source: uws,
4057                offset: 416,
4058                length: 6,
4059                token: Token::Word(Word::Word("это".to_string())),
4060            },
4061            PositionalToken {
4062                source: uws,
4063                offset: 422,
4064                length: 1,
4065                token: Token::Special(Special::Separator(Separator::Space)),
4066            },
4067            PositionalToken {
4068                source: uws,
4069                offset: 423,
4070                length: 18,
4071                token: Token::Word(Word::Word("всемирный".to_string())),
4072            },
4073            PositionalToken {
4074                source: uws,
4075                offset: 441,
4076                length: 1,
4077                token: Token::Special(Special::Separator(Separator::Space)),
4078            },
4079            PositionalToken {
4080                source: uws,
4081                offset: 442,
4082                length: 16,
4083                token: Token::Word(Word::Word("праздник".to_string())),
4084            },
4085            PositionalToken {
4086                source: uws,
4087                offset: 458,
4088                length: 1,
4089                token: Token::Special(Special::Punctuation(',')),
4090            },
4091            PositionalToken {
4092                source: uws,
4093                offset: 459,
4094                length: 1,
4095                token: Token::Special(Special::Separator(Separator::Space)),
4096            },
4097            PositionalToken {
4098                source: uws,
4099                offset: 460,
4100                length: 20,
4101                token: Token::Word(Word::Word("называемый".to_string())),
4102            },
4103            PositionalToken {
4104                source: uws,
4105                offset: 480,
4106                length: 1,
4107                token: Token::Special(Special::Separator(Separator::Space)),
4108            },
4109            PositionalToken {
4110                source: uws,
4111                offset: 481,
4112                length: 2,
4113                token: Token::Special(Special::Punctuation('«')),
4114            },
4115            PositionalToken {
4116                source: uws,
4117                offset: 483,
4118                length: 8,
4119                token: Token::Word(Word::Word("День".to_string())),
4120            },
4121            PositionalToken {
4122                source: uws,
4123                offset: 491,
4124                length: 1,
4125                token: Token::Special(Special::Separator(Separator::Space)),
4126            },
4127            PositionalToken {
4128                source: uws,
4129                offset: 492,
4130                length: 8,
4131                token: Token::Word(Word::Word("Мамы".to_string())),
4132            },
4133            PositionalToken {
4134                source: uws,
4135                offset: 500,
4136                length: 2,
4137                token: Token::Special(Special::Punctuation('»')),
4138            },
4139            PositionalToken {
4140                source: uws,
4141                offset: 502,
4142                length: 1,
4143                token: Token::Special(Special::Punctuation('.')),
4144            },
4145            PositionalToken {
4146                source: uws,
4147                offset: 503,
4148                length: 1,
4149                token: Token::Special(Special::Separator(Separator::Space)),
4150            },
4151            PositionalToken {
4152                source: uws,
4153                offset: 504,
4154                length: 2,
4155                token: Token::Word(Word::Word("В".to_string())),
4156            },
4157            PositionalToken {
4158                source: uws,
4159                offset: 506,
4160                length: 1,
4161                token: Token::Special(Special::Separator(Separator::Space)),
4162            },
4163            PositionalToken {
4164                source: uws,
4165                offset: 507,
4166                length: 18,
4167                token: Token::Word(Word::Word("настоящее".to_string())),
4168            },
4169            PositionalToken {
4170                source: uws,
4171                offset: 525,
4172                length: 1,
4173                token: Token::Special(Special::Separator(Separator::Space)),
4174            },
4175            PositionalToken {
4176                source: uws,
4177                offset: 526,
4178                length: 10,
4179                token: Token::Word(Word::Word("время".to_string())),
4180            },
4181            PositionalToken {
4182                source: uws,
4183                offset: 536,
4184                length: 1,
4185                token: Token::Special(Special::Separator(Separator::Space)),
4186            },
4187            PositionalToken {
4188                source: uws,
4189                offset: 537,
4190                length: 6,
4191                token: Token::Word(Word::Word("его".to_string())),
4192            },
4193            PositionalToken {
4194                source: uws,
4195                offset: 543,
4196                length: 1,
4197                token: Token::Special(Special::Separator(Separator::Space)),
4198            },
4199            PositionalToken {
4200                source: uws,
4201                offset: 544,
4202                length: 16,
4203                token: Token::Word(Word::Word("отмечают".to_string())),
4204            },
4205            PositionalToken {
4206                source: uws,
4207                offset: 560,
4208                length: 1,
4209                token: Token::Special(Special::Separator(Separator::Space)),
4210            },
4211            PositionalToken {
4212                source: uws,
4213                offset: 561,
4214                length: 10,
4215                token: Token::Word(Word::Word("почти".to_string())),
4216            },
4217            PositionalToken {
4218                source: uws,
4219                offset: 571,
4220                length: 1,
4221                token: Token::Special(Special::Separator(Separator::Space)),
4222            },
4223            PositionalToken {
4224                source: uws,
4225                offset: 572,
4226                length: 2,
4227                token: Token::Word(Word::Word("в".to_string())),
4228            },
4229            PositionalToken {
4230                source: uws,
4231                offset: 574,
4232                length: 1,
4233                token: Token::Special(Special::Separator(Separator::Space)),
4234            },
4235            PositionalToken {
4236                source: uws,
4237                offset: 575,
4238                length: 12,
4239                token: Token::Word(Word::Word("каждой".to_string())),
4240            },
4241            PositionalToken {
4242                source: uws,
4243                offset: 587,
4244                length: 1,
4245                token: Token::Special(Special::Separator(Separator::Space)),
4246            },
4247            PositionalToken {
4248                source: uws,
4249                offset: 588,
4250                length: 12,
4251                token: Token::Word(Word::Word("стране".to_string())),
4252            },
4253            PositionalToken {
4254                source: uws,
4255                offset: 600,
4256                length: 1,
4257                token: Token::Special(Special::Punctuation(',')),
4258            },
4259            PositionalToken {
4260                source: uws,
4261                offset: 601,
4262                length: 1,
4263                token: Token::Special(Special::Separator(Separator::Space)),
4264            },
4265            PositionalToken {
4266                source: uws,
4267                offset: 602,
4268                length: 12,
4269                token: Token::Word(Word::Word("просто".to_string())),
4270            },
4271            PositionalToken {
4272                source: uws,
4273                offset: 614,
4274                length: 1,
4275                token: Token::Special(Special::Separator(Separator::Space)),
4276            },
4277            PositionalToken {
4278                source: uws,
4279                offset: 615,
4280                length: 10,
4281                token: Token::Word(Word::Word("везде".to_string())),
4282            },
4283            PositionalToken {
4284                source: uws,
4285                offset: 625,
4286                length: 1,
4287                token: Token::Special(Special::Separator(Separator::Space)),
4288            },
4289            PositionalToken {
4290                source: uws,
4291                offset: 626,
4292                length: 12,
4293                token: Token::Word(Word::Word("разные".to_string())),
4294            },
4295            PositionalToken {
4296                source: uws,
4297                offset: 638,
4298                length: 1,
4299                token: Token::Special(Special::Separator(Separator::Space)),
4300            },
4301            PositionalToken {
4302                source: uws,
4303                offset: 639,
4304                length: 8,
4305                token: Token::Word(Word::Word("даты".to_string())),
4306            },
4307            PositionalToken {
4308                source: uws,
4309                offset: 647,
4310                length: 1,
4311                token: Token::Special(Special::Separator(Separator::Space)),
4312            },
4313            PositionalToken {
4314                source: uws,
4315                offset: 648,
4316                length: 2,
4317                token: Token::Word(Word::Word("и".to_string())),
4318            },
4319            PositionalToken {
4320                source: uws,
4321                offset: 650,
4322                length: 1,
4323                token: Token::Special(Special::Separator(Separator::Space)),
4324            },
4325            PositionalToken {
4326                source: uws,
4327                offset: 651,
4328                length: 14,
4329                token: Token::Word(Word::Word("способы".to_string())),
4330            },
4331            PositionalToken {
4332                source: uws,
4333                offset: 665,
4334                length: 1,
4335                token: Token::Special(Special::Separator(Separator::Space)),
4336            },
4337            PositionalToken {
4338                source: uws,
4339                offset: 666,
4340                length: 24,
4341                token: Token::Word(Word::Word("празднования".to_string())),
4342            },
4343            PositionalToken {
4344                source: uws,
4345                offset: 690,
4346                length: 1,
4347                token: Token::Special(Special::Punctuation('.')),
4348            },
4349            PositionalToken {
4350                source: uws,
4351                offset: 691,
4352                length: 1,
4353                token: Token::Special(Special::Separator(Separator::Space)),
4354            },
4355            PositionalToken {
4356                source: uws,
4357                offset: 794,
4358                length: 1,
4359                token: Token::Special(Special::Separator(Separator::Newline)),
4360            },
4361            PositionalToken {
4362                source: uws,
4363                offset: 795,
4364                length: 2,
4365                token: Token::Special(Special::Separator(Separator::Space)),
4366            },
4367            PositionalToken {
4368                source: uws,
4369                offset: 870,
4370                length: 1,
4371                token: Token::Special(Special::Separator(Separator::Newline)),
4372            },
4373            PositionalToken {
4374                source: uws,
4375                offset: 871,
4376                length: 2,
4377                token: Token::Special(Special::Separator(Separator::Space)),
4378            },
4379            PositionalToken {
4380                source: uws,
4381                offset: 910,
4382                length: 2,
4383                token: Token::Word(Word::Word("П".to_string())),
4384            },
4385            PositionalToken {
4386                source: uws,
4387                offset: 919,
4388                length: 1,
4389                token: Token::Special(Special::Separator(Separator::Newline)),
4390            },
4391            PositionalToken {
4392                source: uws,
4393                offset: 927,
4394                length: 12,
4395                token: Token::Word(Word::Word("ПОЧЕМУ".to_string())),
4396            },
4397            PositionalToken {
4398                source: uws,
4399                offset: 939,
4400                length: 1,
4401                token: Token::Special(Special::Separator(Separator::Space)),
4402            },
4403            PositionalToken {
4404                source: uws,
4405                offset: 940,
4406                length: 4,
4407                token: Token::Word(Word::Word("МЫ".to_string())),
4408            },
4409            PositionalToken {
4410                source: uws,
4411                offset: 944,
4412                length: 1,
4413                token: Token::Special(Special::Separator(Separator::Space)),
4414            },
4415            PositionalToken {
4416                source: uws,
4417                offset: 945,
4418                length: 6,
4419                token: Token::Word(Word::Word("ЕГО".to_string())),
4420            },
4421            PositionalToken {
4422                source: uws,
4423                offset: 951,
4424                length: 1,
4425                token: Token::Special(Special::Separator(Separator::Space)),
4426            },
4427            PositionalToken {
4428                source: uws,
4429                offset: 952,
4430                length: 18,
4431                token: Token::Word(Word::Word("ПРАЗДНУЕМ".to_string())),
4432            },
4433            PositionalToken {
4434                source: uws,
4435                offset: 1063,
4436                length: 2,
4437                token: Token::Word(Word::Word("В".to_string())),
4438            },
4439            PositionalToken {
4440                source: uws,
4441                offset: 1065,
4442                length: 1,
4443                token: Token::Special(Special::Separator(Separator::Space)),
4444            },
4445            PositionalToken {
4446                source: uws,
4447                offset: 1066,
4448                length: 4,
4449                token: Token::Word(Word::Number(Number::Integer(1987))),
4450            },
4451            PositionalToken {
4452                source: uws,
4453                offset: 1070,
4454                length: 1,
4455                token: Token::Special(Special::Separator(Separator::Space)),
4456            },
4457            PositionalToken {
4458                source: uws,
4459                offset: 1071,
4460                length: 8,
4461                token: Token::Word(Word::Word("году".to_string())),
4462            },
4463            PositionalToken {
4464                source: uws,
4465                offset: 1079,
4466                length: 1,
4467                token: Token::Special(Special::Separator(Separator::Space)),
4468            },
4469            PositionalToken {
4470                source: uws,
4471                offset: 1080,
4472                length: 14,
4473                token: Token::Word(Word::Word("комитет".to_string())),
4474            },
4475            PositionalToken {
4476                source: uws,
4477                offset: 1094,
4478                length: 1,
4479                token: Token::Special(Special::Separator(Separator::Space)),
4480            },
4481            PositionalToken {
4482                source: uws,
4483                offset: 1095,
4484                length: 14,
4485                token: Token::Word(Word::Word("госдумы".to_string())),
4486            },
4487            PositionalToken {
4488                source: uws,
4489                offset: 1109,
4490                length: 1,
4491                token: Token::Special(Special::Separator(Separator::Space)),
4492            },
4493            PositionalToken {
4494                source: uws,
4495                offset: 1110,
4496                length: 4,
4497                token: Token::Word(Word::Word("по".to_string())),
4498            },
4499            PositionalToken {
4500                source: uws,
4501                offset: 1114,
4502                length: 1,
4503                token: Token::Special(Special::Separator(Separator::Space)),
4504            },
4505            PositionalToken {
4506                source: uws,
4507                offset: 1115,
4508                length: 10,
4509                token: Token::Word(Word::Word("делам".to_string())),
4510            },
4511            PositionalToken {
4512                source: uws,
4513                offset: 1125,
4514                length: 1,
4515                token: Token::Special(Special::Separator(Separator::Space)),
4516            },
4517            PositionalToken {
4518                source: uws,
4519                offset: 1126,
4520                length: 12,
4521                token: Token::Word(Word::Word("женщин".to_string())),
4522            },
4523            PositionalToken {
4524                source: uws,
4525                offset: 1138,
4526                length: 1,
4527                token: Token::Special(Special::Punctuation(',')),
4528            },
4529            PositionalToken {
4530                source: uws,
4531                offset: 1139,
4532                length: 1,
4533                token: Token::Special(Special::Separator(Separator::Space)),
4534            },
4535            PositionalToken {
4536                source: uws,
4537                offset: 1140,
4538                length: 10,
4539                token: Token::Word(Word::Word("семьи".to_string())),
4540            },
4541            PositionalToken {
4542                source: uws,
4543                offset: 1150,
4544                length: 1,
4545                token: Token::Special(Special::Separator(Separator::Space)),
4546            },
4547            PositionalToken {
4548                source: uws,
4549                offset: 1151,
4550                length: 2,
4551                token: Token::Word(Word::Word("и".to_string())),
4552            },
4553            PositionalToken {
4554                source: uws,
4555                offset: 1153,
4556                length: 1,
4557                token: Token::Special(Special::Separator(Separator::Space)),
4558            },
4559            PositionalToken {
4560                source: uws,
4561                offset: 1154,
4562                length: 16,
4563                token: Token::Word(Word::Word("молодежи".to_string())),
4564            },
4565            PositionalToken {
4566                source: uws,
4567                offset: 1170,
4568                length: 1,
4569                token: Token::Special(Special::Separator(Separator::Space)),
4570            },
4571            PositionalToken {
4572                source: uws,
4573                offset: 1171,
4574                length: 16,
4575                token: Token::Word(Word::Word("выступил".to_string())),
4576            },
4577            PositionalToken {
4578                source: uws,
4579                offset: 1187,
4580                length: 1,
4581                token: Token::Special(Special::Separator(Separator::Space)),
4582            },
4583            PositionalToken {
4584                source: uws,
4585                offset: 1188,
4586                length: 2,
4587                token: Token::Word(Word::Word("с".to_string())),
4588            },
4589            PositionalToken {
4590                source: uws,
4591                offset: 1190,
4592                length: 1,
4593                token: Token::Special(Special::Separator(Separator::Space)),
4594            },
4595            PositionalToken {
4596                source: uws,
4597                offset: 1191,
4598                length: 24,
4599                token: Token::Word(Word::Word("предложением".to_string())),
4600            },
4601            PositionalToken {
4602                source: uws,
4603                offset: 1215,
4604                length: 1,
4605                token: Token::Special(Special::Separator(Separator::Space)),
4606            },
4607            PositionalToken {
4608                source: uws,
4609                offset: 1216,
4610                length: 16,
4611                token: Token::Word(Word::Word("учредить".to_string())),
4612            },
4613            PositionalToken {
4614                source: uws,
4615                offset: 1232,
4616                length: 1,
4617                token: Token::Special(Special::Separator(Separator::Space)),
4618            },
4619            PositionalToken {
4620                source: uws,
4621                offset: 1233,
4622                length: 2,
4623                token: Token::Special(Special::Punctuation('«')),
4624            },
4625            PositionalToken {
4626                source: uws,
4627                offset: 1235,
4628                length: 8,
4629                token: Token::Word(Word::Word("День".to_string())),
4630            },
4631            PositionalToken {
4632                source: uws,
4633                offset: 1243,
4634                length: 1,
4635                token: Token::Special(Special::Separator(Separator::Space)),
4636            },
4637            PositionalToken {
4638                source: uws,
4639                offset: 1244,
4640                length: 8,
4641                token: Token::Word(Word::Word("мамы".to_string())),
4642            },
4643            PositionalToken {
4644                source: uws,
4645                offset: 1252,
4646                length: 2,
4647                token: Token::Special(Special::Punctuation('»')),
4648            },
4649            PositionalToken {
4650                source: uws,
4651                offset: 1254,
4652                length: 1,
4653                token: Token::Special(Special::Punctuation(',')),
4654            },
4655            PositionalToken {
4656                source: uws,
4657                offset: 1255,
4658                length: 1,
4659                token: Token::Special(Special::Separator(Separator::Space)),
4660            },
4661            PositionalToken {
4662                source: uws,
4663                offset: 1256,
4664                length: 2,
4665                token: Token::Word(Word::Word("а".to_string())),
4666            },
4667            PositionalToken {
4668                source: uws,
4669                offset: 1258,
4670                length: 1,
4671                token: Token::Special(Special::Separator(Separator::Space)),
4672            },
4673            PositionalToken {
4674                source: uws,
4675                offset: 1259,
4676                length: 6,
4677                token: Token::Word(Word::Word("сам".to_string())),
4678            },
4679            PositionalToken {
4680                source: uws,
4681                offset: 1265,
4682                length: 1,
4683                token: Token::Special(Special::Separator(Separator::Space)),
4684            },
4685            PositionalToken {
4686                source: uws,
4687                offset: 1266,
4688                length: 12,
4689                token: Token::Word(Word::Word("приказ".to_string())),
4690            },
4691            PositionalToken {
4692                source: uws,
4693                offset: 1278,
4694                length: 1,
4695                token: Token::Special(Special::Separator(Separator::Space)),
4696            },
4697            PositionalToken {
4698                source: uws,
4699                offset: 1279,
4700                length: 6,
4701                token: Token::Word(Word::Word("был".to_string())),
4702            },
4703            PositionalToken {
4704                source: uws,
4705                offset: 1285,
4706                length: 1,
4707                token: Token::Special(Special::Separator(Separator::Space)),
4708            },
4709            PositionalToken {
4710                source: uws,
4711                offset: 1286,
4712                length: 16,
4713                token: Token::Word(Word::Word("подписан".to_string())),
4714            },
4715            PositionalToken {
4716                source: uws,
4717                offset: 1302,
4718                length: 1,
4719                token: Token::Special(Special::Separator(Separator::Space)),
4720            },
4721            PositionalToken {
4722                source: uws,
4723                offset: 1303,
4724                length: 6,
4725                token: Token::Word(Word::Word("уже".to_string())),
4726            },
4727            PositionalToken {
4728                source: uws,
4729                offset: 1309,
4730                length: 1,
4731                token: Token::Special(Special::Separator(Separator::Space)),
4732            },
4733            PositionalToken {
4734                source: uws,
4735                offset: 1310,
4736                length: 2,
4737                token: Token::Word(Word::Number(Number::Integer(30))),
4738            },
4739            PositionalToken {
4740                source: uws,
4741                offset: 1312,
4742                length: 1,
4743                token: Token::Special(Special::Separator(Separator::Space)),
4744            },
4745            PositionalToken {
4746                source: uws,
4747                offset: 1313,
4748                length: 12,
4749                token: Token::Word(Word::Word("января".to_string())),
4750            },
4751            PositionalToken {
4752                source: uws,
4753                offset: 1325,
4754                length: 1,
4755                token: Token::Special(Special::Separator(Separator::Space)),
4756            },
4757            PositionalToken {
4758                source: uws,
4759                offset: 1326,
4760                length: 4,
4761                token: Token::Word(Word::Number(Number::Integer(1988))),
4762            },
4763            PositionalToken {
4764                source: uws,
4765                offset: 1330,
4766                length: 1,
4767                token: Token::Special(Special::Separator(Separator::Space)),
4768            },
4769            PositionalToken {
4770                source: uws,
4771                offset: 1331,
4772                length: 8,
4773                token: Token::Word(Word::Word("года".to_string())),
4774            },
4775            PositionalToken {
4776                source: uws,
4777                offset: 1339,
4778                length: 1,
4779                token: Token::Special(Special::Separator(Separator::Space)),
4780            },
4781            PositionalToken {
4782                source: uws,
4783                offset: 1340,
4784                length: 14,
4785                token: Token::Word(Word::Word("Борисом".to_string())),
4786            },
4787            PositionalToken {
4788                source: uws,
4789                offset: 1354,
4790                length: 1,
4791                token: Token::Special(Special::Separator(Separator::Space)),
4792            },
4793            PositionalToken {
4794                source: uws,
4795                offset: 1355,
4796                length: 16,
4797                token: Token::Word(Word::Word("Ельциным".to_string())),
4798            },
4799            PositionalToken {
4800                source: uws,
4801                offset: 1371,
4802                length: 1,
4803                token: Token::Special(Special::Punctuation('.')),
4804            },
4805            PositionalToken {
4806                source: uws,
4807                offset: 1372,
4808                length: 1,
4809                token: Token::Special(Special::Separator(Separator::Space)),
4810            },
4811            PositionalToken {
4812                source: uws,
4813                offset: 1373,
4814                length: 8,
4815                token: Token::Word(Word::Word("Было".to_string())),
4816            },
4817            PositionalToken {
4818                source: uws,
4819                offset: 1381,
4820                length: 1,
4821                token: Token::Special(Special::Separator(Separator::Space)),
4822            },
4823            PositionalToken {
4824                source: uws,
4825                offset: 1382,
4826                length: 12,
4827                token: Token::Word(Word::Word("решено".to_string())),
4828            },
4829            PositionalToken {
4830                source: uws,
4831                offset: 1394,
4832                length: 1,
4833                token: Token::Special(Special::Punctuation(',')),
4834            },
4835            PositionalToken {
4836                source: uws,
4837                offset: 1395,
4838                length: 1,
4839                token: Token::Special(Special::Separator(Separator::Space)),
4840            },
4841            PositionalToken {
4842                source: uws,
4843                offset: 1396,
4844                length: 6,
4845                token: Token::Word(Word::Word("что".to_string())),
4846            },
4847            PositionalToken {
4848                source: uws,
4849                offset: 1402,
4850                length: 1,
4851                token: Token::Special(Special::Separator(Separator::Space)),
4852            },
4853            PositionalToken {
4854                source: uws,
4855                offset: 1403,
4856                length: 16,
4857                token: Token::Word(Word::Word("ежегодно".to_string())),
4858            },
4859            PositionalToken {
4860                source: uws,
4861                offset: 1419,
4862                length: 1,
4863                token: Token::Special(Special::Separator(Separator::Space)),
4864            },
4865            PositionalToken {
4866                source: uws,
4867                offset: 1420,
4868                length: 2,
4869                token: Token::Word(Word::Word("в".to_string())),
4870            },
4871            PositionalToken {
4872                source: uws,
4873                offset: 1422,
4874                length: 1,
4875                token: Token::Special(Special::Separator(Separator::Space)),
4876            },
4877            PositionalToken {
4878                source: uws,
4879                offset: 1423,
4880                length: 12,
4881                token: Token::Word(Word::Word("России".to_string())),
4882            },
4883            PositionalToken {
4884                source: uws,
4885                offset: 1435,
4886                length: 1,
4887                token: Token::Special(Special::Separator(Separator::Space)),
4888            },
4889            PositionalToken {
4890                source: uws,
4891                offset: 1436,
4892                length: 22,
4893                token: Token::Word(Word::Word("празднество".to_string())),
4894            },
4895            PositionalToken {
4896                source: uws,
4897                offset: 1458,
4898                length: 1,
4899                token: Token::Special(Special::Separator(Separator::Space)),
4900            },
4901            PositionalToken {
4902                source: uws,
4903                offset: 1459,
4904                length: 6,
4905                token: Token::Word(Word::Word("дня".to_string())),
4906            },
4907            PositionalToken {
4908                source: uws,
4909                offset: 1465,
4910                length: 1,
4911                token: Token::Special(Special::Separator(Separator::Space)),
4912            },
4913            PositionalToken {
4914                source: uws,
4915                offset: 1466,
4916                length: 8,
4917                token: Token::Word(Word::Word("мамы".to_string())),
4918            },
4919            PositionalToken {
4920                source: uws,
4921                offset: 1474,
4922                length: 1,
4923                token: Token::Special(Special::Separator(Separator::Space)),
4924            },
4925            PositionalToken {
4926                source: uws,
4927                offset: 1475,
4928                length: 10,
4929                token: Token::Word(Word::Word("будет".to_string())),
4930            },
4931            PositionalToken {
4932                source: uws,
4933                offset: 1485,
4934                length: 1,
4935                token: Token::Special(Special::Separator(Separator::Space)),
4936            },
4937            PositionalToken {
4938                source: uws,
4939                offset: 1486,
4940                length: 16,
4941                token: Token::Word(Word::Word("выпадать".to_string())),
4942            },
4943            PositionalToken {
4944                source: uws,
4945                offset: 1502,
4946                length: 1,
4947                token: Token::Special(Special::Separator(Separator::Space)),
4948            },
4949            PositionalToken {
4950                source: uws,
4951                offset: 1503,
4952                length: 4,
4953                token: Token::Word(Word::Word("на".to_string())),
4954            },
4955            PositionalToken {
4956                source: uws,
4957                offset: 1507,
4958                length: 1,
4959                token: Token::Special(Special::Separator(Separator::Space)),
4960            },
4961            PositionalToken {
4962                source: uws,
4963                offset: 1508,
4964                length: 18,
4965                token: Token::Word(Word::Word("последнее".to_string())),
4966            },
4967            PositionalToken {
4968                source: uws,
4969                offset: 1526,
4970                length: 1,
4971                token: Token::Special(Special::Separator(Separator::Space)),
4972            },
4973            PositionalToken {
4974                source: uws,
4975                offset: 1527,
4976                length: 22,
4977                token: Token::Word(Word::Word("воскресенье".to_string())),
4978            },
4979            PositionalToken {
4980                source: uws,
4981                offset: 1549,
4982                length: 1,
4983                token: Token::Special(Special::Separator(Separator::Space)),
4984            },
4985            PositionalToken {
4986                source: uws,
4987                offset: 1550,
4988                length: 12,
4989                token: Token::Word(Word::Word("ноября".to_string())),
4990            },
4991            PositionalToken {
4992                source: uws,
4993                offset: 1562,
4994                length: 1,
4995                token: Token::Special(Special::Punctuation('.')),
4996            },
4997            PositionalToken {
4998                source: uws,
4999                offset: 1563,
5000                length: 1,
5001                token: Token::Special(Special::Separator(Separator::Space)),
5002            },
5003            PositionalToken {
5004                source: uws,
5005                offset: 1664,
5006                length: 1,
5007                token: Token::Special(Special::Separator(Separator::Newline)),
5008            },
5009            PositionalToken {
5010                source: uws,
5011                offset: 1665,
5012                length: 2,
5013                token: Token::Special(Special::Separator(Separator::Space)),
5014            },
5015            PositionalToken {
5016                source: uws,
5017                offset: 1725,
5018                length: 1,
5019                token: Token::Special(Special::Separator(Separator::Newline)),
5020            },
5021            PositionalToken {
5022                source: uws,
5023                offset: 1726,
5024                length: 4,
5025                token: Token::Special(Special::Separator(Separator::Space)),
5026            },
5027            PositionalToken {
5028                source: uws,
5029                offset: 2725,
5030                length: 1,
5031                token: Token::Special(Special::Separator(Separator::Newline)),
5032            },
5033            PositionalToken {
5034                source: uws,
5035                offset: 2726,
5036                length: 2,
5037                token: Token::Special(Special::Separator(Separator::Space)),
5038            },
5039            PositionalToken {
5040                source: uws,
5041                offset: 2888,
5042                length: 1,
5043                token: Token::Special(Special::Separator(Separator::Newline)),
5044            },
5045            PositionalToken {
5046                source: uws,
5047                offset: 2889,
5048                length: 2,
5049                token: Token::Special(Special::Separator(Separator::Space)),
5050            },
5051            PositionalToken {
5052                source: uws,
5053                offset: 2891,
5054                length: 1,
5055                token: Token::Special(Special::Separator(Separator::Newline)),
5056            },
5057            PositionalToken {
5058                source: uws,
5059                offset: 2904,
5060                length: 1,
5061                token: Token::Special(Special::Separator(Separator::Newline)),
5062            },
5063            PositionalToken {
5064                source: uws,
5065                offset: 2905,
5066                length: 4,
5067                token: Token::Special(Special::Separator(Separator::Space)),
5068            },
5069        ];
5070
5071        let text = Text::new({
5072            uws.into_source()
5073                .pipe(tagger::Builder::new().create().into_breaker())
5074                .pipe(entities::Builder::new().create().into_piped())
5075                .into_separator()
5076        })
5077        .unwrap();
5078
5079        let lib_res = text
5080            .into_tokenizer(TokenizerParams::v1())
5081            .filter_map(|tt| tt.into_original_token_1())
5082            .collect::<Vec<_>>();
5083
5084        check_results(&result, &lib_res, uws);
5085    }
5086
    /// With `Default` tokenizer options, compound numericals are kept intact:
    /// dotted sequences ("12.02.18", "127.0.0.1") come out as single
    /// `Numerical::DotSeparated` tokens, digit+letter runs as
    /// `Numerical::Measures`/`Numerical::Alphanumeric` — nothing is split at
    /// '.' or '_' (contrast with `numerical_default`, which uses v1 params).
    ///
    /// NOTE: `offset`/`length` are byte positions, so each Cyrillic char
    /// counts as 2 bytes (e.g. "1кг" has length 5, not 3).
    #[test]
    fn numerical_no_split() {
        let uws = "12.02.18 31.28.34 23.11.2018 123.568.365.234.578 127.0.0.1 1st 1кг 123123афываыв 12321фвафыов234выалфо 12_123_343.4234_4234";
        let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 8,
                token: Token::Word(Word::Numerical(Numerical::DotSeparated(
                    "12.02.18".to_string(),
                ))),
            },
            PositionalToken {
                source: uws,
                offset: 8,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 9,
                length: 8,
                token: Token::Word(Word::Numerical(Numerical::DotSeparated(
                    "31.28.34".to_string(),
                ))),
            },
            PositionalToken {
                source: uws,
                offset: 17,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 18,
                length: 10,
                token: Token::Word(Word::Numerical(Numerical::DotSeparated(
                    "23.11.2018".to_string(),
                ))),
            },
            PositionalToken {
                source: uws,
                offset: 28,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // Dot-separated groups of exactly three digits are treated as
            // thousands separators and collapse into one integer.
            PositionalToken {
                source: uws,
                offset: 29,
                length: 19,
                token: Token::Word(Word::Number(Number::Integer(123568365234578))),
            },
            PositionalToken {
                source: uws,
                offset: 48,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 49,
                length: 9,
                token: Token::Word(Word::Numerical(Numerical::DotSeparated(
                    "127.0.0.1".to_string(),
                ))),
            },
            PositionalToken {
                source: uws,
                offset: 58,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 59,
                length: 3,
                token: Token::Word(Word::Numerical(Numerical::Measures("1st".to_string()))),
            },
            PositionalToken {
                source: uws,
                offset: 62,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 63,
                length: 5,
                token: Token::Word(Word::Numerical(Numerical::Measures("1кг".to_string()))),
            },
            PositionalToken {
                source: uws,
                offset: 68,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 69,
                length: 20,
                token: Token::Word(Word::Numerical(Numerical::Measures(
                    "123123афываыв".to_string(),
                ))),
            },
            PositionalToken {
                source: uws,
                offset: 89,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // digits-letters-digits-letters mix => Alphanumeric, not Measures
            PositionalToken {
                source: uws,
                offset: 90,
                length: 34,
                token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
                    "12321фвафыов234выалфо".to_string(),
                ))),
            },
            PositionalToken {
                source: uws,
                offset: 124,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 125,
                length: 20,
                token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
                    "12_123_343.4234_4234".to_string(),
                ))),
            },
        ];
        check_results(&result, &lib_res, uws);
    }
5275
    /// Same input string as `numerical_no_split`, but tokenized with
    /// `TokenizerParams::v1()`: compound numericals are split at '.'/'_' into
    /// individual `Number` tokens separated by `Punctuation` tokens.
    /// Exceptions visible in the fixture: three-digit dot groups
    /// ("123.568.365.234.578") still merge into one integer, and pure
    /// digit+letter runs ("1st", "1кг", …) stay single `Numerical` tokens.
    /// Components with a leading zero become `Number::ZeroInteger`, keeping
    /// the original text in `s`.
    ///
    /// NOTE(review): the `s` field of `ZeroInteger` only exists with the
    /// "strings" feature (see the `Number` definition), so this test
    /// presumably compiles with that feature enabled.
    /// Offsets/lengths are in bytes (Cyrillic chars count as 2 bytes each).
    #[test]
    fn numerical_default() {
        let uws = "12.02.18 31.28.34 23.11.2018 123.568.365.234.578 127.0.0.1 1st 1кг 123123афываыв 12321фвафыов234выалфо 12_123_343.4234_4234";
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 2,
                token: Token::Word(Word::Number(Number::Integer(12))),
            },
            PositionalToken {
                source: uws,
                offset: 2,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            // "02" has a leading zero => ZeroInteger preserving the raw text
            PositionalToken {
                source: uws,
                offset: 3,
                length: 2,
                token: Token::Word(Word::Number(Number::ZeroInteger {
                    i: 2,
                    s: "02".to_string(),
                })),
            },
            PositionalToken {
                source: uws,
                offset: 5,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 6,
                length: 2,
                token: Token::Word(Word::Number(Number::Integer(18))),
            },
            PositionalToken {
                source: uws,
                offset: 8,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 9,
                length: 2,
                token: Token::Word(Word::Number(Number::Integer(31))),
            },
            PositionalToken {
                source: uws,
                offset: 11,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 12,
                length: 2,
                token: Token::Word(Word::Number(Number::Integer(28))),
            },
            PositionalToken {
                source: uws,
                offset: 14,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 15,
                length: 2,
                token: Token::Word(Word::Number(Number::Integer(34))),
            },
            PositionalToken {
                source: uws,
                offset: 17,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 18,
                length: 2,
                token: Token::Word(Word::Number(Number::Integer(23))),
            },
            PositionalToken {
                source: uws,
                offset: 20,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 21,
                length: 2,
                token: Token::Word(Word::Number(Number::Integer(11))),
            },
            PositionalToken {
                source: uws,
                offset: 23,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 24,
                length: 4,
                token: Token::Word(Word::Number(Number::Integer(2018))),
            },
            PositionalToken {
                source: uws,
                offset: 28,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // Three-digit dot groups still collapse into a single integer
            // (thousands separators), even under v1 params.
            PositionalToken {
                source: uws,
                offset: 29,
                length: 19,
                token: Token::Word(Word::Number(Number::Integer(123568365234578))),
            },
            PositionalToken {
                source: uws,
                offset: 48,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 49,
                length: 3,
                token: Token::Word(Word::Number(Number::Integer(127))),
            },
            PositionalToken {
                source: uws,
                offset: 52,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 53,
                length: 1,
                token: Token::Word(Word::Number(Number::ZeroInteger {
                    i: 0,
                    s: "0".to_string(),
                })),
            },
            PositionalToken {
                source: uws,
                offset: 54,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 55,
                length: 1,
                token: Token::Word(Word::Number(Number::ZeroInteger {
                    i: 0,
                    s: "0".to_string(),
                })),
            },
            PositionalToken {
                source: uws,
                offset: 56,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 57,
                length: 1,
                token: Token::Word(Word::Number(Number::Integer(1))),
            },
            PositionalToken {
                source: uws,
                offset: 58,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // digit+letter runs are NOT split — same as with default options
            PositionalToken {
                source: uws,
                offset: 59,
                length: 3,
                token: Token::Word(Word::Numerical(Numerical::Measures("1st".to_string()))),
            },
            PositionalToken {
                source: uws,
                offset: 62,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 63,
                length: 5,
                token: Token::Word(Word::Numerical(Numerical::Measures("1кг".to_string()))),
            },
            PositionalToken {
                source: uws,
                offset: 68,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 69,
                length: 20,
                token: Token::Word(Word::Numerical(Numerical::Measures(
                    "123123афываыв".to_string(),
                ))),
            },
            PositionalToken {
                source: uws,
                offset: 89,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 90,
                length: 34,
                token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
                    "12321фвафыов234выалфо".to_string(),
                ))),
            },
            PositionalToken {
                source: uws,
                offset: 124,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // "12_123_343.4234_4234" is split at both '_' and '.' under v1
            PositionalToken {
                source: uws,
                offset: 125,
                length: 2,
                token: Token::Word(Word::Number(Number::Integer(12))),
            },
            PositionalToken {
                source: uws,
                offset: 127,
                length: 1,
                token: Token::Special(Special::Punctuation('_')),
            },
            PositionalToken {
                source: uws,
                offset: 128,
                length: 3,
                token: Token::Word(Word::Number(Number::Integer(123))),
            },
            PositionalToken {
                source: uws,
                offset: 131,
                length: 1,
                token: Token::Special(Special::Punctuation('_')),
            },
            PositionalToken {
                source: uws,
                offset: 132,
                length: 3,
                token: Token::Word(Word::Number(Number::Integer(343))),
            },
            PositionalToken {
                source: uws,
                offset: 135,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 136,
                length: 4,
                token: Token::Word(Word::Number(Number::Integer(4234))),
            },
            PositionalToken {
                source: uws,
                offset: 140,
                length: 1,
                token: Token::Special(Special::Punctuation('_')),
            },
            PositionalToken {
                source: uws,
                offset: 141,
                length: 4,
                token: Token::Word(Word::Number(Number::Integer(4234))),
            },
        ];
        check_results(&result, &lib_res, uws);
    }
5626
    /// Fixture selector for the per-script tokenization tests below
    /// (`test_lang_*`); variant names follow ISO 639-3 language codes.
    enum Lang {
        Zho, // Chinese sample
        Jpn, // Japanese sample
        Kor, // Korean sample
        Ara, // Arabic-script sample (text appears Persian — NOTE(review): confirm)
        Ell, // Greek sample (fixture text lies beyond this view)
    }
5646
5647    #[test]
5648    fn test_lang_zho() {
5649        let (uws, result) = get_lang_test(Lang::Zho);
5650        let lib_res = uws
5651            .into_tokenizer(TokenizerParams::v1())
5652            .collect::<Vec<_>>();
5653        check_results(&result, &lib_res, &uws);
5654    }
5655
5656    #[test]
5657    fn test_lang_jpn() {
5658        let (uws, result) = get_lang_test(Lang::Jpn);
5659        let lib_res = uws
5660            .into_tokenizer(TokenizerParams::v1())
5661            .collect::<Vec<_>>();
5662        check_results(&result, &lib_res, &uws);
5663    }
5664
5665    #[test]
5666    fn test_lang_kor() {
5667        let (uws, result) = get_lang_test(Lang::Kor);
5668        let lib_res = uws
5669            .into_tokenizer(TokenizerParams::v1())
5670            .collect::<Vec<_>>();
5671        check_results(&result, &lib_res, &uws);
5672    }
5673
5674    #[test]
5675    fn test_lang_ara() {
5676        let (uws, result) = get_lang_test(Lang::Ara);
5677        let lib_res = uws
5678            .into_tokenizer(TokenizerParams::v1())
5679            .collect::<Vec<_>>();
5680        check_results(&result, &lib_res, &uws);
5681    }
5682
5683    #[test]
5684    fn test_lang_ell() {
5685        let (uws, result) = get_lang_test(Lang::Ell);
5686        let lib_res = uws
5687            .into_tokenizer(TokenizerParams::v1())
5688            .collect::<Vec<_>>();
5689        check_results(&result, &lib_res, &uws);
5690    }
5691
5692    fn get_lang_test(lng: Lang) -> (String, Vec<PositionalToken>) {
5693        let uws = match lng {
5694            Lang::Zho => {
5695                "美国电视连续剧《超人前传》的第一集《试播集》于2001年10月16日在電視網首播,剧集主创人阿尔弗雷德·高夫和迈尔斯·米勒編劇,大卫·努特尔执导。这一试播首次向观众引荐了克拉克·肯特一角,他是位拥有超能力的外星孤儿,与家人和朋友一起在堪薩斯州虚构小镇斯莫维尔生活。在这一集里,肯特首度得知自己的来历,同时还需要阻止一位学生试图杀死镇上高中多名学生的报复之举。本集节目里引入了多个之后将贯穿全季甚至整部剧集的主题元素,例如几位主要角色之间的三角恋情。电视剧在加拿大溫哥華取景,旨在选用其“美国中产阶级”景观,主创人花了5个月的时间专门用于为主角物色合适的演员。试播集在所有演员选好4天后正式开拍。由于时间上的限制,剧组无法搭建好实体外景,因此只能使用计算机绘图技术将数字化的外景插入到镜头中。节目一经上映就打破了电视网的多项收视纪录,并且获得了评论员的普遍好评和多个奖项提名,并在其中两项上胜出"
5696            }
5697            Lang::Kor => {
5698                "플레이스테이션 은 소니 컴퓨터 엔터테인먼트가 개발한 세 번째 가정용 게임기이다. 마이크로소프트의 엑스박스 360, 닌텐도의 Wii와 경쟁하고 있다. 이전 제품에서 온라인 플레이 기능을 비디오 게임 개발사에 전적으로 의존하던 것과 달리 통합 온라인 게임 서비스인 플레이스테이션 네트워크 서비스를 발매와 함께 시작해 제공하고 있으며, 탄탄한 멀티미디어 재생 기능, 플레이스테이션 포터블과의 연결, 고화질 광학 디스크 포맷인 블루레이 디스크 재생 기능 등의 기능을 갖추고 있다. 2006년 11월 11일에 일본에서 처음으로 출시했으며, 11월 17일에는 북미 지역, 2007년 3월 23일에는 유럽과 오세아니아 지역에서, 대한민국의 경우 6월 5일부터 일주일간 예약판매를 실시해, 매일 준비한 수량이 동이 나는 등 많은 관심을 받았으며 6월 16일에 정식 출시 행사를 열었다"
5699            }
5700            Lang::Jpn => {
5701                "熊野三山本願所は、15世紀末以降における熊野三山(熊野本宮、熊野新宮、熊野那智)の造営・修造のための勧進を担った組織の総称。 熊野三山を含めて、日本における古代から中世前半にかけての寺社の造営は、寺社領経営のような恒常的財源、幕府や朝廷などからの一時的な造営料所の寄進、あるいは公権力からの臨時の保護によって行われていた。しかしながら、熊野三山では、これらの財源はすべて15世紀半ばまでに実効性を失った"
5702            }
5703            Lang::Ara => {
5704                "لشکرکشیهای روسهای وارنگی به دریای خزر مجموعهای از حملات نظامی در بین سالهای ۸۶۴ تا ۱۰۴۱ میلادی به سواحل دریای خزر بودهاست. روسهای وارنگی ابتدا در قرن نهم میلادی به عنوان بازرگانان پوست، عسل و برده در سرزمینهای اسلامی(سرکلند) ظاهر شدند. این بازرگانان در مسیر تجاری ولگا به خرید و فروش میپرداختند. نخستین حملهٔ آنان در فاصله سالهای ۸۶۴ تا ۸۸۴ میلادی در مقیاسی کوچک علیه علویان طبرستان رخ داد. نخستین یورش بزرگ روسها در سال ۹۱۳ رخ داد و آنان با ۵۰۰ فروند درازکشتی شهر گرگان و اطراف آن را غارت کردند. آنها در این حمله مقداری کالا و برده را به تاراج بردند و در راه بازگشتن به سمت شمال، در دلتای ولگا، مورد حملهٔ خزرهای مسلمان قرار گرفتند و بعضی از آنان موفق به فرار شدند، ولی در میانهٔ ولگا به قتل رسیدند. دومین هجوم بزرگ روسها به دریای خزر در سال ۹۴۳ به وقوع پیوست. در این دوره ایگور یکم، حاکم روس کیف، رهبری روسها را در دست داشت. روسها پس از توافق با دولت خزرها برای عبور امن از منطقه، تا رود کورا و اعماق قفقاز پیش رفتند و در سال ۹۴۳ موفق شدند بندر بردعه، پایتخت اران (جمهوری آذربایجان کنونی)، را تصرف کنند. روسها در آنجا به مدت چند ماه ماندند و بسیاری از ساکنان شهر را کشتند و از راه غارتگری اموالی را به تاراج بردند. تنها دلیل بازگشت آنان "
5705            }
5706            Lang::Ell => {
5707                "Το Πρόγραμμα υλοποιείται εξ ολοκλήρου από απόσταση και μπορεί να συμμετέχει κάθε εμπλεκόμενος στη ή/και ενδιαφερόμενος για τη διδασκαλία της Ελληνικής ως δεύτερης/ξένης γλώσσας στην Ελλάδα και στο εξωτερικό, αρκεί να είναι απόφοιτος ελληνικής φιλολογίας, ξένων φιλολογιών, παιδαγωγικών τμημάτων, θεολογικών σχολών ή άλλων πανεπιστημιακών τμημάτων ελληνικών ή ισότιμων ξένων πανεπιστημίων. Υπό όρους γίνονται δεκτοί υποψήφιοι που δεν έχουν ολοκληρώσει σπουδές τριτοβάθμιας εκπαίδευσης."
5708            }
5709        };
5710        let tokens = match lng {
5711            Lang::Zho => vec![
5712                PositionalToken {
5713                    source: uws,
5714                    offset: 0,
5715                    length: 3,
5716                    token: Token::Word(Word::Word("美".to_string())),
5717                },
5718                PositionalToken {
5719                    source: uws,
5720                    offset: 3,
5721                    length: 3,
5722                    token: Token::Word(Word::Word("国".to_string())),
5723                },
5724                PositionalToken {
5725                    source: uws,
5726                    offset: 6,
5727                    length: 3,
5728                    token: Token::Word(Word::Word("电".to_string())),
5729                },
5730                PositionalToken {
5731                    source: uws,
5732                    offset: 9,
5733                    length: 3,
5734                    token: Token::Word(Word::Word("视".to_string())),
5735                },
5736                PositionalToken {
5737                    source: uws,
5738                    offset: 12,
5739                    length: 3,
5740                    token: Token::Word(Word::Word("连".to_string())),
5741                },
5742                PositionalToken {
5743                    source: uws,
5744                    offset: 15,
5745                    length: 3,
5746                    token: Token::Word(Word::Word("续".to_string())),
5747                },
5748                PositionalToken {
5749                    source: uws,
5750                    offset: 18,
5751                    length: 3,
5752                    token: Token::Word(Word::Word("剧".to_string())),
5753                },
5754                PositionalToken {
5755                    source: uws,
5756                    offset: 21,
5757                    length: 3,
5758                    token: Token::Special(Special::Punctuation('《')),
5759                },
5760                PositionalToken {
5761                    source: uws,
5762                    offset: 24,
5763                    length: 3,
5764                    token: Token::Word(Word::Word("超".to_string())),
5765                },
5766                PositionalToken {
5767                    source: uws,
5768                    offset: 27,
5769                    length: 3,
5770                    token: Token::Word(Word::Word("人".to_string())),
5771                },
5772                PositionalToken {
5773                    source: uws,
5774                    offset: 30,
5775                    length: 3,
5776                    token: Token::Word(Word::Word("前".to_string())),
5777                },
5778                PositionalToken {
5779                    source: uws,
5780                    offset: 33,
5781                    length: 3,
5782                    token: Token::Word(Word::Word("传".to_string())),
5783                },
5784                PositionalToken {
5785                    source: uws,
5786                    offset: 36,
5787                    length: 3,
5788                    token: Token::Special(Special::Punctuation('》')),
5789                },
5790                PositionalToken {
5791                    source: uws,
5792                    offset: 39,
5793                    length: 3,
5794                    token: Token::Word(Word::Word("的".to_string())),
5795                },
5796                PositionalToken {
5797                    source: uws,
5798                    offset: 42,
5799                    length: 3,
5800                    token: Token::Word(Word::Word("第".to_string())),
5801                },
5802                PositionalToken {
5803                    source: uws,
5804                    offset: 45,
5805                    length: 3,
5806                    token: Token::Word(Word::Word("一".to_string())),
5807                },
5808                PositionalToken {
5809                    source: uws,
5810                    offset: 48,
5811                    length: 3,
5812                    token: Token::Word(Word::Word("集".to_string())),
5813                },
5814                PositionalToken {
5815                    source: uws,
5816                    offset: 51,
5817                    length: 3,
5818                    token: Token::Special(Special::Punctuation('《')),
5819                },
5820                PositionalToken {
5821                    source: uws,
5822                    offset: 54,
5823                    length: 3,
5824                    token: Token::Word(Word::Word("试".to_string())),
5825                },
5826                PositionalToken {
5827                    source: uws,
5828                    offset: 57,
5829                    length: 3,
5830                    token: Token::Word(Word::Word("播".to_string())),
5831                },
5832                PositionalToken {
5833                    source: uws,
5834                    offset: 60,
5835                    length: 3,
5836                    token: Token::Word(Word::Word("集".to_string())),
5837                },
5838                PositionalToken {
5839                    source: uws,
5840                    offset: 63,
5841                    length: 3,
5842                    token: Token::Special(Special::Punctuation('》')),
5843                },
5844                PositionalToken {
5845                    source: uws,
5846                    offset: 66,
5847                    length: 3,
5848                    token: Token::Word(Word::Word("于".to_string())),
5849                },
5850                PositionalToken {
5851                    source: uws,
5852                    offset: 69,
5853                    length: 4,
5854                    token: Token::Word(Word::Number(Number::Integer(2001))),
5855                },
5856                PositionalToken {
5857                    source: uws,
5858                    offset: 73,
5859                    length: 3,
5860                    token: Token::Word(Word::Word("年".to_string())),
5861                },
5862                PositionalToken {
5863                    source: uws,
5864                    offset: 76,
5865                    length: 2,
5866                    token: Token::Word(Word::Number(Number::Integer(10))),
5867                },
5868                PositionalToken {
5869                    source: uws,
5870                    offset: 78,
5871                    length: 3,
5872                    token: Token::Word(Word::Word("月".to_string())),
5873                },
5874                PositionalToken {
5875                    source: uws,
5876                    offset: 81,
5877                    length: 2,
5878                    token: Token::Word(Word::Number(Number::Integer(16))),
5879                },
5880                PositionalToken {
5881                    source: uws,
5882                    offset: 83,
5883                    length: 3,
5884                    token: Token::Word(Word::Word("日".to_string())),
5885                },
5886                PositionalToken {
5887                    source: uws,
5888                    offset: 86,
5889                    length: 3,
5890                    token: Token::Word(Word::Word("在".to_string())),
5891                },
5892                PositionalToken {
5893                    source: uws,
5894                    offset: 89,
5895                    length: 3,
5896                    token: Token::Word(Word::Word("電".to_string())),
5897                },
5898                PositionalToken {
5899                    source: uws,
5900                    offset: 92,
5901                    length: 3,
5902                    token: Token::Word(Word::Word("視".to_string())),
5903                },
5904                PositionalToken {
5905                    source: uws,
5906                    offset: 95,
5907                    length: 3,
5908                    token: Token::Word(Word::Word("網".to_string())),
5909                },
5910                PositionalToken {
5911                    source: uws,
5912                    offset: 98,
5913                    length: 3,
5914                    token: Token::Word(Word::Word("首".to_string())),
5915                },
5916                PositionalToken {
5917                    source: uws,
5918                    offset: 101,
5919                    length: 3,
5920                    token: Token::Word(Word::Word("播".to_string())),
5921                },
5922                PositionalToken {
5923                    source: uws,
5924                    offset: 104,
5925                    length: 3,
5926                    token: Token::Special(Special::Punctuation(',')),
5927                },
5928                PositionalToken {
5929                    source: uws,
5930                    offset: 107,
5931                    length: 3,
5932                    token: Token::Word(Word::Word("剧".to_string())),
5933                },
5934                PositionalToken {
5935                    source: uws,
5936                    offset: 110,
5937                    length: 3,
5938                    token: Token::Word(Word::Word("集".to_string())),
5939                },
5940                PositionalToken {
5941                    source: uws,
5942                    offset: 113,
5943                    length: 3,
5944                    token: Token::Word(Word::Word("主".to_string())),
5945                },
5946                PositionalToken {
5947                    source: uws,
5948                    offset: 116,
5949                    length: 3,
5950                    token: Token::Word(Word::Word("创".to_string())),
5951                },
5952                PositionalToken {
5953                    source: uws,
5954                    offset: 119,
5955                    length: 3,
5956                    token: Token::Word(Word::Word("人".to_string())),
5957                },
5958                PositionalToken {
5959                    source: uws,
5960                    offset: 122,
5961                    length: 3,
5962                    token: Token::Word(Word::Word("阿".to_string())),
5963                },
5964                PositionalToken {
5965                    source: uws,
5966                    offset: 125,
5967                    length: 3,
5968                    token: Token::Word(Word::Word("尔".to_string())),
5969                },
5970                PositionalToken {
5971                    source: uws,
5972                    offset: 128,
5973                    length: 3,
5974                    token: Token::Word(Word::Word("弗".to_string())),
5975                },
5976                PositionalToken {
5977                    source: uws,
5978                    offset: 131,
5979                    length: 3,
5980                    token: Token::Word(Word::Word("雷".to_string())),
5981                },
5982                PositionalToken {
5983                    source: uws,
5984                    offset: 134,
5985                    length: 3,
5986                    token: Token::Word(Word::Word("德".to_string())),
5987                },
5988                PositionalToken {
5989                    source: uws,
5990                    offset: 137,
5991                    length: 2,
5992                    token: Token::Special(Special::Punctuation('·')),
5993                },
5994                PositionalToken {
5995                    source: uws,
5996                    offset: 139,
5997                    length: 3,
5998                    token: Token::Word(Word::Word("高".to_string())),
5999                },
6000                PositionalToken {
6001                    source: uws,
6002                    offset: 142,
6003                    length: 3,
6004                    token: Token::Word(Word::Word("夫".to_string())),
6005                },
6006                PositionalToken {
6007                    source: uws,
6008                    offset: 145,
6009                    length: 3,
6010                    token: Token::Word(Word::Word("和".to_string())),
6011                },
6012                PositionalToken {
6013                    source: uws,
6014                    offset: 148,
6015                    length: 3,
6016                    token: Token::Word(Word::Word("迈".to_string())),
6017                },
6018                PositionalToken {
6019                    source: uws,
6020                    offset: 151,
6021                    length: 3,
6022                    token: Token::Word(Word::Word("尔".to_string())),
6023                },
6024                PositionalToken {
6025                    source: uws,
6026                    offset: 154,
6027                    length: 3,
6028                    token: Token::Word(Word::Word("斯".to_string())),
6029                },
6030                PositionalToken {
6031                    source: uws,
6032                    offset: 157,
6033                    length: 2,
6034                    token: Token::Special(Special::Punctuation('·')),
6035                },
6036                PositionalToken {
6037                    source: uws,
6038                    offset: 159,
6039                    length: 3,
6040                    token: Token::Word(Word::Word("米".to_string())),
6041                },
6042                PositionalToken {
6043                    source: uws,
6044                    offset: 162,
6045                    length: 3,
6046                    token: Token::Word(Word::Word("勒".to_string())),
6047                },
6048                PositionalToken {
6049                    source: uws,
6050                    offset: 165,
6051                    length: 3,
6052                    token: Token::Word(Word::Word("編".to_string())),
6053                },
6054                PositionalToken {
6055                    source: uws,
6056                    offset: 168,
6057                    length: 3,
6058                    token: Token::Word(Word::Word("劇".to_string())),
6059                },
6060                PositionalToken {
6061                    source: uws,
6062                    offset: 171,
6063                    length: 3,
6064                    token: Token::Special(Special::Punctuation(',')),
6065                },
6066                PositionalToken {
6067                    source: uws,
6068                    offset: 174,
6069                    length: 3,
6070                    token: Token::Word(Word::Word("大".to_string())),
6071                },
6072                PositionalToken {
6073                    source: uws,
6074                    offset: 177,
6075                    length: 3,
6076                    token: Token::Word(Word::Word("卫".to_string())),
6077                },
6078                PositionalToken {
6079                    source: uws,
6080                    offset: 180,
6081                    length: 2,
6082                    token: Token::Special(Special::Punctuation('·')),
6083                },
6084                PositionalToken {
6085                    source: uws,
6086                    offset: 182,
6087                    length: 3,
6088                    token: Token::Word(Word::Word("努".to_string())),
6089                },
6090                PositionalToken {
6091                    source: uws,
6092                    offset: 185,
6093                    length: 3,
6094                    token: Token::Word(Word::Word("特".to_string())),
6095                },
6096                PositionalToken {
6097                    source: uws,
6098                    offset: 188,
6099                    length: 3,
6100                    token: Token::Word(Word::Word("尔".to_string())),
6101                },
6102                PositionalToken {
6103                    source: uws,
6104                    offset: 191,
6105                    length: 3,
6106                    token: Token::Word(Word::Word("执".to_string())),
6107                },
6108                PositionalToken {
6109                    source: uws,
6110                    offset: 194,
6111                    length: 3,
6112                    token: Token::Word(Word::Word("导".to_string())),
6113                },
6114                PositionalToken {
6115                    source: uws,
6116                    offset: 197,
6117                    length: 3,
6118                    token: Token::Special(Special::Punctuation('。')),
6119                },
6120                PositionalToken {
6121                    source: uws,
6122                    offset: 200,
6123                    length: 3,
6124                    token: Token::Word(Word::Word("这".to_string())),
6125                },
6126                PositionalToken {
6127                    source: uws,
6128                    offset: 203,
6129                    length: 3,
6130                    token: Token::Word(Word::Word("一".to_string())),
6131                },
6132                PositionalToken {
6133                    source: uws,
6134                    offset: 206,
6135                    length: 3,
6136                    token: Token::Word(Word::Word("试".to_string())),
6137                },
6138                PositionalToken {
6139                    source: uws,
6140                    offset: 209,
6141                    length: 3,
6142                    token: Token::Word(Word::Word("播".to_string())),
6143                },
6144                PositionalToken {
6145                    source: uws,
6146                    offset: 212,
6147                    length: 3,
6148                    token: Token::Word(Word::Word("首".to_string())),
6149                },
6150                PositionalToken {
6151                    source: uws,
6152                    offset: 215,
6153                    length: 3,
6154                    token: Token::Word(Word::Word("次".to_string())),
6155                },
6156                PositionalToken {
6157                    source: uws,
6158                    offset: 218,
6159                    length: 3,
6160                    token: Token::Word(Word::Word("向".to_string())),
6161                },
6162                PositionalToken {
6163                    source: uws,
6164                    offset: 221,
6165                    length: 3,
6166                    token: Token::Word(Word::Word("观".to_string())),
6167                },
6168                PositionalToken {
6169                    source: uws,
6170                    offset: 224,
6171                    length: 3,
6172                    token: Token::Word(Word::Word("众".to_string())),
6173                },
6174                PositionalToken {
6175                    source: uws,
6176                    offset: 227,
6177                    length: 3,
6178                    token: Token::Word(Word::Word("引".to_string())),
6179                },
6180                PositionalToken {
6181                    source: uws,
6182                    offset: 230,
6183                    length: 3,
6184                    token: Token::Word(Word::Word("荐".to_string())),
6185                },
6186                PositionalToken {
6187                    source: uws,
6188                    offset: 233,
6189                    length: 3,
6190                    token: Token::Word(Word::Word("了".to_string())),
6191                },
6192                PositionalToken {
6193                    source: uws,
6194                    offset: 236,
6195                    length: 3,
6196                    token: Token::Word(Word::Word("克".to_string())),
6197                },
6198                PositionalToken {
6199                    source: uws,
6200                    offset: 239,
6201                    length: 3,
6202                    token: Token::Word(Word::Word("拉".to_string())),
6203                },
6204                PositionalToken {
6205                    source: uws,
6206                    offset: 242,
6207                    length: 3,
6208                    token: Token::Word(Word::Word("克".to_string())),
6209                },
6210                PositionalToken {
6211                    source: uws,
6212                    offset: 245,
6213                    length: 2,
6214                    token: Token::Special(Special::Punctuation('·')),
6215                },
6216                PositionalToken {
6217                    source: uws,
6218                    offset: 247,
6219                    length: 3,
6220                    token: Token::Word(Word::Word("肯".to_string())),
6221                },
6222                PositionalToken {
6223                    source: uws,
6224                    offset: 250,
6225                    length: 3,
6226                    token: Token::Word(Word::Word("特".to_string())),
6227                },
6228                PositionalToken {
6229                    source: uws,
6230                    offset: 253,
6231                    length: 3,
6232                    token: Token::Word(Word::Word("一".to_string())),
6233                },
6234                PositionalToken {
6235                    source: uws,
6236                    offset: 256,
6237                    length: 3,
6238                    token: Token::Word(Word::Word("角".to_string())),
6239                },
6240                PositionalToken {
6241                    source: uws,
6242                    offset: 259,
6243                    length: 3,
6244                    token: Token::Special(Special::Punctuation(',')),
6245                },
6246                PositionalToken {
6247                    source: uws,
6248                    offset: 262,
6249                    length: 3,
6250                    token: Token::Word(Word::Word("他".to_string())),
6251                },
6252                PositionalToken {
6253                    source: uws,
6254                    offset: 265,
6255                    length: 3,
6256                    token: Token::Word(Word::Word("是".to_string())),
6257                },
6258                PositionalToken {
6259                    source: uws,
6260                    offset: 268,
6261                    length: 3,
6262                    token: Token::Word(Word::Word("位".to_string())),
6263                },
6264                PositionalToken {
6265                    source: uws,
6266                    offset: 271,
6267                    length: 3,
6268                    token: Token::Word(Word::Word("拥".to_string())),
6269                },
6270                PositionalToken {
6271                    source: uws,
6272                    offset: 274,
6273                    length: 3,
6274                    token: Token::Word(Word::Word("有".to_string())),
6275                },
6276                PositionalToken {
6277                    source: uws,
6278                    offset: 277,
6279                    length: 3,
6280                    token: Token::Word(Word::Word("超".to_string())),
6281                },
6282            ],
6283            Lang::Jpn => vec![
6284                PositionalToken {
6285                    source: uws,
6286                    offset: 0,
6287                    length: 3,
6288                    token: Token::Word(Word::Word("熊".to_string())),
6289                },
6290                PositionalToken {
6291                    source: uws,
6292                    offset: 3,
6293                    length: 3,
6294                    token: Token::Word(Word::Word("野".to_string())),
6295                },
6296                PositionalToken {
6297                    source: uws,
6298                    offset: 6,
6299                    length: 3,
6300                    token: Token::Word(Word::Word("三".to_string())),
6301                },
6302                PositionalToken {
6303                    source: uws,
6304                    offset: 9,
6305                    length: 3,
6306                    token: Token::Word(Word::Word("山".to_string())),
6307                },
6308                PositionalToken {
6309                    source: uws,
6310                    offset: 12,
6311                    length: 3,
6312                    token: Token::Word(Word::Word("本".to_string())),
6313                },
6314                PositionalToken {
6315                    source: uws,
6316                    offset: 15,
6317                    length: 3,
6318                    token: Token::Word(Word::Word("願".to_string())),
6319                },
6320                PositionalToken {
6321                    source: uws,
6322                    offset: 18,
6323                    length: 3,
6324                    token: Token::Word(Word::Word("所".to_string())),
6325                },
6326                PositionalToken {
6327                    source: uws,
6328                    offset: 21,
6329                    length: 3,
6330                    token: Token::Word(Word::Word("は".to_string())),
6331                },
6332                PositionalToken {
6333                    source: uws,
6334                    offset: 24,
6335                    length: 3,
6336                    token: Token::Special(Special::Punctuation('、')),
6337                },
6338                PositionalToken {
6339                    source: uws,
6340                    offset: 27,
6341                    length: 2,
6342                    token: Token::Word(Word::Number(Number::Integer(15))),
6343                },
6344                PositionalToken {
6345                    source: uws,
6346                    offset: 29,
6347                    length: 3,
6348                    token: Token::Word(Word::Word("世".to_string())),
6349                },
6350                PositionalToken {
6351                    source: uws,
6352                    offset: 32,
6353                    length: 3,
6354                    token: Token::Word(Word::Word("紀".to_string())),
6355                },
6356                PositionalToken {
6357                    source: uws,
6358                    offset: 35,
6359                    length: 3,
6360                    token: Token::Word(Word::Word("末".to_string())),
6361                },
6362                PositionalToken {
6363                    source: uws,
6364                    offset: 38,
6365                    length: 3,
6366                    token: Token::Word(Word::Word("以".to_string())),
6367                },
6368                PositionalToken {
6369                    source: uws,
6370                    offset: 41,
6371                    length: 3,
6372                    token: Token::Word(Word::Word("降".to_string())),
6373                },
6374                PositionalToken {
6375                    source: uws,
6376                    offset: 44,
6377                    length: 3,
6378                    token: Token::Word(Word::Word("に".to_string())),
6379                },
6380                PositionalToken {
6381                    source: uws,
6382                    offset: 47,
6383                    length: 3,
6384                    token: Token::Word(Word::Word("お".to_string())),
6385                },
6386                PositionalToken {
6387                    source: uws,
6388                    offset: 50,
6389                    length: 3,
6390                    token: Token::Word(Word::Word("け".to_string())),
6391                },
6392                PositionalToken {
6393                    source: uws,
6394                    offset: 53,
6395                    length: 3,
6396                    token: Token::Word(Word::Word("る".to_string())),
6397                },
6398                PositionalToken {
6399                    source: uws,
6400                    offset: 56,
6401                    length: 3,
6402                    token: Token::Word(Word::Word("熊".to_string())),
6403                },
6404                PositionalToken {
6405                    source: uws,
6406                    offset: 59,
6407                    length: 3,
6408                    token: Token::Word(Word::Word("野".to_string())),
6409                },
6410                PositionalToken {
6411                    source: uws,
6412                    offset: 62,
6413                    length: 3,
6414                    token: Token::Word(Word::Word("三".to_string())),
6415                },
6416                PositionalToken {
6417                    source: uws,
6418                    offset: 65,
6419                    length: 3,
6420                    token: Token::Word(Word::Word("山".to_string())),
6421                },
6422                PositionalToken {
6423                    source: uws,
6424                    offset: 68,
6425                    length: 3,
6426                    token: Token::Special(Special::Punctuation('(')),
6427                },
6428                PositionalToken {
6429                    source: uws,
6430                    offset: 71,
6431                    length: 3,
6432                    token: Token::Word(Word::Word("熊".to_string())),
6433                },
6434                PositionalToken {
6435                    source: uws,
6436                    offset: 74,
6437                    length: 3,
6438                    token: Token::Word(Word::Word("野".to_string())),
6439                },
6440                PositionalToken {
6441                    source: uws,
6442                    offset: 77,
6443                    length: 3,
6444                    token: Token::Word(Word::Word("本".to_string())),
6445                },
6446                PositionalToken {
6447                    source: uws,
6448                    offset: 80,
6449                    length: 3,
6450                    token: Token::Word(Word::Word("宮".to_string())),
6451                },
6452                PositionalToken {
6453                    source: uws,
6454                    offset: 83,
6455                    length: 3,
6456                    token: Token::Special(Special::Punctuation('、')),
6457                },
6458                PositionalToken {
6459                    source: uws,
6460                    offset: 86,
6461                    length: 3,
6462                    token: Token::Word(Word::Word("熊".to_string())),
6463                },
6464                PositionalToken {
6465                    source: uws,
6466                    offset: 89,
6467                    length: 3,
6468                    token: Token::Word(Word::Word("野".to_string())),
6469                },
6470                PositionalToken {
6471                    source: uws,
6472                    offset: 92,
6473                    length: 3,
6474                    token: Token::Word(Word::Word("新".to_string())),
6475                },
6476                PositionalToken {
6477                    source: uws,
6478                    offset: 95,
6479                    length: 3,
6480                    token: Token::Word(Word::Word("宮".to_string())),
6481                },
6482                PositionalToken {
6483                    source: uws,
6484                    offset: 98,
6485                    length: 3,
6486                    token: Token::Special(Special::Punctuation('、')),
6487                },
6488                PositionalToken {
6489                    source: uws,
6490                    offset: 101,
6491                    length: 3,
6492                    token: Token::Word(Word::Word("熊".to_string())),
6493                },
6494                PositionalToken {
6495                    source: uws,
6496                    offset: 104,
6497                    length: 3,
6498                    token: Token::Word(Word::Word("野".to_string())),
6499                },
6500                PositionalToken {
6501                    source: uws,
6502                    offset: 107,
6503                    length: 3,
6504                    token: Token::Word(Word::Word("那".to_string())),
6505                },
6506                PositionalToken {
6507                    source: uws,
6508                    offset: 110,
6509                    length: 3,
6510                    token: Token::Word(Word::Word("智".to_string())),
6511                },
6512                PositionalToken {
6513                    source: uws,
6514                    offset: 113,
6515                    length: 3,
6516                    token: Token::Special(Special::Punctuation(')')),
6517                },
6518                PositionalToken {
6519                    source: uws,
6520                    offset: 116,
6521                    length: 3,
6522                    token: Token::Word(Word::Word("の".to_string())),
6523                },
6524                PositionalToken {
6525                    source: uws,
6526                    offset: 119,
6527                    length: 3,
6528                    token: Token::Word(Word::Word("造".to_string())),
6529                },
6530                PositionalToken {
6531                    source: uws,
6532                    offset: 122,
6533                    length: 3,
6534                    token: Token::Word(Word::Word("営".to_string())),
6535                },
6536                PositionalToken {
6537                    source: uws,
6538                    offset: 125,
6539                    length: 3,
6540                    token: Token::Special(Special::Punctuation('・')),
6541                },
6542                PositionalToken {
6543                    source: uws,
6544                    offset: 128,
6545                    length: 3,
6546                    token: Token::Word(Word::Word("修".to_string())),
6547                },
6548                PositionalToken {
6549                    source: uws,
6550                    offset: 131,
6551                    length: 3,
6552                    token: Token::Word(Word::Word("造".to_string())),
6553                },
6554                PositionalToken {
6555                    source: uws,
6556                    offset: 134,
6557                    length: 3,
6558                    token: Token::Word(Word::Word("の".to_string())),
6559                },
6560                PositionalToken {
6561                    source: uws,
6562                    offset: 137,
6563                    length: 3,
6564                    token: Token::Word(Word::Word("た".to_string())),
6565                },
6566                PositionalToken {
6567                    source: uws,
6568                    offset: 140,
6569                    length: 3,
6570                    token: Token::Word(Word::Word("め".to_string())),
6571                },
6572                PositionalToken {
6573                    source: uws,
6574                    offset: 143,
6575                    length: 3,
6576                    token: Token::Word(Word::Word("の".to_string())),
6577                },
6578                PositionalToken {
6579                    source: uws,
6580                    offset: 146,
6581                    length: 3,
6582                    token: Token::Word(Word::Word("勧".to_string())),
6583                },
6584                PositionalToken {
6585                    source: uws,
6586                    offset: 149,
6587                    length: 3,
6588                    token: Token::Word(Word::Word("進".to_string())),
6589                },
6590                PositionalToken {
6591                    source: uws,
6592                    offset: 152,
6593                    length: 3,
6594                    token: Token::Word(Word::Word("を".to_string())),
6595                },
6596                PositionalToken {
6597                    source: uws,
6598                    offset: 155,
6599                    length: 3,
6600                    token: Token::Word(Word::Word("担".to_string())),
6601                },
6602                PositionalToken {
6603                    source: uws,
6604                    offset: 158,
6605                    length: 3,
6606                    token: Token::Word(Word::Word("っ".to_string())),
6607                },
6608                PositionalToken {
6609                    source: uws,
6610                    offset: 161,
6611                    length: 3,
6612                    token: Token::Word(Word::Word("た".to_string())),
6613                },
6614                PositionalToken {
6615                    source: uws,
6616                    offset: 164,
6617                    length: 3,
6618                    token: Token::Word(Word::Word("組".to_string())),
6619                },
6620                PositionalToken {
6621                    source: uws,
6622                    offset: 167,
6623                    length: 3,
6624                    token: Token::Word(Word::Word("織".to_string())),
6625                },
6626                PositionalToken {
6627                    source: uws,
6628                    offset: 170,
6629                    length: 3,
6630                    token: Token::Word(Word::Word("の".to_string())),
6631                },
6632                PositionalToken {
6633                    source: uws,
6634                    offset: 173,
6635                    length: 3,
6636                    token: Token::Word(Word::Word("総".to_string())),
6637                },
6638                PositionalToken {
6639                    source: uws,
6640                    offset: 176,
6641                    length: 3,
6642                    token: Token::Word(Word::Word("称".to_string())),
6643                },
6644                PositionalToken {
6645                    source: uws,
6646                    offset: 179,
6647                    length: 3,
6648                    token: Token::Special(Special::Punctuation('。')),
6649                },
6650                PositionalToken {
6651                    source: uws,
6652                    offset: 182,
6653                    length: 1,
6654                    token: Token::Special(Special::Separator(Separator::Space)),
6655                },
6656                PositionalToken {
6657                    source: uws,
6658                    offset: 183,
6659                    length: 3,
6660                    token: Token::Word(Word::Word("熊".to_string())),
6661                },
6662                PositionalToken {
6663                    source: uws,
6664                    offset: 186,
6665                    length: 3,
6666                    token: Token::Word(Word::Word("野".to_string())),
6667                },
6668                PositionalToken {
6669                    source: uws,
6670                    offset: 189,
6671                    length: 3,
6672                    token: Token::Word(Word::Word("三".to_string())),
6673                },
6674                PositionalToken {
6675                    source: uws,
6676                    offset: 192,
6677                    length: 3,
6678                    token: Token::Word(Word::Word("山".to_string())),
6679                },
6680                PositionalToken {
6681                    source: uws,
6682                    offset: 195,
6683                    length: 3,
6684                    token: Token::Word(Word::Word("を".to_string())),
6685                },
6686                PositionalToken {
6687                    source: uws,
6688                    offset: 198,
6689                    length: 3,
6690                    token: Token::Word(Word::Word("含".to_string())),
6691                },
6692                PositionalToken {
6693                    source: uws,
6694                    offset: 201,
6695                    length: 3,
6696                    token: Token::Word(Word::Word("め".to_string())),
6697                },
6698                PositionalToken {
6699                    source: uws,
6700                    offset: 204,
6701                    length: 3,
6702                    token: Token::Word(Word::Word("て".to_string())),
6703                },
6704                PositionalToken {
6705                    source: uws,
6706                    offset: 207,
6707                    length: 3,
6708                    token: Token::Special(Special::Punctuation('、')),
6709                },
6710                PositionalToken {
6711                    source: uws,
6712                    offset: 210,
6713                    length: 3,
6714                    token: Token::Word(Word::Word("日".to_string())),
6715                },
6716                PositionalToken {
6717                    source: uws,
6718                    offset: 213,
6719                    length: 3,
6720                    token: Token::Word(Word::Word("本".to_string())),
6721                },
6722                PositionalToken {
6723                    source: uws,
6724                    offset: 216,
6725                    length: 3,
6726                    token: Token::Word(Word::Word("に".to_string())),
6727                },
6728                PositionalToken {
6729                    source: uws,
6730                    offset: 219,
6731                    length: 3,
6732                    token: Token::Word(Word::Word("お".to_string())),
6733                },
6734                PositionalToken {
6735                    source: uws,
6736                    offset: 222,
6737                    length: 3,
6738                    token: Token::Word(Word::Word("け".to_string())),
6739                },
6740                PositionalToken {
6741                    source: uws,
6742                    offset: 225,
6743                    length: 3,
6744                    token: Token::Word(Word::Word("る".to_string())),
6745                },
6746                PositionalToken {
6747                    source: uws,
6748                    offset: 228,
6749                    length: 3,
6750                    token: Token::Word(Word::Word("古".to_string())),
6751                },
6752                PositionalToken {
6753                    source: uws,
6754                    offset: 231,
6755                    length: 3,
6756                    token: Token::Word(Word::Word("代".to_string())),
6757                },
6758                PositionalToken {
6759                    source: uws,
6760                    offset: 234,
6761                    length: 3,
6762                    token: Token::Word(Word::Word("か".to_string())),
6763                },
6764                PositionalToken {
6765                    source: uws,
6766                    offset: 237,
6767                    length: 3,
6768                    token: Token::Word(Word::Word("ら".to_string())),
6769                },
6770                PositionalToken {
6771                    source: uws,
6772                    offset: 240,
6773                    length: 3,
6774                    token: Token::Word(Word::Word("中".to_string())),
6775                },
6776                PositionalToken {
6777                    source: uws,
6778                    offset: 243,
6779                    length: 3,
6780                    token: Token::Word(Word::Word("世".to_string())),
6781                },
6782                PositionalToken {
6783                    source: uws,
6784                    offset: 246,
6785                    length: 3,
6786                    token: Token::Word(Word::Word("前".to_string())),
6787                },
6788                PositionalToken {
6789                    source: uws,
6790                    offset: 249,
6791                    length: 3,
6792                    token: Token::Word(Word::Word("半".to_string())),
6793                },
6794                PositionalToken {
6795                    source: uws,
6796                    offset: 252,
6797                    length: 3,
6798                    token: Token::Word(Word::Word("に".to_string())),
6799                },
6800                PositionalToken {
6801                    source: uws,
6802                    offset: 255,
6803                    length: 3,
6804                    token: Token::Word(Word::Word("か".to_string())),
6805                },
6806                PositionalToken {
6807                    source: uws,
6808                    offset: 258,
6809                    length: 3,
6810                    token: Token::Word(Word::Word("け".to_string())),
6811                },
6812                PositionalToken {
6813                    source: uws,
6814                    offset: 261,
6815                    length: 3,
6816                    token: Token::Word(Word::Word("て".to_string())),
6817                },
6818                PositionalToken {
6819                    source: uws,
6820                    offset: 264,
6821                    length: 3,
6822                    token: Token::Word(Word::Word("の".to_string())),
6823                },
6824                PositionalToken {
6825                    source: uws,
6826                    offset: 267,
6827                    length: 3,
6828                    token: Token::Word(Word::Word("寺".to_string())),
6829                },
6830                PositionalToken {
6831                    source: uws,
6832                    offset: 270,
6833                    length: 3,
6834                    token: Token::Word(Word::Word("社".to_string())),
6835                },
6836                PositionalToken {
6837                    source: uws,
6838                    offset: 273,
6839                    length: 3,
6840                    token: Token::Word(Word::Word("の".to_string())),
6841                },
6842                PositionalToken {
6843                    source: uws,
6844                    offset: 276,
6845                    length: 3,
6846                    token: Token::Word(Word::Word("造".to_string())),
6847                },
6848                PositionalToken {
6849                    source: uws,
6850                    offset: 279,
6851                    length: 3,
6852                    token: Token::Word(Word::Word("営".to_string())),
6853                },
6854                PositionalToken {
6855                    source: uws,
6856                    offset: 282,
6857                    length: 3,
6858                    token: Token::Word(Word::Word("は".to_string())),
6859                },
6860                PositionalToken {
6861                    source: uws,
6862                    offset: 285,
6863                    length: 3,
6864                    token: Token::Special(Special::Punctuation('、')),
6865                },
6866                PositionalToken {
6867                    source: uws,
6868                    offset: 288,
6869                    length: 3,
6870                    token: Token::Word(Word::Word("寺".to_string())),
6871                },
6872                PositionalToken {
6873                    source: uws,
6874                    offset: 291,
6875                    length: 3,
6876                    token: Token::Word(Word::Word("社".to_string())),
6877                },
6878            ],
6879            Lang::Kor => vec![
6880                PositionalToken {
6881                    source: uws,
6882                    offset: 0,
6883                    length: 21,
6884                    token: Token::Word(Word::Word("플레이스테이션".to_string())),
6885                },
6886                PositionalToken {
6887                    source: uws,
6888                    offset: 21,
6889                    length: 1,
6890                    token: Token::Special(Special::Separator(Separator::Space)),
6891                },
6892                PositionalToken {
6893                    source: uws,
6894                    offset: 22,
6895                    length: 3,
6896                    token: Token::Word(Word::Word("은".to_string())),
6897                },
6898                PositionalToken {
6899                    source: uws,
6900                    offset: 25,
6901                    length: 1,
6902                    token: Token::Special(Special::Separator(Separator::Space)),
6903                },
6904                PositionalToken {
6905                    source: uws,
6906                    offset: 26,
6907                    length: 6,
6908                    token: Token::Word(Word::Word("소니".to_string())),
6909                },
6910                PositionalToken {
6911                    source: uws,
6912                    offset: 32,
6913                    length: 1,
6914                    token: Token::Special(Special::Separator(Separator::Space)),
6915                },
6916                PositionalToken {
6917                    source: uws,
6918                    offset: 33,
6919                    length: 9,
6920                    token: Token::Word(Word::Word("컴퓨터".to_string())),
6921                },
6922                PositionalToken {
6923                    source: uws,
6924                    offset: 42,
6925                    length: 1,
6926                    token: Token::Special(Special::Separator(Separator::Space)),
6927                },
6928                PositionalToken {
6929                    source: uws,
6930                    offset: 43,
6931                    length: 21,
6932                    token: Token::Word(Word::Word("엔터테인먼트가".to_string())),
6933                },
6934                PositionalToken {
6935                    source: uws,
6936                    offset: 64,
6937                    length: 1,
6938                    token: Token::Special(Special::Separator(Separator::Space)),
6939                },
6940                PositionalToken {
6941                    source: uws,
6942                    offset: 65,
6943                    length: 9,
6944                    token: Token::Word(Word::Word("개발한".to_string())),
6945                },
6946                PositionalToken {
6947                    source: uws,
6948                    offset: 74,
6949                    length: 1,
6950                    token: Token::Special(Special::Separator(Separator::Space)),
6951                },
6952                PositionalToken {
6953                    source: uws,
6954                    offset: 75,
6955                    length: 3,
6956                    token: Token::Word(Word::Word("세".to_string())),
6957                },
6958                PositionalToken {
6959                    source: uws,
6960                    offset: 78,
6961                    length: 1,
6962                    token: Token::Special(Special::Separator(Separator::Space)),
6963                },
6964                PositionalToken {
6965                    source: uws,
6966                    offset: 79,
6967                    length: 6,
6968                    token: Token::Word(Word::Word("번째".to_string())),
6969                },
6970                PositionalToken {
6971                    source: uws,
6972                    offset: 85,
6973                    length: 1,
6974                    token: Token::Special(Special::Separator(Separator::Space)),
6975                },
6976                PositionalToken {
6977                    source: uws,
6978                    offset: 86,
6979                    length: 9,
6980                    token: Token::Word(Word::Word("가정용".to_string())),
6981                },
6982                PositionalToken {
6983                    source: uws,
6984                    offset: 95,
6985                    length: 1,
6986                    token: Token::Special(Special::Separator(Separator::Space)),
6987                },
6988                PositionalToken {
6989                    source: uws,
6990                    offset: 96,
6991                    length: 15,
6992                    token: Token::Word(Word::Word("게임기이다".to_string())),
6993                },
6994                PositionalToken {
6995                    source: uws,
6996                    offset: 111,
6997                    length: 1,
6998                    token: Token::Special(Special::Punctuation('.')),
6999                },
7000                PositionalToken {
7001                    source: uws,
7002                    offset: 112,
7003                    length: 1,
7004                    token: Token::Special(Special::Separator(Separator::Space)),
7005                },
7006                PositionalToken {
7007                    source: uws,
7008                    offset: 113,
7009                    length: 24,
7010                    token: Token::Word(Word::Word("마이크로소프트의".to_string())),
7011                },
7012                PositionalToken {
7013                    source: uws,
7014                    offset: 137,
7015                    length: 1,
7016                    token: Token::Special(Special::Separator(Separator::Space)),
7017                },
7018                PositionalToken {
7019                    source: uws,
7020                    offset: 138,
7021                    length: 12,
7022                    token: Token::Word(Word::Word("엑스박스".to_string())),
7023                },
7024                PositionalToken {
7025                    source: uws,
7026                    offset: 150,
7027                    length: 1,
7028                    token: Token::Special(Special::Separator(Separator::Space)),
7029                },
7030                PositionalToken {
7031                    source: uws,
7032                    offset: 151,
7033                    length: 3,
7034                    token: Token::Word(Word::Number(Number::Integer(360))),
7035                },
7036                PositionalToken {
7037                    source: uws,
7038                    offset: 154,
7039                    length: 1,
7040                    token: Token::Special(Special::Punctuation(',')),
7041                },
7042                PositionalToken {
7043                    source: uws,
7044                    offset: 155,
7045                    length: 1,
7046                    token: Token::Special(Special::Separator(Separator::Space)),
7047                },
7048                PositionalToken {
7049                    source: uws,
7050                    offset: 156,
7051                    length: 12,
7052                    token: Token::Word(Word::Word("닌텐도의".to_string())),
7053                },
7054                PositionalToken {
7055                    source: uws,
7056                    offset: 168,
7057                    length: 1,
7058                    token: Token::Special(Special::Separator(Separator::Space)),
7059                },
7060                PositionalToken {
7061                    source: uws,
7062                    offset: 169,
7063                    length: 6,
7064                    token: Token::Word(Word::Word("Wii와".to_string())),
7065                },
7066                PositionalToken {
7067                    source: uws,
7068                    offset: 175,
7069                    length: 1,
7070                    token: Token::Special(Special::Separator(Separator::Space)),
7071                },
7072                PositionalToken {
7073                    source: uws,
7074                    offset: 176,
7075                    length: 12,
7076                    token: Token::Word(Word::Word("경쟁하고".to_string())),
7077                },
7078                PositionalToken {
7079                    source: uws,
7080                    offset: 188,
7081                    length: 1,
7082                    token: Token::Special(Special::Separator(Separator::Space)),
7083                },
7084                PositionalToken {
7085                    source: uws,
7086                    offset: 189,
7087                    length: 6,
7088                    token: Token::Word(Word::Word("있다".to_string())),
7089                },
7090                PositionalToken {
7091                    source: uws,
7092                    offset: 195,
7093                    length: 1,
7094                    token: Token::Special(Special::Punctuation('.')),
7095                },
7096                PositionalToken {
7097                    source: uws,
7098                    offset: 196,
7099                    length: 1,
7100                    token: Token::Special(Special::Separator(Separator::Space)),
7101                },
7102                PositionalToken {
7103                    source: uws,
7104                    offset: 197,
7105                    length: 6,
7106                    token: Token::Word(Word::Word("이전".to_string())),
7107                },
7108                PositionalToken {
7109                    source: uws,
7110                    offset: 203,
7111                    length: 1,
7112                    token: Token::Special(Special::Separator(Separator::Space)),
7113                },
7114                PositionalToken {
7115                    source: uws,
7116                    offset: 204,
7117                    length: 12,
7118                    token: Token::Word(Word::Word("제품에서".to_string())),
7119                },
7120                PositionalToken {
7121                    source: uws,
7122                    offset: 216,
7123                    length: 1,
7124                    token: Token::Special(Special::Separator(Separator::Space)),
7125                },
7126                PositionalToken {
7127                    source: uws,
7128                    offset: 217,
7129                    length: 9,
7130                    token: Token::Word(Word::Word("온라인".to_string())),
7131                },
7132                PositionalToken {
7133                    source: uws,
7134                    offset: 226,
7135                    length: 1,
7136                    token: Token::Special(Special::Separator(Separator::Space)),
7137                },
7138                PositionalToken {
7139                    source: uws,
7140                    offset: 227,
7141                    length: 9,
7142                    token: Token::Word(Word::Word("플레이".to_string())),
7143                },
7144                PositionalToken {
7145                    source: uws,
7146                    offset: 236,
7147                    length: 1,
7148                    token: Token::Special(Special::Separator(Separator::Space)),
7149                },
7150                PositionalToken {
7151                    source: uws,
7152                    offset: 237,
7153                    length: 3,
7154                    token: Token::Word(Word::Word("기".to_string())),
7155                },
7156            ],
7157            Lang::Ara => vec![
7158                PositionalToken {
7159                    source: uws,
7160                    offset: 0,
7161                    length: 14,
7162                    token: Token::Word(Word::Word("لشکرکشی".to_string())),
7163                },
7164                PositionalToken {
7165                    source: uws,
7166                    offset: 14,
7167                    length: 3,
7168                    token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
7169                },
7170                PositionalToken {
7171                    source: uws,
7172                    offset: 17,
7173                    length: 6,
7174                    token: Token::Word(Word::Word("های".to_string())),
7175                },
7176                PositionalToken {
7177                    source: uws,
7178                    offset: 23,
7179                    length: 1,
7180                    token: Token::Special(Special::Separator(Separator::Space)),
7181                },
7182                PositionalToken {
7183                    source: uws,
7184                    offset: 24,
7185                    length: 6,
7186                    token: Token::Word(Word::Word("روس".to_string())),
7187                },
7188                PositionalToken {
7189                    source: uws,
7190                    offset: 30,
7191                    length: 3,
7192                    token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
7193                },
7194                PositionalToken {
7195                    source: uws,
7196                    offset: 33,
7197                    length: 6,
7198                    token: Token::Word(Word::Word("های".to_string())),
7199                },
7200                PositionalToken {
7201                    source: uws,
7202                    offset: 39,
7203                    length: 1,
7204                    token: Token::Special(Special::Separator(Separator::Space)),
7205                },
7206                PositionalToken {
7207                    source: uws,
7208                    offset: 40,
7209                    length: 12,
7210                    token: Token::Word(Word::Word("وارنگی".to_string())),
7211                },
7212                PositionalToken {
7213                    source: uws,
7214                    offset: 52,
7215                    length: 1,
7216                    token: Token::Special(Special::Separator(Separator::Space)),
7217                },
7218                PositionalToken {
7219                    source: uws,
7220                    offset: 53,
7221                    length: 4,
7222                    token: Token::Word(Word::Word("به".to_string())),
7223                },
7224                PositionalToken {
7225                    source: uws,
7226                    offset: 57,
7227                    length: 1,
7228                    token: Token::Special(Special::Separator(Separator::Space)),
7229                },
7230                PositionalToken {
7231                    source: uws,
7232                    offset: 58,
7233                    length: 10,
7234                    token: Token::Word(Word::Word("دریای".to_string())),
7235                },
7236                PositionalToken {
7237                    source: uws,
7238                    offset: 68,
7239                    length: 1,
7240                    token: Token::Special(Special::Separator(Separator::Space)),
7241                },
7242                PositionalToken {
7243                    source: uws,
7244                    offset: 69,
7245                    length: 6,
7246                    token: Token::Word(Word::Word("خزر".to_string())),
7247                },
7248                PositionalToken {
7249                    source: uws,
7250                    offset: 75,
7251                    length: 1,
7252                    token: Token::Special(Special::Separator(Separator::Space)),
7253                },
7254                PositionalToken {
7255                    source: uws,
7256                    offset: 76,
7257                    length: 12,
7258                    token: Token::Word(Word::Word("مجموعه".to_string())),
7259                },
7260                PositionalToken {
7261                    source: uws,
7262                    offset: 88,
7263                    length: 3,
7264                    token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
7265                },
7266                PositionalToken {
7267                    source: uws,
7268                    offset: 91,
7269                    length: 4,
7270                    token: Token::Word(Word::Word("ای".to_string())),
7271                },
7272                PositionalToken {
7273                    source: uws,
7274                    offset: 95,
7275                    length: 1,
7276                    token: Token::Special(Special::Separator(Separator::Space)),
7277                },
7278                PositionalToken {
7279                    source: uws,
7280                    offset: 96,
7281                    length: 4,
7282                    token: Token::Word(Word::Word("از".to_string())),
7283                },
7284                PositionalToken {
7285                    source: uws,
7286                    offset: 100,
7287                    length: 1,
7288                    token: Token::Special(Special::Separator(Separator::Space)),
7289                },
7290                PositionalToken {
7291                    source: uws,
7292                    offset: 101,
7293                    length: 10,
7294                    token: Token::Word(Word::Word("حملات".to_string())),
7295                },
7296                PositionalToken {
7297                    source: uws,
7298                    offset: 111,
7299                    length: 1,
7300                    token: Token::Special(Special::Separator(Separator::Space)),
7301                },
7302                PositionalToken {
7303                    source: uws,
7304                    offset: 112,
7305                    length: 10,
7306                    token: Token::Word(Word::Word("نظامی".to_string())),
7307                },
7308                PositionalToken {
7309                    source: uws,
7310                    offset: 122,
7311                    length: 1,
7312                    token: Token::Special(Special::Separator(Separator::Space)),
7313                },
7314                PositionalToken {
7315                    source: uws,
7316                    offset: 123,
7317                    length: 4,
7318                    token: Token::Word(Word::Word("در".to_string())),
7319                },
7320                PositionalToken {
7321                    source: uws,
7322                    offset: 127,
7323                    length: 1,
7324                    token: Token::Special(Special::Separator(Separator::Space)),
7325                },
7326                PositionalToken {
7327                    source: uws,
7328                    offset: 128,
7329                    length: 6,
7330                    token: Token::Word(Word::Word("بین".to_string())),
7331                },
7332                PositionalToken {
7333                    source: uws,
7334                    offset: 134,
7335                    length: 1,
7336                    token: Token::Special(Special::Separator(Separator::Space)),
7337                },
7338                PositionalToken {
7339                    source: uws,
7340                    offset: 135,
7341                    length: 6,
7342                    token: Token::Word(Word::Word("سال".to_string())),
7343                },
7344                PositionalToken {
7345                    source: uws,
7346                    offset: 141,
7347                    length: 3,
7348                    token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
7349                },
7350                PositionalToken {
7351                    source: uws,
7352                    offset: 144,
7353                    length: 6,
7354                    token: Token::Word(Word::Word("های".to_string())),
7355                },
7356                PositionalToken {
7357                    source: uws,
7358                    offset: 150,
7359                    length: 1,
7360                    token: Token::Special(Special::Separator(Separator::Space)),
7361                },
7362                PositionalToken {
7363                    source: uws,
7364                    offset: 151,
7365                    length: 6,
7366                    token: Token::Word(Word::StrangeWord("۸۶۴".to_string())),
7367                },
7368                PositionalToken {
7369                    source: uws,
7370                    offset: 157,
7371                    length: 1,
7372                    token: Token::Special(Special::Separator(Separator::Space)),
7373                },
7374                PositionalToken {
7375                    source: uws,
7376                    offset: 158,
7377                    length: 4,
7378                    token: Token::Word(Word::Word("تا".to_string())),
7379                },
7380                PositionalToken {
7381                    source: uws,
7382                    offset: 162,
7383                    length: 1,
7384                    token: Token::Special(Special::Separator(Separator::Space)),
7385                },
7386                PositionalToken {
7387                    source: uws,
7388                    offset: 163,
7389                    length: 8,
7390                    token: Token::Word(Word::StrangeWord("۱۰۴۱".to_string())),
7391                },
7392                PositionalToken {
7393                    source: uws,
7394                    offset: 171,
7395                    length: 1,
7396                    token: Token::Special(Special::Separator(Separator::Space)),
7397                },
7398                PositionalToken {
7399                    source: uws,
7400                    offset: 172,
7401                    length: 12,
7402                    token: Token::Word(Word::Word("میلادی".to_string())),
7403                },
7404                PositionalToken {
7405                    source: uws,
7406                    offset: 184,
7407                    length: 1,
7408                    token: Token::Special(Special::Separator(Separator::Space)),
7409                },
7410                PositionalToken {
7411                    source: uws,
7412                    offset: 185,
7413                    length: 2,
7414                    token: Token::Word(Word::Word("ب".to_string())),
7415                },
7416            ],
7417            Lang::Ell => vec![
7418                PositionalToken {
7419                    source: uws,
7420                    offset: 0,
7421                    length: 4,
7422                    token: Token::Word(Word::Word("Το".to_string())),
7423                },
7424                PositionalToken {
7425                    source: uws,
7426                    offset: 4,
7427                    length: 1,
7428                    token: Token::Special(Special::Separator(Separator::Space)),
7429                },
7430                PositionalToken {
7431                    source: uws,
7432                    offset: 5,
7433                    length: 18,
7434                    token: Token::Word(Word::Word("Πρόγραμμα".to_string())),
7435                },
7436                PositionalToken {
7437                    source: uws,
7438                    offset: 23,
7439                    length: 1,
7440                    token: Token::Special(Special::Separator(Separator::Space)),
7441                },
7442                PositionalToken {
7443                    source: uws,
7444                    offset: 24,
7445                    length: 22,
7446                    token: Token::Word(Word::Word("υλοποιείται".to_string())),
7447                },
7448                PositionalToken {
7449                    source: uws,
7450                    offset: 46,
7451                    length: 1,
7452                    token: Token::Special(Special::Separator(Separator::Space)),
7453                },
7454                PositionalToken {
7455                    source: uws,
7456                    offset: 47,
7457                    length: 4,
7458                    token: Token::Word(Word::Word("εξ".to_string())),
7459                },
7460                PositionalToken {
7461                    source: uws,
7462                    offset: 51,
7463                    length: 1,
7464                    token: Token::Special(Special::Separator(Separator::Space)),
7465                },
7466                PositionalToken {
7467                    source: uws,
7468                    offset: 52,
7469                    length: 18,
7470                    token: Token::Word(Word::Word("ολοκλήρου".to_string())),
7471                },
7472                PositionalToken {
7473                    source: uws,
7474                    offset: 70,
7475                    length: 1,
7476                    token: Token::Special(Special::Separator(Separator::Space)),
7477                },
7478                PositionalToken {
7479                    source: uws,
7480                    offset: 71,
7481                    length: 6,
7482                    token: Token::Word(Word::Word("από".to_string())),
7483                },
7484                PositionalToken {
7485                    source: uws,
7486                    offset: 77,
7487                    length: 1,
7488                    token: Token::Special(Special::Separator(Separator::Space)),
7489                },
7490                PositionalToken {
7491                    source: uws,
7492                    offset: 78,
7493                    length: 16,
7494                    token: Token::Word(Word::Word("απόσταση".to_string())),
7495                },
7496                PositionalToken {
7497                    source: uws,
7498                    offset: 94,
7499                    length: 1,
7500                    token: Token::Special(Special::Separator(Separator::Space)),
7501                },
7502                PositionalToken {
7503                    source: uws,
7504                    offset: 95,
7505                    length: 6,
7506                    token: Token::Word(Word::Word("και".to_string())),
7507                },
7508                PositionalToken {
7509                    source: uws,
7510                    offset: 101,
7511                    length: 1,
7512                    token: Token::Special(Special::Separator(Separator::Space)),
7513                },
7514                PositionalToken {
7515                    source: uws,
7516                    offset: 102,
7517                    length: 12,
7518                    token: Token::Word(Word::Word("μπορεί".to_string())),
7519                },
7520                PositionalToken {
7521                    source: uws,
7522                    offset: 114,
7523                    length: 1,
7524                    token: Token::Special(Special::Separator(Separator::Space)),
7525                },
7526                PositionalToken {
7527                    source: uws,
7528                    offset: 115,
7529                    length: 4,
7530                    token: Token::Word(Word::Word("να".to_string())),
7531                },
7532                PositionalToken {
7533                    source: uws,
7534                    offset: 119,
7535                    length: 1,
7536                    token: Token::Special(Special::Separator(Separator::Space)),
7537                },
7538                PositionalToken {
7539                    source: uws,
7540                    offset: 120,
7541                    length: 20,
7542                    token: Token::Word(Word::Word("συμμετέχει".to_string())),
7543                },
7544                PositionalToken {
7545                    source: uws,
7546                    offset: 140,
7547                    length: 1,
7548                    token: Token::Special(Special::Separator(Separator::Space)),
7549                },
7550                PositionalToken {
7551                    source: uws,
7552                    offset: 141,
7553                    length: 8,
7554                    token: Token::Word(Word::Word("κάθε".to_string())),
7555                },
7556                PositionalToken {
7557                    source: uws,
7558                    offset: 149,
7559                    length: 1,
7560                    token: Token::Special(Special::Separator(Separator::Space)),
7561                },
7562                PositionalToken {
7563                    source: uws,
7564                    offset: 150,
7565                    length: 24,
7566                    token: Token::Word(Word::Word("εμπλεκόμενος".to_string())),
7567                },
7568                PositionalToken {
7569                    source: uws,
7570                    offset: 174,
7571                    length: 1,
7572                    token: Token::Special(Special::Separator(Separator::Space)),
7573                },
7574                PositionalToken {
7575                    source: uws,
7576                    offset: 175,
7577                    length: 6,
7578                    token: Token::Word(Word::Word("στη".to_string())),
7579                },
7580                PositionalToken {
7581                    source: uws,
7582                    offset: 181,
7583                    length: 1,
7584                    token: Token::Special(Special::Separator(Separator::Space)),
7585                },
7586                PositionalToken {
7587                    source: uws,
7588                    offset: 182,
7589                    length: 2,
7590                    token: Token::Word(Word::Word("ή".to_string())),
7591                },
7592                PositionalToken {
7593                    source: uws,
7594                    offset: 184,
7595                    length: 1,
7596                    token: Token::Special(Special::Punctuation('/')),
7597                },
7598            ],
7599        };
7600        (
7601            uws.chars()
7602                .take(100)
7603                .fold(String::new(), |acc, c| acc + &format!("{}", c)),
7604            tokens,
7605        )
7606    }
7607}