1use text_parsing::{Breaker, IntoSource, Local, Snip, Source, SourceEvent};
2
3mod emoji;
4pub use emoji::EMOJIMAP;
5
6mod breakers;
7pub use breakers::{SentenceBreaker, UnicodeSentenceBreaker};
8
9mod wordbreaker;
10
11mod options;
12pub use options::{IntoTokenizer, TokenizerOptions, TokenizerParams};
13
14mod tokens;
15pub use tokens::Tokens;
16
17mod text_tokens;
18use text_tokens::InnerBound;
19pub use text_tokens::TextTokens;
20
21#[derive(Debug)]
22pub enum Error {
23 TextParser(text_parsing::Error),
24}
25
26const EPS: f64 = 1e-10;
27
28#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
29pub enum Number {
30 Integer(i64),
31 Float(f64),
32}
33impl Number {
34 pub fn as_f64(&self) -> f64 {
35 match self {
36 Number::Integer(i) => *i as f64,
37 Number::Float(f) => *f,
38 }
39 }
40}
41impl Ord for Number {
42 fn cmp(&self, other: &Number) -> std::cmp::Ordering {
43 let s = self.as_f64();
44 let o = other.as_f64();
45 let d = s - o;
46 match d.abs() < EPS {
47 true => std::cmp::Ordering::Equal,
48 false => {
49 if d > 0.0 {
50 return std::cmp::Ordering::Greater;
51 }
52 if d < 0.0 {
53 return std::cmp::Ordering::Less;
54 }
55 std::cmp::Ordering::Equal
56 }
57 }
58 }
59}
60impl Eq for Number {}
61
62#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
63pub enum Separator {
64 Space,
65 Tab,
66 Newline,
67 Char(char),
68}
69
70#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
71pub enum Formatter {
72 Char(char),
73 Joiner, }
75
76#[derive(Debug, Clone, Copy, PartialEq, Eq, Ord, PartialOrd)]
77pub enum Special {
78 Punctuation(char),
79 Symbol(char),
80 Separator(Separator),
81}
82
83#[cfg(feature = "strings")]
84#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
85pub enum Word {
86 Word(String),
87 StrangeWord(String),
88 Numerical(Numerical),
89 Number(Number),
90 Emoji(&'static str),
91}
92
93#[cfg(feature = "strings")]
94#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
95pub enum Numerical {
96 DotSeparated(String),
100 Measures(String),
101 Alphanumeric(String),
102}
103
104#[cfg(feature = "strings")]
105#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
106pub enum Struct {
107 Hashtag(String),
108 Mention(String),
109 }
111
112#[cfg(feature = "strings")]
113#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
114pub enum Unicode {
115 String(String),
116 Formatter(Formatter),
117}
118
119#[cfg(not(feature = "strings"))]
120#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
121pub enum Word {
122 Word,
123 StrangeWord,
124 Numerical(Numerical),
125 Number(Number),
126 Emoji(&'static str),
127}
128
129#[cfg(not(feature = "strings"))]
130#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
131pub enum Numerical {
132 DotSeparated,
136 Measures,
137 Alphanumeric,
138}
139
140#[cfg(not(feature = "strings"))]
141#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
142pub enum Struct {
143 Hashtag,
144 Mention,
145 }
147
148#[cfg(not(feature = "strings"))]
149#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
150pub enum Unicode {
151 String,
152 Formatter(Formatter),
153}
154
155#[cfg(feature = "strings")]
156#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
157pub enum Token {
158 Word(Word),
159 Struct(Struct),
160 Special(Special),
161 Unicode(Unicode),
162}
163
164#[cfg(not(feature = "strings"))]
165#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
166pub enum Token {
167 Word(Word),
168 Struct(Struct),
169 Special(Special),
170 Unicode(Unicode),
171}
172
173#[derive(Debug)]
187pub struct TextStr<'s> {
188 buffer: &'s str,
189 originals: Vec<Local<()>>,
190 breakers: Vec<InnerBound>,
191}
192impl<'s> TextStr<'s> {
193 pub fn new<'a>(s: &'a str) -> Result<TextStr<'a>, Error> {
194 let text = inner_new(s.into_source(), false)?;
195 Ok(TextStr {
196 buffer: s,
197 originals: text.originals,
198 breakers: text.breakers,
199 })
200 }
201}
202
203fn inner_new<S: Source>(mut source: S, with_buffer: bool) -> Result<Text, Error> {
204 let mut text = Text {
205 buffer: String::new(),
206 originals: Vec::new(),
207 breakers: Vec::new(),
208 };
209 while let Some(local_se) = source.next_char().map_err(Error::TextParser)? {
210 let (local, se) = local_se.into_inner();
211 let c = match se {
212 SourceEvent::Char(c) => match c {
213 '\u{0060}' => '\u{0027}',
214 _ => c,
215 },
216 SourceEvent::Breaker(b) => {
217 let (c, opt_b) = match b {
218 Breaker::None => continue,
219 Breaker::Space => (' ', None),
220 Breaker::Line => ('\n', None),
221 Breaker::Word => ('\u{200B}', Some(b)), Breaker::Sentence | Breaker::Paragraph | Breaker::Section => ('\n', Some(b)),
223 };
224 if let Some(b) = opt_b {
225 let br = InnerBound {
226 bytes: Snip {
227 offset: text.buffer.len(),
228 length: c.len_utf8(),
229 },
230 chars: Snip {
231 offset: text.originals.len(),
232 length: 1,
233 },
234 breaker: b,
235 original: Some(local),
236 };
237 text.breakers.push(br);
239 }
240 c
241 }
242 };
243 if with_buffer {
244 text.buffer.push(c);
245 }
246 text.originals.push(local);
247 }
248 Ok(text)
249}
250
251#[derive(Debug)]
252pub struct Text {
253 buffer: String,
254 originals: Vec<Local<()>>,
255 breakers: Vec<InnerBound>,
256}
257impl Text {
258 pub fn new<S: Source>(source: S) -> Result<Text, Error> {
259 inner_new(source, true)
260 }
261 pub fn token_text<'s>(&'s self, token: &TextToken) -> &'s str {
262 let Snip {
263 offset: begin,
264 length: len,
265 } = token.locality.bytes();
266 let end = begin + len;
267 &self.buffer[begin..end]
268 }
269 pub fn text(&self) -> &str {
270 &self.buffer
271 }
272 pub fn original_locality(&self, idx: usize) -> Option<Local<()>> {
273 self.originals.get(idx).copied()
274 }
275 pub fn originals(&self) -> &Vec<Local<()>> {
276 &self.originals
277 }
278}
279
280impl TryFrom<String> for Text {
281 type Error = Error;
282
283 fn try_from(s: String) -> Result<Text, Error> {
284 let mut text = inner_new((&s).into_source(), false)?;
285 text.buffer = s;
286 Ok(text)
287 }
288}
289
290impl TryFrom<&str> for Text {
291 type Error = Error;
292
293 fn try_from(s: &str) -> Result<Text, Error> {
294 Text::new(s.into_source())
295 }
296}
297
298#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
299pub enum Bound {
300 Sentence,
301 Paragraph,
302 Section,
303}
304
305#[cfg(feature = "strings")]
306#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
307pub struct TextToken {
308 locality: Local<()>,
309 original: Option<Local<()>>,
310 pub token: Token2,
311}
312
313#[cfg(not(feature = "strings"))]
314#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
315pub struct TextToken {
316 locality: Local<()>,
317 original: Option<Local<()>>,
318 pub token: Token2,
319}
320
321#[cfg(test)]
322impl TextToken {
323 fn into_original_token_1(self) -> Option<Local<Token>> {
324 match self.original {
325 Some(original) => self.token.into_token().map(|t| original.local(t)),
326 None => None,
327 }
328 }
329}
330
331impl TextToken {
332 pub fn local(&self) -> Local<()> {
333 self.locality
334 }
335 pub fn original(&self) -> Option<Local<()>> {
336 self.original
337 }
338 pub fn into_position(mut self) -> TextToken {
339 self.locality = self.locality.into_position();
340 self.original = self.original.map(|or| or.into_position());
341 self
342 }
343 pub fn try_as_token(&self) -> Result<Token, Bound> {
344 self.token.try_as_token()
345 }
346 pub fn as_original_token(&self) -> Option<Local<&Token2>> {
347 self.original.map(|original| original.local(&self.token))
348 }
349 pub fn into_original_token(self) -> Option<Local<Token2>> {
350 self.original.map(|original| original.local(self.token))
351 }
352 pub fn original_str<'s>(&self, original: &'s str) -> Result<&'s str, OriginalError> {
353 match self.original {
354 Some(local) => {
355 let Snip {
356 offset: begin,
357 length: len,
358 } = local.bytes();
359 let end = begin + len;
360 match original.get(begin..end) {
361 Some(s) => Ok(s),
362 None => Err(OriginalError::InvalidSnip),
363 }
364 }
365 None => Err(OriginalError::NoOriginal),
366 }
367 }
368
369 pub fn test_token(lt: Local<Token2>) -> TextToken {
370 let (local, token) = lt.into_inner();
371 TextToken {
372 locality: local,
373 original: Some(local.local(())),
374 token,
375 }
376 }
377 pub fn test_new(token: Token2, local: Local<()>, original: Option<Local<()>>) -> TextToken {
378 TextToken {
379 locality: local,
380 original,
381 token,
382 }
383 }
384}
385
386#[derive(Debug)]
413pub enum OriginalError {
414 NoOriginal,
415 InvalidSnip,
416}
417
418#[cfg(feature = "strings")]
426#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
427pub enum Token2 {
428 Word(Word),
429 Struct(Struct),
430 Special(Special),
431 Unicode(Unicode),
432
433 Bound(Bound),
434}
435#[cfg(not(feature = "strings"))]
436#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
437pub enum Token2 {
438 Word(Word),
439 Struct(Struct),
440 Special(Special),
441 Unicode(Unicode),
442
443 Bound(Bound),
444}
445impl From<Token> for Token2 {
446 fn from(t: Token) -> Token2 {
447 match t {
448 Token::Word(w) => Token2::Word(w),
449 Token::Struct(s) => Token2::Struct(s),
450 Token::Special(s) => Token2::Special(s),
451 Token::Unicode(u) => Token2::Unicode(u),
452 }
453 }
454}
455impl Token2 {
456 #[cfg(not(feature = "strings"))]
457 fn try_as_token(&self) -> Result<Token, Bound> {
458 (*self).try_into_token()
459 }
460
461 #[cfg(feature = "strings")]
462 fn try_as_token(&self) -> Result<Token, Bound> {
463 self.clone().try_into_token()
464 }
465
466 fn try_into_token(self) -> Result<Token, Bound> {
467 match self {
468 Token2::Word(w) => Ok(Token::Word(w)),
469 Token2::Struct(s) => Ok(Token::Struct(s)),
470 Token2::Special(s) => Ok(Token::Special(s)),
471 Token2::Unicode(u) => Ok(Token::Unicode(u)),
472 Token2::Bound(b) => Err(b),
473 }
474 }
475}
476#[cfg(test)]
477impl Token2 {
478 fn into_token(self) -> Option<Token> {
479 match self {
480 Token2::Word(w) => Some(Token::Word(w)),
481 Token2::Struct(s) => Some(Token::Struct(s)),
482 Token2::Special(s) => Some(Token::Special(s)),
483 Token2::Unicode(u) => Some(Token::Unicode(u)),
484 Token2::Bound(_) => None,
485 }
486 }
487}
488
489#[cfg(test)]
490mod test_v0_5 {
491 use super::*;
492 use text_parsing::{entities, tagger, IntoPipeParser, IntoSource, ParserExt, SourceExt};
493
494 fn basic() {
496 let uws = "<p>Oxana Putan shared the quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc.</p><p> qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово</p>";
511 let text = Text::new({
512 uws.into_source()
513 .pipe(tagger::Builder::new().create().into_breaker())
514 .pipe(entities::Builder::new().create().into_piped())
515 .into_separator()
516 })
517 .unwrap();
518 let lib_res = text
519 .into_tokenizer({
520 TokenizerParams::default()
521 .add_option(TokenizerOptions::SplitDot)
522 .add_option(TokenizerOptions::SplitUnderscore)
523 .add_option(TokenizerOptions::SplitColon)
524 .with_default_sentences()
525 })
526 .collect::<Vec<_>>();
527
528 for tok in lib_res {
529 println!(
530 "C{:?}, B{:?}, {:?} -> {:?}",
531 tok.original.map(|loc| loc.chars()),
532 tok.original.map(|loc| loc.bytes()),
533 tok.token,
534 tok.original_str(uws)
535 );
536 }
537
538 panic!()
539 }
540}
541
542#[cfg(test)]
543#[cfg(feature = "strings")]
544mod test {
545 use super::*;
546 use text_parsing::{
547 entities, tagger, IntoPipeParser, IntoSource, Localize, ParserExt, SourceExt,
548 };
549
550 #[derive(Debug, Clone)]
611 struct CharToken {
612 byte_offset: usize,
613 byte_length: usize,
614 char_offset: usize,
615 char_length: usize,
616 token: Token,
617 }
618 impl Into<Local<Token>> for CharToken {
619 fn into(self) -> Local<Token> {
620 self.token.localize(
621 Snip {
622 offset: self.char_offset,
623 length: self.char_length,
624 },
625 Snip {
626 offset: self.byte_offset,
627 length: self.byte_length,
628 },
629 )
630 }
631 }
632
633 #[derive(Debug, Clone)]
634 struct PositionalToken {
635 source: &'static str,
636 offset: usize,
637 length: usize,
638 token: Token,
639 }
640 impl Into<Local<Token>> for PositionalToken {
641 fn into(self) -> Local<Token> {
642 self.token.localize(
643 Snip {
644 offset: self.source[..self.offset].chars().count(),
645 length: self.source[self.offset..self.offset + self.length]
646 .chars()
647 .count(),
648 },
649 Snip {
650 offset: self.offset,
651 length: self.length,
652 },
653 )
654 }
655 }
656
657 fn check_results(result: &Vec<PositionalToken>, lib_res: &Vec<Local<Token>>, _uws: &str) {
658 assert_eq!(result.len(), lib_res.len());
659 for i in 0..result.len() {
660 let res: Local<Token> = result[i].clone().into();
661 assert_eq!(res, lib_res[i]);
662 }
663 }
664
665 fn check_cresults(result: &Vec<CharToken>, lib_res: &Vec<Local<Token>>, _uws: &str) {
666 assert_eq!(result.len(), lib_res.len());
667 for i in 0..result.len() {
668 let res: Local<Token> = result[i].clone().into();
669 assert_eq!(res, lib_res[i]);
670 }
671 }
672
673 fn check<T: Clone + std::fmt::Debug + Into<Local<Token>>>(
674 res: &Vec<T>,
675 lib: &Vec<Local<Token>>,
676 _uws: &str,
677 ) {
678 let mut lib = lib.iter();
679 let mut res = res.iter().map(|r| {
680 let res: Local<Token> = r.clone().into();
681 res
682 });
683 let mut diff = Vec::new();
684 loop {
685 match (lib.next(), res.next()) {
686 (Some(lw), Some(rw)) => {
687 if *lw != rw {
688 diff.push(format!("LIB: {:?}", lw));
689 diff.push(format!("TEST: {:?}", rw));
690 diff.push("".to_string())
691 }
692 }
693 (Some(lw), None) => {
694 diff.push(format!("LIB: {:?}", lw));
695 diff.push("TEST: ----".to_string());
696 diff.push("".to_string())
697 }
698 (None, Some(rw)) => {
699 diff.push("LIB: ----".to_string());
700 diff.push(format!("TEST: {:?}", rw));
701 diff.push("".to_string())
702 }
703 (None, None) => break,
704 }
705 }
706 if diff.len() > 0 {
707 for ln in &diff {
708 println!("{}", ln);
709 }
710 panic!("Diff count: {}", diff.len() / 3);
711 }
712 }
713
714 #[test]
715 fn spaces() {
716 let uws = " spaces too many apces ";
717 let result = vec![
718 PositionalToken {
719 source: uws,
720 offset: 0,
721 length: 4,
722 token: Token::Special(Special::Separator(Separator::Space)),
723 },
724 PositionalToken {
725 source: uws,
726 offset: 4,
727 length: 6,
728 token: Token::Word(Word::Word("spaces".to_string())),
729 },
730 PositionalToken {
731 source: uws,
732 offset: 10,
733 length: 4,
734 token: Token::Special(Special::Separator(Separator::Space)),
735 },
736 PositionalToken {
737 source: uws,
738 offset: 14,
739 length: 3,
740 token: Token::Word(Word::Word("too".to_string())),
741 },
742 PositionalToken {
743 source: uws,
744 offset: 17,
745 length: 3,
746 token: Token::Special(Special::Separator(Separator::Space)),
747 },
748 PositionalToken {
749 source: uws,
750 offset: 20,
751 length: 4,
752 token: Token::Word(Word::Word("many".to_string())),
753 },
754 PositionalToken {
755 source: uws,
756 offset: 24,
757 length: 3,
758 token: Token::Special(Special::Separator(Separator::Space)),
759 },
760 PositionalToken {
761 source: uws,
762 offset: 27,
763 length: 5,
764 token: Token::Word(Word::Word("apces".to_string())),
765 },
766 PositionalToken {
767 source: uws,
768 offset: 32,
769 length: 3,
770 token: Token::Special(Special::Separator(Separator::Space)),
771 },
772 ];
773 let lib_res = uws
774 .into_tokenizer(TokenizerParams::v1())
775 .collect::<Vec<_>>();
776 check_results(&result, &lib_res, uws);
777 }
779
780 #[test]
781 fn numbers() {
782 let uws = "(() -2\n() -2";
783 let result = vec![
784 PositionalToken {
785 source: uws,
786 offset: 0,
787 length: 1,
788 token: Token::Special(Special::Punctuation('(')),
789 },
790 PositionalToken {
791 source: uws,
792 offset: 1,
793 length: 1,
794 token: Token::Special(Special::Punctuation('(')),
795 },
796 PositionalToken {
797 source: uws,
798 offset: 2,
799 length: 1,
800 token: Token::Special(Special::Punctuation(')')),
801 },
802 PositionalToken {
803 source: uws,
804 offset: 3,
805 length: 1,
806 token: Token::Special(Special::Separator(Separator::Space)),
807 },
808 PositionalToken {
809 source: uws,
810 offset: 4,
811 length: 2,
812 token: Token::Word(Word::Number(Number::Integer(-2))),
813 },
814 PositionalToken {
815 source: uws,
816 offset: 6,
817 length: 1,
818 token: Token::Special(Special::Separator(Separator::Newline)),
819 },
820 PositionalToken {
821 source: uws,
822 offset: 7,
823 length: 1,
824 token: Token::Special(Special::Punctuation('(')),
825 },
826 PositionalToken {
827 source: uws,
828 offset: 8,
829 length: 1,
830 token: Token::Special(Special::Punctuation(')')),
831 },
832 PositionalToken {
833 source: uws,
834 offset: 9,
835 length: 2,
836 token: Token::Special(Special::Separator(Separator::Space)),
837 },
838 PositionalToken {
839 source: uws,
840 offset: 11,
841 length: 2,
842 token: Token::Word(Word::Number(Number::Integer(-2))),
843 },
844 ];
845 let lib_res = uws
846 .into_tokenizer({
847 TokenizerParams::default()
848 .add_option(TokenizerOptions::SplitDot)
849 .add_option(TokenizerOptions::SplitUnderscore)
850 .add_option(TokenizerOptions::SplitColon)
851 .add_option(TokenizerOptions::MergeWhites)
852 })
853 .collect::<Vec<_>>();
854 check_results(&result, &lib_res, uws);
855 }
856
857 #[test]
858 fn word_with_inner_hyphens() {
859 let uws = "Опросы показывают";
860 let result = vec![
861 PositionalToken {
862 source: uws,
863 offset: 0,
864 length: 14,
865 token: Token::Word(Word::StrangeWord("Опросы".to_string())),
866 },
867 PositionalToken {
868 source: uws,
869 offset: 14,
870 length: 1,
871 token: Token::Special(Special::Separator(Separator::Space)),
872 },
873 PositionalToken {
874 source: uws,
875 offset: 15,
876 length: 28,
877 token: Token::Word(Word::StrangeWord("показывают".to_string())),
878 },
879 ];
880 let lib_res = uws
881 .into_tokenizer(TokenizerParams::v1())
882 .collect::<Vec<_>>();
883 check_results(&result, &lib_res, uws);
884 }
885
886 #[test]
887 fn mixed_but_word() {
888 let uws = "L’Oreal";
889 let result = vec![PositionalToken {
890 source: uws,
891 offset: 0,
892 length: 9,
893 token: Token::Word(Word::StrangeWord("L’Oreal".to_string())),
894 }];
895 let lib_res = uws
896 .into_tokenizer(TokenizerParams::v1())
897 .collect::<Vec<_>>();
898 check_results(&result, &lib_res, uws);
899 }
900
901 #[test]
902 fn hashtags() {
903 let uws = "#hashtag#hashtag2";
904 let result = vec![
905 PositionalToken {
906 source: uws,
907 offset: 0,
908 length: 1,
909 token: Token::Special(Special::Punctuation('#')),
910 },
911 PositionalToken {
912 source: uws,
913 offset: 1,
914 length: 7,
915 token: Token::Word(Word::Word("hashtag".to_string())),
916 },
917 PositionalToken {
918 source: uws,
919 offset: 8,
920 length: 1,
921 token: Token::Special(Special::Punctuation('#')),
922 },
923 PositionalToken {
924 source: uws,
925 offset: 9,
926 length: 8,
927 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
928 "hashtag2".to_string(),
929 ))),
930 },
931 ];
932 let lib_res = uws
933 .into_tokenizer(TokenizerParams::v1())
934 .collect::<Vec<_>>();
935 check_results(&result, &lib_res, uws);
936 }
937
938 #[test]
939 fn apostrophe() {
940 let uws = "l'oreal; l\u{0060}oreal";
941 let result = vec![
942 PositionalToken {
943 source: uws,
944 offset: 0,
945 length: 7,
946 token: Token::Word(Word::Word("l'oreal".to_string())),
947 },
948 PositionalToken {
949 source: uws,
950 offset: 7,
951 length: 1,
952 token: Token::Special(Special::Punctuation(';')),
953 },
954 PositionalToken {
955 source: uws,
956 offset: 8,
957 length: 1,
958 token: Token::Special(Special::Separator(Separator::Space)),
959 },
960 PositionalToken {
961 source: uws,
962 offset: 9,
963 length: 7,
964 token: Token::Word(Word::Word("l'oreal".to_string())),
965 },
966 ];
967 let text = Text::new(uws.into_source()).unwrap();
968 let lib_res = text
969 .into_tokenizer(TokenizerParams::v1())
970 .filter_map(|tt| tt.into_original_token_1())
971 .collect::<Vec<_>>();
972 check_results(&result, &lib_res, uws);
973 }
974
975 #[test]
976 fn char_tokens() {
977 let uws = "[Oxana Putan|1712640565] shared the quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово";
978 let result = vec![
979 CharToken {
980 byte_offset: 0,
981 byte_length: 1,
982 char_offset: 0,
983 char_length: 1,
984 token: Token::Special(Special::Punctuation('[')),
985 },
986 CharToken {
987 byte_offset: 1,
988 byte_length: 5,
989 char_offset: 1,
990 char_length: 5,
991 token: Token::Word(Word::Word("Oxana".to_string())),
992 },
993 CharToken {
994 byte_offset: 6,
995 byte_length: 1,
996 char_offset: 6,
997 char_length: 1,
998 token: Token::Special(Special::Separator(Separator::Space)),
999 },
1000 CharToken {
1001 byte_offset: 7,
1002 byte_length: 5,
1003 char_offset: 7,
1004 char_length: 5,
1005 token: Token::Word(Word::Word("Putan".to_string())),
1006 },
1007 CharToken {
1008 byte_offset: 12,
1009 byte_length: 1,
1010 char_offset: 12,
1011 char_length: 1,
1012 token: Token::Special(Special::Punctuation('|')),
1013 },
1014 CharToken {
1015 byte_offset: 13,
1016 byte_length: 10,
1017 char_offset: 13,
1018 char_length: 10,
1019 token: Token::Word(Word::Number(Number::Integer(1712640565))),
1020 },
1021 CharToken {
1022 byte_offset: 23,
1023 byte_length: 1,
1024 char_offset: 23,
1025 char_length: 1,
1026 token: Token::Special(Special::Punctuation(']')),
1027 },
1028 CharToken {
1036 byte_offset: 24,
1037 byte_length: 1,
1038 char_offset: 24,
1039 char_length: 1,
1040 token: Token::Special(Special::Separator(Separator::Space)),
1041 },
1042 CharToken {
1043 byte_offset: 25,
1044 byte_length: 6,
1045 char_offset: 25,
1046 char_length: 6,
1047 token: Token::Word(Word::Word("shared".to_string())),
1048 },
1049 CharToken {
1050 byte_offset: 31,
1051 byte_length: 1,
1052 char_offset: 31,
1053 char_length: 1,
1054 token: Token::Special(Special::Separator(Separator::Space)),
1055 },
1056 CharToken {
1057 byte_offset: 32,
1058 byte_length: 3,
1059 char_offset: 32,
1060 char_length: 3,
1061 token: Token::Word(Word::Word("the".to_string())),
1062 },
1063 CharToken {
1064 byte_offset: 35,
1065 byte_length: 1,
1066 char_offset: 35,
1067 char_length: 1,
1068 token: Token::Special(Special::Separator(Separator::Space)),
1069 },
1070 CharToken {
1071 byte_offset: 36,
1072 byte_length: 5,
1073 char_offset: 36,
1074 char_length: 5,
1075 token: Token::Word(Word::Word("quick".to_string())),
1076 },
1077 CharToken {
1078 byte_offset: 41,
1079 byte_length: 1,
1080 char_offset: 41,
1081 char_length: 1,
1082 token: Token::Special(Special::Separator(Separator::Space)),
1083 },
1084 CharToken {
1085 byte_offset: 42,
1086 byte_length: 1,
1087 char_offset: 42,
1088 char_length: 1,
1089 token: Token::Special(Special::Punctuation('(')),
1090 },
1091 CharToken {
1092 byte_offset: 43,
1093 byte_length: 1,
1094 char_offset: 43,
1095 char_length: 1,
1096 token: Token::Special(Special::Punctuation('"')),
1097 },
1098 CharToken {
1099 byte_offset: 44,
1100 byte_length: 5,
1101 char_offset: 44,
1102 char_length: 5,
1103 token: Token::Word(Word::Word("brown".to_string())),
1104 },
1105 CharToken {
1106 byte_offset: 49,
1107 byte_length: 1,
1108 char_offset: 49,
1109 char_length: 1,
1110 token: Token::Special(Special::Punctuation('"')),
1111 },
1112 CharToken {
1113 byte_offset: 50,
1114 byte_length: 1,
1115 char_offset: 50,
1116 char_length: 1,
1117 token: Token::Special(Special::Punctuation(')')),
1118 },
1119 CharToken {
1120 byte_offset: 51,
1121 byte_length: 1,
1122 char_offset: 51,
1123 char_length: 1,
1124 token: Token::Special(Special::Separator(Separator::Space)),
1125 },
1126 CharToken {
1127 byte_offset: 52,
1128 byte_length: 3,
1129 char_offset: 52,
1130 char_length: 3,
1131 token: Token::Word(Word::Word("fox".to_string())),
1132 },
1133 CharToken {
1134 byte_offset: 55,
1135 byte_length: 1,
1136 char_offset: 55,
1137 char_length: 1,
1138 token: Token::Special(Special::Separator(Separator::Space)),
1139 },
1140 CharToken {
1141 byte_offset: 56,
1142 byte_length: 5,
1143 char_offset: 56,
1144 char_length: 5,
1145 token: Token::Word(Word::Word("can\'t".to_string())),
1146 },
1147 CharToken {
1148 byte_offset: 61,
1149 byte_length: 1,
1150 char_offset: 61,
1151 char_length: 1,
1152 token: Token::Special(Special::Separator(Separator::Space)),
1153 },
1154 CharToken {
1155 byte_offset: 62,
1156 byte_length: 4,
1157 char_offset: 62,
1158 char_length: 4,
1159 token: Token::Word(Word::Word("jump".to_string())),
1160 },
1161 CharToken {
1162 byte_offset: 66,
1163 byte_length: 1,
1164 char_offset: 66,
1165 char_length: 1,
1166 token: Token::Special(Special::Separator(Separator::Space)),
1167 },
1168 CharToken {
1169 byte_offset: 67,
1170 byte_length: 4,
1171 char_offset: 67,
1172 char_length: 4,
1173 token: Token::Word(Word::Number(Number::Float(32.3))),
1174 },
1175 CharToken {
1176 byte_offset: 71,
1177 byte_length: 1,
1178 char_offset: 71,
1179 char_length: 1,
1180 token: Token::Special(Special::Separator(Separator::Space)),
1181 },
1182 CharToken {
1183 byte_offset: 72,
1184 byte_length: 4,
1185 char_offset: 72,
1186 char_length: 4,
1187 token: Token::Word(Word::Word("feet".to_string())),
1188 },
1189 CharToken {
1190 byte_offset: 76,
1191 byte_length: 1,
1192 char_offset: 76,
1193 char_length: 1,
1194 token: Token::Special(Special::Punctuation(',')),
1195 },
1196 CharToken {
1197 byte_offset: 77,
1198 byte_length: 1,
1199 char_offset: 77,
1200 char_length: 1,
1201 token: Token::Special(Special::Separator(Separator::Space)),
1202 },
1203 CharToken {
1204 byte_offset: 78,
1205 byte_length: 5,
1206 char_offset: 78,
1207 char_length: 5,
1208 token: Token::Word(Word::Word("right".to_string())),
1209 },
1210 CharToken {
1211 byte_offset: 83,
1212 byte_length: 1,
1213 char_offset: 83,
1214 char_length: 1,
1215 token: Token::Special(Special::Punctuation('?')),
1216 },
1217 CharToken {
1218 byte_offset: 84,
1219 byte_length: 1,
1220 char_offset: 84,
1221 char_length: 1,
1222 token: Token::Special(Special::Separator(Separator::Space)),
1223 },
1224 CharToken {
1225 byte_offset: 85,
1226 byte_length: 4,
1227 char_offset: 85,
1228 char_length: 4,
1229 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
1230 },
1231 CharToken {
1232 byte_offset: 89,
1233 byte_length: 1,
1234 char_offset: 89,
1235 char_length: 1,
1236 token: Token::Special(Special::Separator(Separator::Space)),
1237 },
1238 CharToken {
1239 byte_offset: 90,
1240 byte_length: 3,
1241 char_offset: 90,
1242 char_length: 3,
1243 token: Token::Word(Word::Word("etc".to_string())),
1244 },
1245 CharToken {
1246 byte_offset: 93,
1247 byte_length: 1,
1248 char_offset: 93,
1249 char_length: 1,
1250 token: Token::Special(Special::Punctuation('.')),
1251 },
1252 CharToken {
1253 byte_offset: 94,
1254 byte_length: 1,
1255 char_offset: 94,
1256 char_length: 1,
1257 token: Token::Special(Special::Separator(Separator::Space)),
1258 },
1259 CharToken {
1260 byte_offset: 95,
1261 byte_length: 3,
1262 char_offset: 95,
1263 char_length: 3,
1264 token: Token::Word(Word::Word("qeq".to_string())),
1265 },
1266 CharToken {
1267 byte_offset: 98,
1268 byte_length: 1,
1269 char_offset: 98,
1270 char_length: 1,
1271 token: Token::Special(Special::Separator(Separator::Space)),
1272 },
1273 CharToken {
1274 byte_offset: 99,
1275 byte_length: 5,
1276 char_offset: 99,
1277 char_length: 5,
1278 token: Token::Word(Word::Word("U.S.A".to_string())),
1279 },
1280 CharToken {
1281 byte_offset: 104,
1282 byte_length: 2,
1283 char_offset: 104,
1284 char_length: 2,
1285 token: Token::Special(Special::Separator(Separator::Space)),
1286 },
1287 CharToken {
1288 byte_offset: 106,
1289 byte_length: 3,
1290 char_offset: 106,
1291 char_length: 3,
1292 token: Token::Word(Word::Word("asd".to_string())),
1293 },
1294 CharToken {
1295 byte_offset: 109,
1296 byte_length: 3,
1297 char_offset: 109,
1298 char_length: 3,
1299 token: Token::Special(Special::Separator(Separator::Newline)),
1300 },
1301 CharToken {
1302 byte_offset: 112,
1303 byte_length: 3,
1304 char_offset: 112,
1305 char_length: 3,
1306 token: Token::Word(Word::Word("Brr".to_string())),
1307 },
1308 CharToken {
1309 byte_offset: 115,
1310 byte_length: 1,
1311 char_offset: 115,
1312 char_length: 1,
1313 token: Token::Special(Special::Punctuation(',')),
1314 },
1315 CharToken {
1316 byte_offset: 116,
1317 byte_length: 1,
1318 char_offset: 116,
1319 char_length: 1,
1320 token: Token::Special(Special::Separator(Separator::Space)),
1321 },
1322 CharToken {
1323 byte_offset: 117,
1324 byte_length: 4,
1325 char_offset: 117,
1326 char_length: 4,
1327 token: Token::Word(Word::Word("it\'s".to_string())),
1328 },
1329 CharToken {
1330 byte_offset: 121,
1331 byte_length: 1,
1332 char_offset: 121,
1333 char_length: 1,
1334 token: Token::Special(Special::Separator(Separator::Space)),
1335 },
1336 CharToken {
1337 byte_offset: 122,
1338 byte_length: 4,
1339 char_offset: 122,
1340 char_length: 4,
1341 token: Token::Word(Word::Number(Number::Float(29.3))),
1342 },
1343 CharToken {
1344 byte_offset: 126,
1345 byte_length: 2,
1346 char_offset: 126,
1347 char_length: 1,
1348 token: Token::Special(Special::Symbol('°')),
1349 },
1350 CharToken {
1351 byte_offset: 128,
1352 byte_length: 1,
1353 char_offset: 127,
1354 char_length: 1,
1355 token: Token::Word(Word::Word("F".to_string())),
1356 },
1357 CharToken {
1358 byte_offset: 129,
1359 byte_length: 1,
1360 char_offset: 128,
1361 char_length: 1,
1362 token: Token::Special(Special::Punctuation('!')),
1363 },
1364 CharToken {
1365 byte_offset: 130,
1366 byte_length: 1,
1367 char_offset: 129,
1368 char_length: 1,
1369 token: Token::Special(Special::Separator(Separator::Newline)),
1370 },
1371 CharToken {
1372 byte_offset: 131,
1373 byte_length: 1,
1374 char_offset: 130,
1375 char_length: 1,
1376 token: Token::Special(Special::Separator(Separator::Space)),
1377 },
1378 CharToken {
1379 byte_offset: 132,
1380 byte_length: 14,
1381 char_offset: 131,
1382 char_length: 7,
1383 token: Token::Word(Word::Word("Русское".to_string())),
1384 },
1385 CharToken {
1386 byte_offset: 146,
1387 byte_length: 1,
1388 char_offset: 138,
1389 char_length: 1,
1390 token: Token::Special(Special::Separator(Separator::Space)),
1391 },
1392 CharToken {
1393 byte_offset: 147,
1394 byte_length: 22,
1395 char_offset: 139,
1396 char_length: 11,
1397 token: Token::Word(Word::Word("предложение".to_string())),
1398 },
1399 CharToken {
1400 byte_offset: 169,
1401 byte_length: 1,
1402 char_offset: 150,
1403 char_length: 1,
1404 token: Token::Special(Special::Separator(Separator::Space)),
1405 },
1406 CharToken {
1407 byte_offset: 170,
1408 byte_length: 5,
1409 char_offset: 151,
1410 char_length: 5,
1411 token: Token::Struct(Struct::Hashtag("36.6".to_string())),
1412 },
1413 CharToken {
1414 byte_offset: 175,
1415 byte_length: 1,
1416 char_offset: 156,
1417 char_length: 1,
1418 token: Token::Special(Special::Separator(Separator::Space)),
1419 },
1420 CharToken {
1421 byte_offset: 176,
1422 byte_length: 6,
1423 char_offset: 157,
1424 char_length: 3,
1425 token: Token::Word(Word::Word("для".to_string())),
1426 },
1427 CharToken {
1428 byte_offset: 182,
1429 byte_length: 1,
1430 char_offset: 160,
1431 char_length: 1,
1432 token: Token::Special(Special::Separator(Separator::Space)),
1433 },
1434 CharToken {
1435 byte_offset: 183,
1436 byte_length: 24,
1437 char_offset: 161,
1438 char_length: 12,
1439 token: Token::Word(Word::Word("тестирования".to_string())),
1440 },
1441 CharToken {
1442 byte_offset: 207,
1443 byte_length: 1,
1444 char_offset: 173,
1445 char_length: 1,
1446 token: Token::Special(Special::Separator(Separator::Space)),
1447 },
1448 CharToken {
1449 byte_offset: 208,
1450 byte_length: 14,
1451 char_offset: 174,
1452 char_length: 7,
1453 token: Token::Word(Word::Word("деления".to_string())),
1454 },
1455 CharToken {
1456 byte_offset: 222,
1457 byte_length: 1,
1458 char_offset: 181,
1459 char_length: 1,
1460 token: Token::Special(Special::Separator(Separator::Space)),
1461 },
1462 CharToken {
1463 byte_offset: 223,
1464 byte_length: 4,
1465 char_offset: 182,
1466 char_length: 2,
1467 token: Token::Word(Word::Word("по".to_string())),
1468 },
1469 CharToken {
1470 byte_offset: 227,
1471 byte_length: 1,
1472 char_offset: 184,
1473 char_length: 1,
1474 token: Token::Special(Special::Separator(Separator::Space)),
1475 },
1476 CharToken {
1477 byte_offset: 228,
1478 byte_length: 12,
1479 char_offset: 185,
1480 char_length: 6,
1481 token: Token::Word(Word::Word("юникод".to_string())),
1482 },
1483 CharToken {
1484 byte_offset: 240,
1485 byte_length: 1,
1486 char_offset: 191,
1487 char_length: 1,
1488 token: Token::Special(Special::Punctuation('-')),
1489 },
1490 CharToken {
1491 byte_offset: 241,
1492 byte_length: 12,
1493 char_offset: 192,
1494 char_length: 6,
1495 token: Token::Word(Word::Word("словам".to_string())),
1496 },
1497 CharToken {
1498 byte_offset: 253,
1499 byte_length: 3,
1500 char_offset: 198,
1501 char_length: 3,
1502 token: Token::Special(Special::Punctuation('.')),
1503 },
1504 CharToken {
1505 byte_offset: 256,
1506 byte_length: 1,
1507 char_offset: 201,
1508 char_length: 1,
1509 token: Token::Special(Special::Separator(Separator::Newline)),
1510 },
1511 CharToken {
1512 byte_offset: 257,
1513 byte_length: 8,
1514 char_offset: 202,
1515 char_length: 2,
1516 token: Token::Word(Word::Emoji("russia")),
1517 },
1518 CharToken {
1519 byte_offset: 265,
1520 byte_length: 1,
1521 char_offset: 204,
1522 char_length: 1,
1523 token: Token::Special(Special::Separator(Separator::Space)),
1524 },
1525 CharToken {
1526 byte_offset: 266,
1527 byte_length: 8,
1528 char_offset: 205,
1529 char_length: 2,
1530 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
1531 },
1532 CharToken {
1533 byte_offset: 274,
1534 byte_length: 1,
1535 char_offset: 207,
1536 char_length: 1,
1537 token: Token::Special(Special::Separator(Separator::Newline)),
1538 },
1539 CharToken {
1540 byte_offset: 275,
1541 byte_length: 8,
1542 char_offset: 208,
1543 char_length: 2,
1544 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
1545 },
1546 CharToken {
1547 byte_offset: 283,
1548 byte_length: 8,
1549 char_offset: 210,
1550 char_length: 2,
1551 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
1552 },
1553 CharToken {
1554 byte_offset: 291,
1555 byte_length: 8,
1556 char_offset: 212,
1557 char_length: 2,
1558 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
1559 },
1560 CharToken {
1561 byte_offset: 299,
1562 byte_length: 1,
1563 char_offset: 214,
1564 char_length: 1,
1565 token: Token::Special(Special::Separator(Separator::Newline)),
1566 },
1567 CharToken {
1568 byte_offset: 300,
1569 byte_length: 1,
1570 char_offset: 215,
1571 char_length: 1,
1572 token: Token::Special(Special::Punctuation('+')),
1573 },
1574 CharToken {
1575 byte_offset: 301,
1576 byte_length: 4,
1577 char_offset: 216,
1578 char_length: 4,
1579 token: Token::Word(Word::Word("Done".to_string())),
1580 },
1581 CharToken {
1582 byte_offset: 305,
1583 byte_length: 1,
1584 char_offset: 220,
1585 char_length: 1,
1586 token: Token::Special(Special::Punctuation('!')),
1587 },
1588 CharToken {
1589 byte_offset: 306,
1590 byte_length: 1,
1591 char_offset: 221,
1592 char_length: 1,
1593 token: Token::Special(Special::Separator(Separator::Space)),
1594 },
1595 CharToken {
1596 byte_offset: 307,
1597 byte_length: 12,
1598 char_offset: 222,
1599 char_length: 6,
1600 token: Token::Word(Word::Word("Готово".to_string())),
1601 },
1602 ];
1603
1604 let lib_res = uws
1605 .into_tokenizer(TokenizerParams::complex())
1606 .collect::<Vec<_>>();
1607
1608 check_cresults(&result, &lib_res, uws);
1610 }
1611
1612 #[test]
1613 fn general_default() {
1614 let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
1615 let result = vec![
1616 PositionalToken {
1617 source: uws,
1618 offset: 0,
1619 length: 3,
1620 token: Token::Word(Word::Word("The".to_string())),
1621 },
1622 PositionalToken {
1623 source: uws,
1624 offset: 3,
1625 length: 1,
1626 token: Token::Special(Special::Separator(Separator::Space)),
1627 },
1628 PositionalToken {
1629 source: uws,
1630 offset: 4,
1631 length: 5,
1632 token: Token::Word(Word::Word("quick".to_string())),
1633 },
1634 PositionalToken {
1635 source: uws,
1636 offset: 9,
1637 length: 1,
1638 token: Token::Special(Special::Separator(Separator::Space)),
1639 },
1640 PositionalToken {
1641 source: uws,
1642 offset: 10,
1643 length: 1,
1644 token: Token::Special(Special::Punctuation('(')),
1645 },
1646 PositionalToken {
1647 source: uws,
1648 offset: 11,
1649 length: 1,
1650 token: Token::Special(Special::Punctuation('"')),
1651 },
1652 PositionalToken {
1653 source: uws,
1654 offset: 12,
1655 length: 5,
1656 token: Token::Word(Word::Word("brown".to_string())),
1657 },
1658 PositionalToken {
1659 source: uws,
1660 offset: 17,
1661 length: 1,
1662 token: Token::Special(Special::Punctuation('"')),
1663 },
1664 PositionalToken {
1665 source: uws,
1666 offset: 18,
1667 length: 1,
1668 token: Token::Special(Special::Punctuation(')')),
1669 },
1670 PositionalToken {
1671 source: uws,
1672 offset: 19,
1673 length: 1,
1674 token: Token::Special(Special::Separator(Separator::Space)),
1675 },
1676 PositionalToken {
1677 source: uws,
1678 offset: 20,
1679 length: 3,
1680 token: Token::Word(Word::Word("fox".to_string())),
1681 },
1682 PositionalToken {
1683 source: uws,
1684 offset: 23,
1685 length: 1,
1686 token: Token::Special(Special::Separator(Separator::Space)),
1687 },
1688 PositionalToken {
1689 source: uws,
1690 offset: 24,
1691 length: 5,
1692 token: Token::Word(Word::Word("can\'t".to_string())),
1693 },
1694 PositionalToken {
1695 source: uws,
1696 offset: 29,
1697 length: 1,
1698 token: Token::Special(Special::Separator(Separator::Space)),
1699 },
1700 PositionalToken {
1701 source: uws,
1702 offset: 30,
1703 length: 4,
1704 token: Token::Word(Word::Word("jump".to_string())),
1705 },
1706 PositionalToken {
1707 source: uws,
1708 offset: 34,
1709 length: 1,
1710 token: Token::Special(Special::Separator(Separator::Space)),
1711 },
1712 PositionalToken {
1713 source: uws,
1714 offset: 35,
1715 length: 4,
1716 token: Token::Word(Word::Number(Number::Float(32.3))),
1717 },
1718 PositionalToken {
1719 source: uws,
1720 offset: 39,
1721 length: 1,
1722 token: Token::Special(Special::Separator(Separator::Space)),
1723 },
1724 PositionalToken {
1725 source: uws,
1726 offset: 40,
1727 length: 4,
1728 token: Token::Word(Word::Word("feet".to_string())),
1729 },
1730 PositionalToken {
1731 source: uws,
1732 offset: 44,
1733 length: 1,
1734 token: Token::Special(Special::Punctuation(',')),
1735 },
1736 PositionalToken {
1737 source: uws,
1738 offset: 45,
1739 length: 1,
1740 token: Token::Special(Special::Separator(Separator::Space)),
1741 },
1742 PositionalToken {
1743 source: uws,
1744 offset: 46,
1745 length: 5,
1746 token: Token::Word(Word::Word("right".to_string())),
1747 },
1748 PositionalToken {
1749 source: uws,
1750 offset: 51,
1751 length: 1,
1752 token: Token::Special(Special::Punctuation('?')),
1753 },
1754 PositionalToken {
1755 source: uws,
1756 offset: 52,
1757 length: 1,
1758 token: Token::Special(Special::Separator(Separator::Space)),
1759 },
1760 PositionalToken {
1761 source: uws,
1762 offset: 53,
1763 length: 4,
1764 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
1765 }, PositionalToken {
1767 source: uws,
1768 offset: 57,
1769 length: 1,
1770 token: Token::Special(Special::Separator(Separator::Space)),
1771 },
1772 PositionalToken {
1773 source: uws,
1774 offset: 58,
1775 length: 3,
1776 token: Token::Word(Word::Word("etc".to_string())),
1777 },
1778 PositionalToken {
1779 source: uws,
1780 offset: 61,
1781 length: 1,
1782 token: Token::Special(Special::Punctuation('.')),
1783 },
1784 PositionalToken {
1785 source: uws,
1786 offset: 62,
1787 length: 1,
1788 token: Token::Special(Special::Separator(Separator::Space)),
1789 },
1790 PositionalToken {
1791 source: uws,
1792 offset: 63,
1793 length: 3,
1794 token: Token::Word(Word::Word("qeq".to_string())),
1795 },
1796 PositionalToken {
1797 source: uws,
1798 offset: 66,
1799 length: 1,
1800 token: Token::Special(Special::Separator(Separator::Space)),
1801 },
1802 PositionalToken {
1803 source: uws,
1804 offset: 67,
1805 length: 1,
1806 token: Token::Word(Word::Word("U".to_string())),
1807 },
1808 PositionalToken {
1809 source: uws,
1810 offset: 68,
1811 length: 1,
1812 token: Token::Special(Special::Punctuation('.')),
1813 },
1814 PositionalToken {
1815 source: uws,
1816 offset: 69,
1817 length: 1,
1818 token: Token::Word(Word::Word("S".to_string())),
1819 },
1820 PositionalToken {
1821 source: uws,
1822 offset: 70,
1823 length: 1,
1824 token: Token::Special(Special::Punctuation('.')),
1825 },
1826 PositionalToken {
1827 source: uws,
1828 offset: 71,
1829 length: 1,
1830 token: Token::Word(Word::Word("A".to_string())),
1831 },
1832 PositionalToken {
1833 source: uws,
1834 offset: 72,
1835 length: 2,
1836 token: Token::Special(Special::Separator(Separator::Space)),
1837 },
1838 PositionalToken {
1839 source: uws,
1840 offset: 74,
1841 length: 3,
1842 token: Token::Word(Word::Word("asd".to_string())),
1843 },
1844 PositionalToken {
1845 source: uws,
1846 offset: 77,
1847 length: 3,
1848 token: Token::Special(Special::Separator(Separator::Newline)),
1849 },
1850 PositionalToken {
1851 source: uws,
1852 offset: 80,
1853 length: 3,
1854 token: Token::Word(Word::Word("Brr".to_string())),
1855 },
1856 PositionalToken {
1857 source: uws,
1858 offset: 83,
1859 length: 1,
1860 token: Token::Special(Special::Punctuation(',')),
1861 },
1862 PositionalToken {
1863 source: uws,
1864 offset: 84,
1865 length: 1,
1866 token: Token::Special(Special::Separator(Separator::Space)),
1867 },
1868 PositionalToken {
1869 source: uws,
1870 offset: 85,
1871 length: 4,
1872 token: Token::Word(Word::Word("it\'s".to_string())),
1873 },
1874 PositionalToken {
1875 source: uws,
1876 offset: 89,
1877 length: 1,
1878 token: Token::Special(Special::Separator(Separator::Space)),
1879 },
1880 PositionalToken {
1881 source: uws,
1882 offset: 90,
1883 length: 4,
1884 token: Token::Word(Word::Number(Number::Float(29.3))),
1885 },
1886 PositionalToken {
1887 source: uws,
1888 offset: 94,
1889 length: 2,
1890 token: Token::Special(Special::Symbol('°')),
1891 },
1892 PositionalToken {
1893 source: uws,
1894 offset: 96,
1895 length: 1,
1896 token: Token::Word(Word::Word("F".to_string())),
1897 },
1898 PositionalToken {
1899 source: uws,
1900 offset: 97,
1901 length: 1,
1902 token: Token::Special(Special::Punctuation('!')),
1903 },
1904 PositionalToken {
1905 source: uws,
1906 offset: 98,
1907 length: 1,
1908 token: Token::Special(Special::Separator(Separator::Newline)),
1909 },
1910 PositionalToken {
1911 source: uws,
1912 offset: 99,
1913 length: 1,
1914 token: Token::Special(Special::Separator(Separator::Space)),
1915 },
1916 PositionalToken {
1917 source: uws,
1918 offset: 100,
1919 length: 14,
1920 token: Token::Word(Word::Word("Русское".to_string())),
1921 },
1922 PositionalToken {
1923 source: uws,
1924 offset: 114,
1925 length: 1,
1926 token: Token::Special(Special::Separator(Separator::Space)),
1927 },
1928 PositionalToken {
1929 source: uws,
1930 offset: 115,
1931 length: 22,
1932 token: Token::Word(Word::Word("предложение".to_string())),
1933 },
1934 PositionalToken {
1935 source: uws,
1936 offset: 137,
1937 length: 1,
1938 token: Token::Special(Special::Separator(Separator::Space)),
1939 },
1940 PositionalToken {
1941 source: uws,
1942 offset: 138,
1943 length: 1,
1944 token: Token::Special(Special::Punctuation('#')),
1945 },
1946 PositionalToken {
1947 source: uws,
1948 offset: 139,
1949 length: 4,
1950 token: Token::Word(Word::Number(Number::Float(36.6))),
1951 },
1952 PositionalToken {
1953 source: uws,
1954 offset: 143,
1955 length: 1,
1956 token: Token::Special(Special::Separator(Separator::Space)),
1957 },
1958 PositionalToken {
1959 source: uws,
1960 offset: 144,
1961 length: 6,
1962 token: Token::Word(Word::Word("для".to_string())),
1963 },
1964 PositionalToken {
1965 source: uws,
1966 offset: 150,
1967 length: 1,
1968 token: Token::Special(Special::Separator(Separator::Space)),
1969 },
1970 PositionalToken {
1971 source: uws,
1972 offset: 151,
1973 length: 24,
1974 token: Token::Word(Word::Word("тестирования".to_string())),
1975 },
1976 PositionalToken {
1977 source: uws,
1978 offset: 175,
1979 length: 1,
1980 token: Token::Special(Special::Separator(Separator::Space)),
1981 },
1982 PositionalToken {
1983 source: uws,
1984 offset: 176,
1985 length: 14,
1986 token: Token::Word(Word::Word("деления".to_string())),
1987 },
1988 PositionalToken {
1989 source: uws,
1990 offset: 190,
1991 length: 1,
1992 token: Token::Special(Special::Separator(Separator::Space)),
1993 },
1994 PositionalToken {
1995 source: uws,
1996 offset: 191,
1997 length: 4,
1998 token: Token::Word(Word::Word("по".to_string())),
1999 },
2000 PositionalToken {
2001 source: uws,
2002 offset: 195,
2003 length: 1,
2004 token: Token::Special(Special::Separator(Separator::Space)),
2005 },
2006 PositionalToken {
2007 source: uws,
2008 offset: 196,
2009 length: 12,
2010 token: Token::Word(Word::Word("юникод".to_string())),
2011 },
2012 PositionalToken {
2013 source: uws,
2014 offset: 208,
2015 length: 1,
2016 token: Token::Special(Special::Punctuation('-')),
2017 },
2018 PositionalToken {
2019 source: uws,
2020 offset: 209,
2021 length: 12,
2022 token: Token::Word(Word::Word("словам".to_string())),
2023 },
2024 PositionalToken {
2025 source: uws,
2026 offset: 221,
2027 length: 3,
2028 token: Token::Special(Special::Punctuation('.')),
2029 },
2030 PositionalToken {
2031 source: uws,
2032 offset: 224,
2033 length: 1,
2034 token: Token::Special(Special::Separator(Separator::Newline)),
2035 },
2036 ];
2037 let lib_res = uws
2038 .into_tokenizer(TokenizerParams::v1())
2039 .collect::<Vec<_>>();
2040 check_results(&result, &lib_res, uws);
2041 }
2042
2043 #[test]
2044 fn general_no_split() {
2045 let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
2046 let result = vec![
2047 PositionalToken {
2048 source: uws,
2049 offset: 0,
2050 length: 3,
2051 token: Token::Word(Word::Word("The".to_string())),
2052 },
2053 PositionalToken {
2054 source: uws,
2055 offset: 3,
2056 length: 1,
2057 token: Token::Special(Special::Separator(Separator::Space)),
2058 },
2059 PositionalToken {
2060 source: uws,
2061 offset: 4,
2062 length: 5,
2063 token: Token::Word(Word::Word("quick".to_string())),
2064 },
2065 PositionalToken {
2066 source: uws,
2067 offset: 9,
2068 length: 1,
2069 token: Token::Special(Special::Separator(Separator::Space)),
2070 },
2071 PositionalToken {
2072 source: uws,
2073 offset: 10,
2074 length: 1,
2075 token: Token::Special(Special::Punctuation('(')),
2076 },
2077 PositionalToken {
2078 source: uws,
2079 offset: 11,
2080 length: 1,
2081 token: Token::Special(Special::Punctuation('"')),
2082 },
2083 PositionalToken {
2084 source: uws,
2085 offset: 12,
2086 length: 5,
2087 token: Token::Word(Word::Word("brown".to_string())),
2088 },
2089 PositionalToken {
2090 source: uws,
2091 offset: 17,
2092 length: 1,
2093 token: Token::Special(Special::Punctuation('"')),
2094 },
2095 PositionalToken {
2096 source: uws,
2097 offset: 18,
2098 length: 1,
2099 token: Token::Special(Special::Punctuation(')')),
2100 },
2101 PositionalToken {
2102 source: uws,
2103 offset: 19,
2104 length: 1,
2105 token: Token::Special(Special::Separator(Separator::Space)),
2106 },
2107 PositionalToken {
2108 source: uws,
2109 offset: 20,
2110 length: 3,
2111 token: Token::Word(Word::Word("fox".to_string())),
2112 },
2113 PositionalToken {
2114 source: uws,
2115 offset: 23,
2116 length: 1,
2117 token: Token::Special(Special::Separator(Separator::Space)),
2118 },
2119 PositionalToken {
2120 source: uws,
2121 offset: 24,
2122 length: 5,
2123 token: Token::Word(Word::Word("can\'t".to_string())),
2124 },
2125 PositionalToken {
2126 source: uws,
2127 offset: 29,
2128 length: 1,
2129 token: Token::Special(Special::Separator(Separator::Space)),
2130 },
2131 PositionalToken {
2132 source: uws,
2133 offset: 30,
2134 length: 4,
2135 token: Token::Word(Word::Word("jump".to_string())),
2136 },
2137 PositionalToken {
2138 source: uws,
2139 offset: 34,
2140 length: 1,
2141 token: Token::Special(Special::Separator(Separator::Space)),
2142 },
2143 PositionalToken {
2144 source: uws,
2145 offset: 35,
2146 length: 4,
2147 token: Token::Word(Word::Number(Number::Float(32.3))),
2148 },
2149 PositionalToken {
2150 source: uws,
2151 offset: 39,
2152 length: 1,
2153 token: Token::Special(Special::Separator(Separator::Space)),
2154 },
2155 PositionalToken {
2156 source: uws,
2157 offset: 40,
2158 length: 4,
2159 token: Token::Word(Word::Word("feet".to_string())),
2160 },
2161 PositionalToken {
2162 source: uws,
2163 offset: 44,
2164 length: 1,
2165 token: Token::Special(Special::Punctuation(',')),
2166 },
2167 PositionalToken {
2168 source: uws,
2169 offset: 45,
2170 length: 1,
2171 token: Token::Special(Special::Separator(Separator::Space)),
2172 },
2173 PositionalToken {
2174 source: uws,
2175 offset: 46,
2176 length: 5,
2177 token: Token::Word(Word::Word("right".to_string())),
2178 },
2179 PositionalToken {
2180 source: uws,
2181 offset: 51,
2182 length: 1,
2183 token: Token::Special(Special::Punctuation('?')),
2184 },
2185 PositionalToken {
2186 source: uws,
2187 offset: 52,
2188 length: 1,
2189 token: Token::Special(Special::Separator(Separator::Space)),
2190 },
2191 PositionalToken {
2192 source: uws,
2193 offset: 53,
2194 length: 4,
2195 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
2196 }, PositionalToken {
2198 source: uws,
2199 offset: 57,
2200 length: 1,
2201 token: Token::Special(Special::Separator(Separator::Space)),
2202 },
2203 PositionalToken {
2204 source: uws,
2205 offset: 58,
2206 length: 3,
2207 token: Token::Word(Word::Word("etc".to_string())),
2208 },
2209 PositionalToken {
2210 source: uws,
2211 offset: 61,
2212 length: 1,
2213 token: Token::Special(Special::Punctuation('.')),
2214 },
2215 PositionalToken {
2216 source: uws,
2217 offset: 62,
2218 length: 1,
2219 token: Token::Special(Special::Separator(Separator::Space)),
2220 },
2221 PositionalToken {
2222 source: uws,
2223 offset: 63,
2224 length: 3,
2225 token: Token::Word(Word::Word("qeq".to_string())),
2226 },
2227 PositionalToken {
2228 source: uws,
2229 offset: 66,
2230 length: 1,
2231 token: Token::Special(Special::Separator(Separator::Space)),
2232 },
2233 PositionalToken {
2234 source: uws,
2235 offset: 67,
2236 length: 5,
2237 token: Token::Word(Word::Word("U.S.A".to_string())),
2238 },
2239 PositionalToken {
2240 source: uws,
2241 offset: 72,
2242 length: 1,
2243 token: Token::Special(Special::Separator(Separator::Space)),
2244 },
2245 PositionalToken {
2246 source: uws,
2247 offset: 73,
2248 length: 1,
2249 token: Token::Special(Special::Separator(Separator::Space)),
2250 },
2251 PositionalToken {
2252 source: uws,
2253 offset: 74,
2254 length: 3,
2255 token: Token::Word(Word::Word("asd".to_string())),
2256 },
2257 PositionalToken {
2258 source: uws,
2259 offset: 77,
2260 length: 1,
2261 token: Token::Special(Special::Separator(Separator::Newline)),
2262 },
2263 PositionalToken {
2264 source: uws,
2265 offset: 78,
2266 length: 1,
2267 token: Token::Special(Special::Separator(Separator::Newline)),
2268 },
2269 PositionalToken {
2270 source: uws,
2271 offset: 79,
2272 length: 1,
2273 token: Token::Special(Special::Separator(Separator::Newline)),
2274 },
2275 PositionalToken {
2276 source: uws,
2277 offset: 80,
2278 length: 3,
2279 token: Token::Word(Word::Word("Brr".to_string())),
2280 },
2281 PositionalToken {
2282 source: uws,
2283 offset: 83,
2284 length: 1,
2285 token: Token::Special(Special::Punctuation(',')),
2286 },
2287 PositionalToken {
2288 source: uws,
2289 offset: 84,
2290 length: 1,
2291 token: Token::Special(Special::Separator(Separator::Space)),
2292 },
2293 PositionalToken {
2294 source: uws,
2295 offset: 85,
2296 length: 4,
2297 token: Token::Word(Word::Word("it\'s".to_string())),
2298 },
2299 PositionalToken {
2300 source: uws,
2301 offset: 89,
2302 length: 1,
2303 token: Token::Special(Special::Separator(Separator::Space)),
2304 },
2305 PositionalToken {
2306 source: uws,
2307 offset: 90,
2308 length: 4,
2309 token: Token::Word(Word::Number(Number::Float(29.3))),
2310 },
2311 PositionalToken {
2312 source: uws,
2313 offset: 94,
2314 length: 2,
2315 token: Token::Special(Special::Symbol('°')),
2316 },
2317 PositionalToken {
2318 source: uws,
2319 offset: 96,
2320 length: 1,
2321 token: Token::Word(Word::Word("F".to_string())),
2322 },
2323 PositionalToken {
2324 source: uws,
2325 offset: 97,
2326 length: 1,
2327 token: Token::Special(Special::Punctuation('!')),
2328 },
2329 PositionalToken {
2330 source: uws,
2331 offset: 98,
2332 length: 1,
2333 token: Token::Special(Special::Separator(Separator::Newline)),
2334 },
2335 PositionalToken {
2336 source: uws,
2337 offset: 99,
2338 length: 1,
2339 token: Token::Special(Special::Separator(Separator::Space)),
2340 },
2341 PositionalToken {
2342 source: uws,
2343 offset: 100,
2344 length: 14,
2345 token: Token::Word(Word::Word("Русское".to_string())),
2346 },
2347 PositionalToken {
2348 source: uws,
2349 offset: 114,
2350 length: 1,
2351 token: Token::Special(Special::Separator(Separator::Space)),
2352 },
2353 PositionalToken {
2354 source: uws,
2355 offset: 115,
2356 length: 22,
2357 token: Token::Word(Word::Word("предложение".to_string())),
2358 },
2359 PositionalToken {
2360 source: uws,
2361 offset: 137,
2362 length: 1,
2363 token: Token::Special(Special::Separator(Separator::Space)),
2364 },
2365 PositionalToken {
2366 source: uws,
2367 offset: 138,
2368 length: 1,
2369 token: Token::Special(Special::Punctuation('#')),
2370 },
2371 PositionalToken {
2372 source: uws,
2373 offset: 139,
2374 length: 4,
2375 token: Token::Word(Word::Number(Number::Float(36.6))),
2376 },
2377 PositionalToken {
2378 source: uws,
2379 offset: 143,
2380 length: 1,
2381 token: Token::Special(Special::Separator(Separator::Space)),
2382 },
2383 PositionalToken {
2384 source: uws,
2385 offset: 144,
2386 length: 6,
2387 token: Token::Word(Word::Word("для".to_string())),
2388 },
2389 PositionalToken {
2390 source: uws,
2391 offset: 150,
2392 length: 1,
2393 token: Token::Special(Special::Separator(Separator::Space)),
2394 },
2395 PositionalToken {
2396 source: uws,
2397 offset: 151,
2398 length: 24,
2399 token: Token::Word(Word::Word("тестирования".to_string())),
2400 },
2401 PositionalToken {
2402 source: uws,
2403 offset: 175,
2404 length: 1,
2405 token: Token::Special(Special::Separator(Separator::Space)),
2406 },
2407 PositionalToken {
2408 source: uws,
2409 offset: 176,
2410 length: 14,
2411 token: Token::Word(Word::Word("деления".to_string())),
2412 },
2413 PositionalToken {
2414 source: uws,
2415 offset: 190,
2416 length: 1,
2417 token: Token::Special(Special::Separator(Separator::Space)),
2418 },
2419 PositionalToken {
2420 source: uws,
2421 offset: 191,
2422 length: 4,
2423 token: Token::Word(Word::Word("по".to_string())),
2424 },
2425 PositionalToken {
2426 source: uws,
2427 offset: 195,
2428 length: 1,
2429 token: Token::Special(Special::Separator(Separator::Space)),
2430 },
2431 PositionalToken {
2432 source: uws,
2433 offset: 196,
2434 length: 12,
2435 token: Token::Word(Word::Word("юникод".to_string())),
2436 },
2437 PositionalToken {
2438 source: uws,
2439 offset: 208,
2440 length: 1,
2441 token: Token::Special(Special::Punctuation('-')),
2442 },
2443 PositionalToken {
2444 source: uws,
2445 offset: 209,
2446 length: 12,
2447 token: Token::Word(Word::Word("словам".to_string())),
2448 },
2449 PositionalToken {
2450 source: uws,
2451 offset: 221,
2452 length: 1,
2453 token: Token::Special(Special::Punctuation('.')),
2454 },
2455 PositionalToken {
2456 source: uws,
2457 offset: 222,
2458 length: 1,
2459 token: Token::Special(Special::Punctuation('.')),
2460 },
2461 PositionalToken {
2462 source: uws,
2463 offset: 223,
2464 length: 1,
2465 token: Token::Special(Special::Punctuation('.')),
2466 },
2467 PositionalToken {
2468 source: uws,
2469 offset: 224,
2470 length: 1,
2471 token: Token::Special(Special::Separator(Separator::Newline)),
2472 },
2473 ];
2474 let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
2475 check_results(&result, &lib_res, uws);
2476 }
2477
2478 #[test]
2479 fn general_complex() {
2480 let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
2481 let result = vec![
2482 PositionalToken {
2483 source: uws,
2484 offset: 0,
2485 length: 3,
2486 token: Token::Word(Word::Word("The".to_string())),
2487 },
2488 PositionalToken {
2489 source: uws,
2490 offset: 3,
2491 length: 1,
2492 token: Token::Special(Special::Separator(Separator::Space)),
2493 },
2494 PositionalToken {
2495 source: uws,
2496 offset: 4,
2497 length: 5,
2498 token: Token::Word(Word::Word("quick".to_string())),
2499 },
2500 PositionalToken {
2501 source: uws,
2502 offset: 9,
2503 length: 1,
2504 token: Token::Special(Special::Separator(Separator::Space)),
2505 },
2506 PositionalToken {
2507 source: uws,
2508 offset: 10,
2509 length: 1,
2510 token: Token::Special(Special::Punctuation('(')),
2511 },
2512 PositionalToken {
2513 source: uws,
2514 offset: 11,
2515 length: 1,
2516 token: Token::Special(Special::Punctuation('"')),
2517 },
2518 PositionalToken {
2519 source: uws,
2520 offset: 12,
2521 length: 5,
2522 token: Token::Word(Word::Word("brown".to_string())),
2523 },
2524 PositionalToken {
2525 source: uws,
2526 offset: 17,
2527 length: 1,
2528 token: Token::Special(Special::Punctuation('"')),
2529 },
2530 PositionalToken {
2531 source: uws,
2532 offset: 18,
2533 length: 1,
2534 token: Token::Special(Special::Punctuation(')')),
2535 },
2536 PositionalToken {
2537 source: uws,
2538 offset: 19,
2539 length: 1,
2540 token: Token::Special(Special::Separator(Separator::Space)),
2541 },
2542 PositionalToken {
2543 source: uws,
2544 offset: 20,
2545 length: 3,
2546 token: Token::Word(Word::Word("fox".to_string())),
2547 },
2548 PositionalToken {
2549 source: uws,
2550 offset: 23,
2551 length: 1,
2552 token: Token::Special(Special::Separator(Separator::Space)),
2553 },
2554 PositionalToken {
2555 source: uws,
2556 offset: 24,
2557 length: 5,
2558 token: Token::Word(Word::Word("can\'t".to_string())),
2559 },
2560 PositionalToken {
2561 source: uws,
2562 offset: 29,
2563 length: 1,
2564 token: Token::Special(Special::Separator(Separator::Space)),
2565 },
2566 PositionalToken {
2567 source: uws,
2568 offset: 30,
2569 length: 4,
2570 token: Token::Word(Word::Word("jump".to_string())),
2571 },
2572 PositionalToken {
2573 source: uws,
2574 offset: 34,
2575 length: 1,
2576 token: Token::Special(Special::Separator(Separator::Space)),
2577 },
2578 PositionalToken {
2579 source: uws,
2580 offset: 35,
2581 length: 4,
2582 token: Token::Word(Word::Number(Number::Float(32.3))),
2583 },
2584 PositionalToken {
2585 source: uws,
2586 offset: 39,
2587 length: 1,
2588 token: Token::Special(Special::Separator(Separator::Space)),
2589 },
2590 PositionalToken {
2591 source: uws,
2592 offset: 40,
2593 length: 4,
2594 token: Token::Word(Word::Word("feet".to_string())),
2595 },
2596 PositionalToken {
2597 source: uws,
2598 offset: 44,
2599 length: 1,
2600 token: Token::Special(Special::Punctuation(',')),
2601 },
2602 PositionalToken {
2603 source: uws,
2604 offset: 45,
2605 length: 1,
2606 token: Token::Special(Special::Separator(Separator::Space)),
2607 },
2608 PositionalToken {
2609 source: uws,
2610 offset: 46,
2611 length: 5,
2612 token: Token::Word(Word::Word("right".to_string())),
2613 },
2614 PositionalToken {
2615 source: uws,
2616 offset: 51,
2617 length: 1,
2618 token: Token::Special(Special::Punctuation('?')),
2619 },
2620 PositionalToken {
2621 source: uws,
2622 offset: 52,
2623 length: 1,
2624 token: Token::Special(Special::Separator(Separator::Space)),
2625 },
2626 PositionalToken {
2627 source: uws,
2628 offset: 53,
2629 length: 4,
2630 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
2631 }, PositionalToken {
2633 source: uws,
2634 offset: 57,
2635 length: 1,
2636 token: Token::Special(Special::Separator(Separator::Space)),
2637 },
2638 PositionalToken {
2639 source: uws,
2640 offset: 58,
2641 length: 3,
2642 token: Token::Word(Word::Word("etc".to_string())),
2643 },
2644 PositionalToken {
2645 source: uws,
2646 offset: 61,
2647 length: 1,
2648 token: Token::Special(Special::Punctuation('.')),
2649 },
2650 PositionalToken {
2651 source: uws,
2652 offset: 62,
2653 length: 1,
2654 token: Token::Special(Special::Separator(Separator::Space)),
2655 },
2656 PositionalToken {
2657 source: uws,
2658 offset: 63,
2659 length: 3,
2660 token: Token::Word(Word::Word("qeq".to_string())),
2661 },
2662 PositionalToken {
2663 source: uws,
2664 offset: 66,
2665 length: 1,
2666 token: Token::Special(Special::Separator(Separator::Space)),
2667 },
2668 PositionalToken {
2669 source: uws,
2670 offset: 67,
2671 length: 5,
2672 token: Token::Word(Word::Word("U.S.A".to_string())),
2673 },
2674 PositionalToken {
2675 source: uws,
2676 offset: 72,
2677 length: 2,
2678 token: Token::Special(Special::Separator(Separator::Space)),
2679 },
2680 PositionalToken {
2681 source: uws,
2682 offset: 74,
2683 length: 3,
2684 token: Token::Word(Word::Word("asd".to_string())),
2685 },
2686 PositionalToken {
2687 source: uws,
2688 offset: 77,
2689 length: 3,
2690 token: Token::Special(Special::Separator(Separator::Newline)),
2691 },
2692 PositionalToken {
2693 source: uws,
2694 offset: 80,
2695 length: 3,
2696 token: Token::Word(Word::Word("Brr".to_string())),
2697 },
2698 PositionalToken {
2699 source: uws,
2700 offset: 83,
2701 length: 1,
2702 token: Token::Special(Special::Punctuation(',')),
2703 },
2704 PositionalToken {
2705 source: uws,
2706 offset: 84,
2707 length: 1,
2708 token: Token::Special(Special::Separator(Separator::Space)),
2709 },
2710 PositionalToken {
2711 source: uws,
2712 offset: 85,
2713 length: 4,
2714 token: Token::Word(Word::Word("it\'s".to_string())),
2715 },
2716 PositionalToken {
2717 source: uws,
2718 offset: 89,
2719 length: 1,
2720 token: Token::Special(Special::Separator(Separator::Space)),
2721 },
2722 PositionalToken {
2723 source: uws,
2724 offset: 90,
2725 length: 4,
2726 token: Token::Word(Word::Number(Number::Float(29.3))),
2727 },
2728 PositionalToken {
2729 source: uws,
2730 offset: 94,
2731 length: 2,
2732 token: Token::Special(Special::Symbol('°')),
2733 },
2734 PositionalToken {
2735 source: uws,
2736 offset: 96,
2737 length: 1,
2738 token: Token::Word(Word::Word("F".to_string())),
2739 },
2740 PositionalToken {
2741 source: uws,
2742 offset: 97,
2743 length: 1,
2744 token: Token::Special(Special::Punctuation('!')),
2745 },
2746 PositionalToken {
2747 source: uws,
2748 offset: 98,
2749 length: 1,
2750 token: Token::Special(Special::Separator(Separator::Newline)),
2751 },
2752 PositionalToken {
2753 source: uws,
2754 offset: 99,
2755 length: 1,
2756 token: Token::Special(Special::Separator(Separator::Space)),
2757 },
2758 PositionalToken {
2759 source: uws,
2760 offset: 100,
2761 length: 14,
2762 token: Token::Word(Word::Word("Русское".to_string())),
2763 },
2764 PositionalToken {
2765 source: uws,
2766 offset: 114,
2767 length: 1,
2768 token: Token::Special(Special::Separator(Separator::Space)),
2769 },
2770 PositionalToken {
2771 source: uws,
2772 offset: 115,
2773 length: 22,
2774 token: Token::Word(Word::Word("предложение".to_string())),
2775 },
2776 PositionalToken {
2777 source: uws,
2778 offset: 137,
2779 length: 1,
2780 token: Token::Special(Special::Separator(Separator::Space)),
2781 },
2782 PositionalToken {
2783 source: uws,
2784 offset: 138,
2785 length: 5,
2786 token: Token::Struct(Struct::Hashtag("36.6".to_string())),
2787 },
2788 PositionalToken {
2789 source: uws,
2790 offset: 143,
2791 length: 1,
2792 token: Token::Special(Special::Separator(Separator::Space)),
2793 },
2794 PositionalToken {
2795 source: uws,
2796 offset: 144,
2797 length: 6,
2798 token: Token::Word(Word::Word("для".to_string())),
2799 },
2800 PositionalToken {
2801 source: uws,
2802 offset: 150,
2803 length: 1,
2804 token: Token::Special(Special::Separator(Separator::Space)),
2805 },
2806 PositionalToken {
2807 source: uws,
2808 offset: 151,
2809 length: 24,
2810 token: Token::Word(Word::Word("тестирования".to_string())),
2811 },
2812 PositionalToken {
2813 source: uws,
2814 offset: 175,
2815 length: 1,
2816 token: Token::Special(Special::Separator(Separator::Space)),
2817 },
2818 PositionalToken {
2819 source: uws,
2820 offset: 176,
2821 length: 14,
2822 token: Token::Word(Word::Word("деления".to_string())),
2823 },
2824 PositionalToken {
2825 source: uws,
2826 offset: 190,
2827 length: 1,
2828 token: Token::Special(Special::Separator(Separator::Space)),
2829 },
2830 PositionalToken {
2831 source: uws,
2832 offset: 191,
2833 length: 4,
2834 token: Token::Word(Word::Word("по".to_string())),
2835 },
2836 PositionalToken {
2837 source: uws,
2838 offset: 195,
2839 length: 1,
2840 token: Token::Special(Special::Separator(Separator::Space)),
2841 },
2842 PositionalToken {
2843 source: uws,
2844 offset: 196,
2845 length: 12,
2846 token: Token::Word(Word::Word("юникод".to_string())),
2847 },
2848 PositionalToken {
2849 source: uws,
2850 offset: 208,
2851 length: 1,
2852 token: Token::Special(Special::Punctuation('-')),
2853 },
2854 PositionalToken {
2855 source: uws,
2856 offset: 209,
2857 length: 12,
2858 token: Token::Word(Word::Word("словам".to_string())),
2859 },
2860 PositionalToken {
2861 source: uws,
2862 offset: 221,
2863 length: 3,
2864 token: Token::Special(Special::Punctuation('.')),
2865 },
2866 PositionalToken {
2867 source: uws,
2868 offset: 224,
2869 length: 1,
2870 token: Token::Special(Special::Separator(Separator::Newline)),
2871 },
2872 ];
2873 let lib_res = uws
2874 .into_tokenizer(TokenizerParams::complex())
2875 .collect::<Vec<_>>();
2876 check_results(&result, &lib_res, uws);
2877 }
2878
2879 #[test]
2880 fn plus_minus() {
2881 let uws = "+23 -4.5 -34 +25.7 - 2 + 5.6";
2882 let result = vec![
2883 PositionalToken {
2884 source: uws,
2885 offset: 0,
2886 length: 3,
2887 token: Token::Word(Word::Number(Number::Integer(23))),
2888 },
2889 PositionalToken {
2890 source: uws,
2891 offset: 3,
2892 length: 1,
2893 token: Token::Special(Special::Separator(Separator::Space)),
2894 },
2895 PositionalToken {
2896 source: uws,
2897 offset: 4,
2898 length: 4,
2899 token: Token::Word(Word::Number(Number::Float(-4.5))),
2900 },
2901 PositionalToken {
2902 source: uws,
2903 offset: 8,
2904 length: 1,
2905 token: Token::Special(Special::Separator(Separator::Space)),
2906 },
2907 PositionalToken {
2908 source: uws,
2909 offset: 9,
2910 length: 3,
2911 token: Token::Word(Word::Number(Number::Integer(-34))),
2912 },
2913 PositionalToken {
2914 source: uws,
2915 offset: 12,
2916 length: 1,
2917 token: Token::Special(Special::Separator(Separator::Space)),
2918 },
2919 PositionalToken {
2920 source: uws,
2921 offset: 13,
2922 length: 5,
2923 token: Token::Word(Word::Number(Number::Float(25.7))),
2924 },
2925 PositionalToken {
2926 source: uws,
2927 offset: 18,
2928 length: 1,
2929 token: Token::Special(Special::Separator(Separator::Space)),
2930 },
2931 PositionalToken {
2932 source: uws,
2933 offset: 19,
2934 length: 1,
2935 token: Token::Special(Special::Punctuation('-')),
2936 },
2937 PositionalToken {
2938 source: uws,
2939 offset: 20,
2940 length: 1,
2941 token: Token::Special(Special::Separator(Separator::Space)),
2942 },
2943 PositionalToken {
2944 source: uws,
2945 offset: 21,
2946 length: 1,
2947 token: Token::Word(Word::Number(Number::Integer(2))),
2948 },
2949 PositionalToken {
2950 source: uws,
2951 offset: 22,
2952 length: 1,
2953 token: Token::Special(Special::Separator(Separator::Space)),
2954 },
2955 PositionalToken {
2956 source: uws,
2957 offset: 23,
2958 length: 1,
2959 token: Token::Special(Special::Punctuation('+')),
2960 },
2961 PositionalToken {
2962 source: uws,
2963 offset: 24,
2964 length: 1,
2965 token: Token::Special(Special::Separator(Separator::Space)),
2966 },
2967 PositionalToken {
2968 source: uws,
2969 offset: 25,
2970 length: 3,
2971 token: Token::Word(Word::Number(Number::Float(5.6))),
2972 },
2973 ];
2974 let lib_res = uws
2975 .into_tokenizer(TokenizerParams::v1())
2976 .collect::<Vec<_>>();
2977 check(&result, &lib_res, uws);
2978 }
2980
2981 #[test]
2982 #[ignore]
2983 fn woman_bouncing_ball() {
2984 let uws = "\u{26f9}\u{200d}\u{2640}";
2985 let result = vec![PositionalToken {
2986 source: uws,
2987 offset: 0,
2988 length: 9,
2989 token: Token::Word(Word::Emoji("woman_bouncing_ball")),
2990 }];
2991 let lib_res = uws
2992 .into_tokenizer(TokenizerParams::v1())
2993 .collect::<Vec<_>>();
2994 check_results(&result, &lib_res, uws);
2995 }
2997
2998 #[test]
2999 fn emoji_and_rusabbr_default() {
3000 let uws = "🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n👱\nС.С.С.Р.\n👨👩👦👦\n🧠\n";
3001 let result = vec![
3002 PositionalToken {
3003 source: uws,
3004 offset: 0,
3005 length: 8,
3006 token: Token::Word(Word::Emoji("russia")),
3007 },
3008 PositionalToken {
3009 source: uws,
3010 offset: 8,
3011 length: 1,
3012 token: Token::Special(Special::Separator(Separator::Space)),
3013 },
3014 PositionalToken {
3015 source: uws,
3016 offset: 9,
3017 length: 8,
3018 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
3019 },
3020 PositionalToken {
3021 source: uws,
3022 offset: 17,
3023 length: 1,
3024 token: Token::Special(Special::Separator(Separator::Newline)),
3025 },
3026 PositionalToken {
3027 source: uws,
3028 offset: 18,
3029 length: 8,
3030 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
3031 },
3032 PositionalToken {
3033 source: uws,
3034 offset: 26,
3035 length: 8,
3036 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
3037 },
3038 PositionalToken {
3039 source: uws,
3040 offset: 34,
3041 length: 8,
3042 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
3043 },
3044 PositionalToken {
3045 source: uws,
3046 offset: 42,
3047 length: 1,
3048 token: Token::Special(Special::Separator(Separator::Newline)),
3049 },
3050 PositionalToken {
3051 source: uws,
3052 offset: 43,
3053 length: 4,
3054 token: Token::Word(Word::Emoji("blond_haired_person")),
3055 },
3056 PositionalToken {
3057 source: uws,
3058 offset: 47,
3059 length: 1,
3060 token: Token::Special(Special::Separator(Separator::Newline)),
3061 },
3062 PositionalToken {
3063 source: uws,
3064 offset: 48,
3065 length: 2,
3066 token: Token::Word(Word::Word("С".to_string())),
3067 },
3068 PositionalToken {
3069 source: uws,
3070 offset: 50,
3071 length: 1,
3072 token: Token::Special(Special::Punctuation('.')),
3073 },
3074 PositionalToken {
3075 source: uws,
3076 offset: 51,
3077 length: 2,
3078 token: Token::Word(Word::Word("С".to_string())),
3079 },
3080 PositionalToken {
3081 source: uws,
3082 offset: 53,
3083 length: 1,
3084 token: Token::Special(Special::Punctuation('.')),
3085 },
3086 PositionalToken {
3087 source: uws,
3088 offset: 54,
3089 length: 2,
3090 token: Token::Word(Word::Word("С".to_string())),
3091 },
3092 PositionalToken {
3093 source: uws,
3094 offset: 56,
3095 length: 1,
3096 token: Token::Special(Special::Punctuation('.')),
3097 },
3098 PositionalToken {
3099 source: uws,
3100 offset: 57,
3101 length: 2,
3102 token: Token::Word(Word::Word("Р".to_string())),
3103 },
3104 PositionalToken {
3105 source: uws,
3106 offset: 59,
3107 length: 1,
3108 token: Token::Special(Special::Punctuation('.')),
3109 },
3110 PositionalToken {
3111 source: uws,
3112 offset: 60,
3113 length: 1,
3114 token: Token::Special(Special::Separator(Separator::Newline)),
3115 },
3116 PositionalToken {
3117 source: uws,
3118 offset: 61,
3119 length: 25,
3120 token: Token::Word(Word::Emoji("family_man_woman_boy_boy")),
3121 },
3122 PositionalToken {
3123 source: uws,
3124 offset: 86,
3125 length: 1,
3126 token: Token::Special(Special::Separator(Separator::Newline)),
3127 },
3128 PositionalToken {
3129 source: uws,
3130 offset: 87,
3131 length: 4,
3132 token: Token::Word(Word::Emoji("brain")),
3133 },
3134 PositionalToken {
3135 source: uws,
3136 offset: 91,
3137 length: 1,
3138 token: Token::Special(Special::Separator(Separator::Newline)),
3139 },
3140 ];
3141
3142 let lib_res = uws
3143 .into_tokenizer(TokenizerParams::v1())
3144 .collect::<Vec<_>>();
3145 check_results(&result, &lib_res, uws);
3146 }
3148
3149 #[test]
3150 fn emoji_and_rusabbr_no_split() {
3151 let uws = "🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n👱\nС.С.С.Р.\n👨👩👦👦\n🧠\n";
3152 let result = vec![
3153 PositionalToken {
3154 source: uws,
3155 offset: 0,
3156 length: 8,
3157 token: Token::Word(Word::Emoji("russia")),
3158 },
3159 PositionalToken {
3160 source: uws,
3161 offset: 8,
3162 length: 1,
3163 token: Token::Special(Special::Separator(Separator::Space)),
3164 },
3165 PositionalToken {
3166 source: uws,
3167 offset: 9,
3168 length: 8,
3169 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
3170 },
3171 PositionalToken {
3172 source: uws,
3173 offset: 17,
3174 length: 1,
3175 token: Token::Special(Special::Separator(Separator::Newline)),
3176 },
3177 PositionalToken {
3178 source: uws,
3179 offset: 18,
3180 length: 8,
3181 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
3182 },
3183 PositionalToken {
3184 source: uws,
3185 offset: 26,
3186 length: 8,
3187 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
3188 },
3189 PositionalToken {
3190 source: uws,
3191 offset: 34,
3192 length: 8,
3193 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
3194 },
3195 PositionalToken {
3196 source: uws,
3197 offset: 42,
3198 length: 1,
3199 token: Token::Special(Special::Separator(Separator::Newline)),
3200 },
3201 PositionalToken {
3202 source: uws,
3203 offset: 43,
3204 length: 4,
3205 token: Token::Word(Word::Emoji("blond_haired_person")),
3206 },
3207 PositionalToken {
3208 source: uws,
3209 offset: 47,
3210 length: 1,
3211 token: Token::Special(Special::Separator(Separator::Newline)),
3212 },
3213 PositionalToken {
3214 source: uws,
3215 offset: 48,
3216 length: 11,
3217 token: Token::Word(Word::Word("С.С.С.Р".to_string())),
3218 },
3219 PositionalToken {
3220 source: uws,
3221 offset: 59,
3222 length: 1,
3223 token: Token::Special(Special::Punctuation('.')),
3224 },
3225 PositionalToken {
3226 source: uws,
3227 offset: 60,
3228 length: 1,
3229 token: Token::Special(Special::Separator(Separator::Newline)),
3230 },
3231 PositionalToken {
3232 source: uws,
3233 offset: 61,
3234 length: 25,
3235 token: Token::Word(Word::Emoji("family_man_woman_boy_boy")),
3236 },
3237 PositionalToken {
3238 source: uws,
3239 offset: 86,
3240 length: 1,
3241 token: Token::Special(Special::Separator(Separator::Newline)),
3242 },
3243 PositionalToken {
3244 source: uws,
3245 offset: 87,
3246 length: 4,
3247 token: Token::Word(Word::Emoji("brain")),
3248 },
3249 PositionalToken {
3250 source: uws,
3251 offset: 91,
3252 length: 1,
3253 token: Token::Special(Special::Separator(Separator::Newline)),
3254 },
3255 ];
3256
3257 let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
3258 check_results(&result, &lib_res, uws);
3259 }
3261
3262 #[test]
3486 fn html() {
3487 let uws = "<div class=\"article article_view \" id=\"article_view_-113039156_9551\" data-article-url=\"/@chaibuket-o-chem-ne-zabyt-25-noyabrya\" data-audio-context=\"article:-113039156_9551\"><h1 class=\"article_decoration_first article_decoration_last\" >День Мамы </h1><p class=\"article_decoration_first article_decoration_last\" >День, когда поздравляют мам, бабушек, сестер и жён — это всемирный праздник, называемый «День Мамы». В настоящее время его отмечают почти в каждой стране, просто везде разные даты и способы празднования. </p><h3 class=\"article_decoration_first article_decoration_last\" ><span class='article_anchor_title'>\n <span class='article_anchor_button' id='pochemu-my-ego-prazdnuem'></span>\n <span class='article_anchor_fsymbol'>П</span>\n</span>ПОЧЕМУ МЫ ЕГО ПРАЗДНУЕМ</h3><p class=\"article_decoration_first article_decoration_last article_decoration_before\" >В 1987 году комитет госдумы по делам женщин, семьи и молодежи выступил с предложением учредить «День мамы», а сам приказ был подписан уже 30 января 1988 года Борисом Ельциным. Было решено, что ежегодно в России празднество дня мамы будет выпадать на последнее воскресенье ноября. </p><figure data-type=\"101\" data-mode=\"\" class=\"article_decoration_first article_decoration_last\" >\n <div class=\"article_figure_content\" style=\"width: 1125px\">\n <div class=\"article_figure_sizer_content\"><div class=\"article_object_sizer_wrap\" data-sizes=\"[{"s":["https://pp.userapi.com/c849128/v849128704/c0ffd/pcNJaBH3NDo.jpg",75,50],"m":["https://pp.userapi.com/c849128/v849128704/c0ffe/ozCLs2kHtRY.jpg",130,87],"x":["https://pp.userapi.com/c849128/v849128704/c0fff/E4KtTNDydzE.jpg",604,403],"y":["https://pp.userapi.com/c849128/v849128704/c1000/1nLxpYKavzU.jpg",807,538],"z":["https://pp.userapi.com/c849128/v849128704/c1001/IgEODe90yEk.jpg",1125,750],"o":["https://pp.userapi.com/c849128/v849128704/c1002/01faNwVZ2_E.jpg",130,87],"p":["https://pp.userapi.com/c849128/v849128704/c1003/baDFzbdRP2s.jpg",200,133],"q":["https://pp.userapi.com/c849128/v849128704/c1004/CY4khI6KJKA.jpg",320,213],"r":["https://pp.userapi.com/c849128/v849128704/c1005/NOvAJ6-VltY.jpg",510,340]}]\">\n <img class=\"article_object_sizer_inner article_object_photo__image_blur\" src=\"https://pp.userapi.com/c849128/v849128704/c0ffd/pcNJaBH3NDo.jpg\" data-baseurl=\"\"/>\n \n</div></div>\n <div class=\"article_figure_sizer\" style=\"padding-bottom: 66.666666666667%\"></div>";
3488 let result = vec![
3489 PositionalToken {
3490 source: uws,
3491 offset: 236,
3492 length: 8,
3493 token: Token::Word(Word::Word("День".to_string())),
3494 },
3495 PositionalToken {
3496 source: uws,
3497 offset: 244,
3498 length: 1,
3499 token: Token::Special(Special::Separator(Separator::Space)),
3500 },
3501 PositionalToken {
3502 source: uws,
3503 offset: 245,
3504 length: 8,
3505 token: Token::Word(Word::Word("Мамы".to_string())),
3506 },
3507 PositionalToken {
3508 source: uws,
3509 offset: 253,
3510 length: 1,
3511 token: Token::Special(Special::Separator(Separator::Space)),
3512 },
3513 PositionalToken {
3514 source: uws,
3515 offset: 321,
3516 length: 8,
3517 token: Token::Word(Word::Word("День".to_string())),
3518 },
3519 PositionalToken {
3520 source: uws,
3521 offset: 329,
3522 length: 1,
3523 token: Token::Special(Special::Punctuation(',')),
3524 },
3525 PositionalToken {
3526 source: uws,
3527 offset: 330,
3528 length: 1,
3529 token: Token::Special(Special::Separator(Separator::Space)),
3530 },
3531 PositionalToken {
3532 source: uws,
3533 offset: 331,
3534 length: 10,
3535 token: Token::Word(Word::Word("когда".to_string())),
3536 },
3537 PositionalToken {
3538 source: uws,
3539 offset: 341,
3540 length: 1,
3541 token: Token::Special(Special::Separator(Separator::Space)),
3542 },
3543 PositionalToken {
3544 source: uws,
3545 offset: 342,
3546 length: 22,
3547 token: Token::Word(Word::Word("поздравляют".to_string())),
3548 },
3549 PositionalToken {
3550 source: uws,
3551 offset: 364,
3552 length: 1,
3553 token: Token::Special(Special::Separator(Separator::Space)),
3554 },
3555 PositionalToken {
3556 source: uws,
3557 offset: 365,
3558 length: 6,
3559 token: Token::Word(Word::Word("мам".to_string())),
3560 },
3561 PositionalToken {
3562 source: uws,
3563 offset: 371,
3564 length: 1,
3565 token: Token::Special(Special::Punctuation(',')),
3566 },
3567 PositionalToken {
3568 source: uws,
3569 offset: 372,
3570 length: 1,
3571 token: Token::Special(Special::Separator(Separator::Space)),
3572 },
3573 PositionalToken {
3574 source: uws,
3575 offset: 373,
3576 length: 14,
3577 token: Token::Word(Word::Word("бабушек".to_string())),
3578 },
3579 PositionalToken {
3580 source: uws,
3581 offset: 387,
3582 length: 1,
3583 token: Token::Special(Special::Punctuation(',')),
3584 },
3585 PositionalToken {
3586 source: uws,
3587 offset: 388,
3588 length: 1,
3589 token: Token::Special(Special::Separator(Separator::Space)),
3590 },
3591 PositionalToken {
3592 source: uws,
3593 offset: 389,
3594 length: 12,
3595 token: Token::Word(Word::Word("сестер".to_string())),
3596 },
3597 PositionalToken {
3598 source: uws,
3599 offset: 401,
3600 length: 1,
3601 token: Token::Special(Special::Separator(Separator::Space)),
3602 },
3603 PositionalToken {
3604 source: uws,
3605 offset: 402,
3606 length: 2,
3607 token: Token::Word(Word::Word("и".to_string())),
3608 },
3609 PositionalToken {
3610 source: uws,
3611 offset: 404,
3612 length: 1,
3613 token: Token::Special(Special::Separator(Separator::Space)),
3614 },
3615 PositionalToken {
3616 source: uws,
3617 offset: 405,
3618 length: 6,
3619 token: Token::Word(Word::Word("жён".to_string())),
3620 },
3621 PositionalToken {
3622 source: uws,
3623 offset: 411,
3624 length: 1,
3625 token: Token::Special(Special::Separator(Separator::Space)),
3626 },
3627 PositionalToken {
3628 source: uws,
3629 offset: 412,
3630 length: 3,
3631 token: Token::Special(Special::Punctuation('—')),
3632 },
3633 PositionalToken {
3634 source: uws,
3635 offset: 415,
3636 length: 1,
3637 token: Token::Special(Special::Separator(Separator::Space)),
3638 },
3639 PositionalToken {
3640 source: uws,
3641 offset: 416,
3642 length: 6,
3643 token: Token::Word(Word::Word("это".to_string())),
3644 },
3645 PositionalToken {
3646 source: uws,
3647 offset: 422,
3648 length: 1,
3649 token: Token::Special(Special::Separator(Separator::Space)),
3650 },
3651 PositionalToken {
3652 source: uws,
3653 offset: 423,
3654 length: 18,
3655 token: Token::Word(Word::Word("всемирный".to_string())),
3656 },
3657 PositionalToken {
3658 source: uws,
3659 offset: 441,
3660 length: 1,
3661 token: Token::Special(Special::Separator(Separator::Space)),
3662 },
3663 PositionalToken {
3664 source: uws,
3665 offset: 442,
3666 length: 16,
3667 token: Token::Word(Word::Word("праздник".to_string())),
3668 },
3669 PositionalToken {
3670 source: uws,
3671 offset: 458,
3672 length: 1,
3673 token: Token::Special(Special::Punctuation(',')),
3674 },
3675 PositionalToken {
3676 source: uws,
3677 offset: 459,
3678 length: 1,
3679 token: Token::Special(Special::Separator(Separator::Space)),
3680 },
3681 PositionalToken {
3682 source: uws,
3683 offset: 460,
3684 length: 20,
3685 token: Token::Word(Word::Word("называемый".to_string())),
3686 },
3687 PositionalToken {
3688 source: uws,
3689 offset: 480,
3690 length: 1,
3691 token: Token::Special(Special::Separator(Separator::Space)),
3692 },
3693 PositionalToken {
3694 source: uws,
3695 offset: 481,
3696 length: 2,
3697 token: Token::Special(Special::Punctuation('«')),
3698 },
3699 PositionalToken {
3700 source: uws,
3701 offset: 483,
3702 length: 8,
3703 token: Token::Word(Word::Word("День".to_string())),
3704 },
3705 PositionalToken {
3706 source: uws,
3707 offset: 491,
3708 length: 1,
3709 token: Token::Special(Special::Separator(Separator::Space)),
3710 },
3711 PositionalToken {
3712 source: uws,
3713 offset: 492,
3714 length: 8,
3715 token: Token::Word(Word::Word("Мамы".to_string())),
3716 },
3717 PositionalToken {
3718 source: uws,
3719 offset: 500,
3720 length: 2,
3721 token: Token::Special(Special::Punctuation('»')),
3722 },
3723 PositionalToken {
3724 source: uws,
3725 offset: 502,
3726 length: 1,
3727 token: Token::Special(Special::Punctuation('.')),
3728 },
3729 PositionalToken {
3730 source: uws,
3731 offset: 503,
3732 length: 1,
3733 token: Token::Special(Special::Separator(Separator::Space)),
3734 },
3735 PositionalToken {
3736 source: uws,
3737 offset: 504,
3738 length: 2,
3739 token: Token::Word(Word::Word("В".to_string())),
3740 },
3741 PositionalToken {
3742 source: uws,
3743 offset: 506,
3744 length: 1,
3745 token: Token::Special(Special::Separator(Separator::Space)),
3746 },
3747 PositionalToken {
3748 source: uws,
3749 offset: 507,
3750 length: 18,
3751 token: Token::Word(Word::Word("настоящее".to_string())),
3752 },
3753 PositionalToken {
3754 source: uws,
3755 offset: 525,
3756 length: 1,
3757 token: Token::Special(Special::Separator(Separator::Space)),
3758 },
3759 PositionalToken {
3760 source: uws,
3761 offset: 526,
3762 length: 10,
3763 token: Token::Word(Word::Word("время".to_string())),
3764 },
3765 PositionalToken {
3766 source: uws,
3767 offset: 536,
3768 length: 1,
3769 token: Token::Special(Special::Separator(Separator::Space)),
3770 },
3771 PositionalToken {
3772 source: uws,
3773 offset: 537,
3774 length: 6,
3775 token: Token::Word(Word::Word("его".to_string())),
3776 },
3777 PositionalToken {
3778 source: uws,
3779 offset: 543,
3780 length: 1,
3781 token: Token::Special(Special::Separator(Separator::Space)),
3782 },
3783 PositionalToken {
3784 source: uws,
3785 offset: 544,
3786 length: 16,
3787 token: Token::Word(Word::Word("отмечают".to_string())),
3788 },
3789 PositionalToken {
3790 source: uws,
3791 offset: 560,
3792 length: 1,
3793 token: Token::Special(Special::Separator(Separator::Space)),
3794 },
3795 PositionalToken {
3796 source: uws,
3797 offset: 561,
3798 length: 10,
3799 token: Token::Word(Word::Word("почти".to_string())),
3800 },
3801 PositionalToken {
3802 source: uws,
3803 offset: 571,
3804 length: 1,
3805 token: Token::Special(Special::Separator(Separator::Space)),
3806 },
3807 PositionalToken {
3808 source: uws,
3809 offset: 572,
3810 length: 2,
3811 token: Token::Word(Word::Word("в".to_string())),
3812 },
3813 PositionalToken {
3814 source: uws,
3815 offset: 574,
3816 length: 1,
3817 token: Token::Special(Special::Separator(Separator::Space)),
3818 },
3819 PositionalToken {
3820 source: uws,
3821 offset: 575,
3822 length: 12,
3823 token: Token::Word(Word::Word("каждой".to_string())),
3824 },
3825 PositionalToken {
3826 source: uws,
3827 offset: 587,
3828 length: 1,
3829 token: Token::Special(Special::Separator(Separator::Space)),
3830 },
3831 PositionalToken {
3832 source: uws,
3833 offset: 588,
3834 length: 12,
3835 token: Token::Word(Word::Word("стране".to_string())),
3836 },
3837 PositionalToken {
3838 source: uws,
3839 offset: 600,
3840 length: 1,
3841 token: Token::Special(Special::Punctuation(',')),
3842 },
3843 PositionalToken {
3844 source: uws,
3845 offset: 601,
3846 length: 1,
3847 token: Token::Special(Special::Separator(Separator::Space)),
3848 },
3849 PositionalToken {
3850 source: uws,
3851 offset: 602,
3852 length: 12,
3853 token: Token::Word(Word::Word("просто".to_string())),
3854 },
3855 PositionalToken {
3856 source: uws,
3857 offset: 614,
3858 length: 1,
3859 token: Token::Special(Special::Separator(Separator::Space)),
3860 },
3861 PositionalToken {
3862 source: uws,
3863 offset: 615,
3864 length: 10,
3865 token: Token::Word(Word::Word("везде".to_string())),
3866 },
3867 PositionalToken {
3868 source: uws,
3869 offset: 625,
3870 length: 1,
3871 token: Token::Special(Special::Separator(Separator::Space)),
3872 },
3873 PositionalToken {
3874 source: uws,
3875 offset: 626,
3876 length: 12,
3877 token: Token::Word(Word::Word("разные".to_string())),
3878 },
3879 PositionalToken {
3880 source: uws,
3881 offset: 638,
3882 length: 1,
3883 token: Token::Special(Special::Separator(Separator::Space)),
3884 },
3885 PositionalToken {
3886 source: uws,
3887 offset: 639,
3888 length: 8,
3889 token: Token::Word(Word::Word("даты".to_string())),
3890 },
3891 PositionalToken {
3892 source: uws,
3893 offset: 647,
3894 length: 1,
3895 token: Token::Special(Special::Separator(Separator::Space)),
3896 },
3897 PositionalToken {
3898 source: uws,
3899 offset: 648,
3900 length: 2,
3901 token: Token::Word(Word::Word("и".to_string())),
3902 },
3903 PositionalToken {
3904 source: uws,
3905 offset: 650,
3906 length: 1,
3907 token: Token::Special(Special::Separator(Separator::Space)),
3908 },
3909 PositionalToken {
3910 source: uws,
3911 offset: 651,
3912 length: 14,
3913 token: Token::Word(Word::Word("способы".to_string())),
3914 },
3915 PositionalToken {
3916 source: uws,
3917 offset: 665,
3918 length: 1,
3919 token: Token::Special(Special::Separator(Separator::Space)),
3920 },
3921 PositionalToken {
3922 source: uws,
3923 offset: 666,
3924 length: 24,
3925 token: Token::Word(Word::Word("празднования".to_string())),
3926 },
3927 PositionalToken {
3928 source: uws,
3929 offset: 690,
3930 length: 1,
3931 token: Token::Special(Special::Punctuation('.')),
3932 },
3933 PositionalToken {
3934 source: uws,
3935 offset: 691,
3936 length: 1,
3937 token: Token::Special(Special::Separator(Separator::Space)),
3938 },
3939 PositionalToken {
3940 source: uws,
3941 offset: 794,
3942 length: 1,
3943 token: Token::Special(Special::Separator(Separator::Newline)),
3944 },
3945 PositionalToken {
3946 source: uws,
3947 offset: 795,
3948 length: 2,
3949 token: Token::Special(Special::Separator(Separator::Space)),
3950 },
3951 PositionalToken {
3952 source: uws,
3953 offset: 870,
3954 length: 1,
3955 token: Token::Special(Special::Separator(Separator::Newline)),
3956 },
3957 PositionalToken {
3958 source: uws,
3959 offset: 871,
3960 length: 2,
3961 token: Token::Special(Special::Separator(Separator::Space)),
3962 },
3963 PositionalToken {
3964 source: uws,
3965 offset: 910,
3966 length: 2,
3967 token: Token::Word(Word::Word("П".to_string())),
3968 },
3969 PositionalToken {
3970 source: uws,
3971 offset: 919,
3972 length: 1,
3973 token: Token::Special(Special::Separator(Separator::Newline)),
3974 },
3975 PositionalToken {
3976 source: uws,
3977 offset: 927,
3978 length: 12,
3979 token: Token::Word(Word::Word("ПОЧЕМУ".to_string())),
3980 },
3981 PositionalToken {
3982 source: uws,
3983 offset: 939,
3984 length: 1,
3985 token: Token::Special(Special::Separator(Separator::Space)),
3986 },
3987 PositionalToken {
3988 source: uws,
3989 offset: 940,
3990 length: 4,
3991 token: Token::Word(Word::Word("МЫ".to_string())),
3992 },
3993 PositionalToken {
3994 source: uws,
3995 offset: 944,
3996 length: 1,
3997 token: Token::Special(Special::Separator(Separator::Space)),
3998 },
3999 PositionalToken {
4000 source: uws,
4001 offset: 945,
4002 length: 6,
4003 token: Token::Word(Word::Word("ЕГО".to_string())),
4004 },
4005 PositionalToken {
4006 source: uws,
4007 offset: 951,
4008 length: 1,
4009 token: Token::Special(Special::Separator(Separator::Space)),
4010 },
4011 PositionalToken {
4012 source: uws,
4013 offset: 952,
4014 length: 18,
4015 token: Token::Word(Word::Word("ПРАЗДНУЕМ".to_string())),
4016 },
4017 PositionalToken {
4018 source: uws,
4019 offset: 1063,
4020 length: 2,
4021 token: Token::Word(Word::Word("В".to_string())),
4022 },
4023 PositionalToken {
4024 source: uws,
4025 offset: 1065,
4026 length: 1,
4027 token: Token::Special(Special::Separator(Separator::Space)),
4028 },
4029 PositionalToken {
4030 source: uws,
4031 offset: 1066,
4032 length: 4,
4033 token: Token::Word(Word::Number(Number::Integer(1987))),
4034 },
4035 PositionalToken {
4036 source: uws,
4037 offset: 1070,
4038 length: 1,
4039 token: Token::Special(Special::Separator(Separator::Space)),
4040 },
4041 PositionalToken {
4042 source: uws,
4043 offset: 1071,
4044 length: 8,
4045 token: Token::Word(Word::Word("году".to_string())),
4046 },
4047 PositionalToken {
4048 source: uws,
4049 offset: 1079,
4050 length: 1,
4051 token: Token::Special(Special::Separator(Separator::Space)),
4052 },
4053 PositionalToken {
4054 source: uws,
4055 offset: 1080,
4056 length: 14,
4057 token: Token::Word(Word::Word("комитет".to_string())),
4058 },
4059 PositionalToken {
4060 source: uws,
4061 offset: 1094,
4062 length: 1,
4063 token: Token::Special(Special::Separator(Separator::Space)),
4064 },
4065 PositionalToken {
4066 source: uws,
4067 offset: 1095,
4068 length: 14,
4069 token: Token::Word(Word::Word("госдумы".to_string())),
4070 },
4071 PositionalToken {
4072 source: uws,
4073 offset: 1109,
4074 length: 1,
4075 token: Token::Special(Special::Separator(Separator::Space)),
4076 },
4077 PositionalToken {
4078 source: uws,
4079 offset: 1110,
4080 length: 4,
4081 token: Token::Word(Word::Word("по".to_string())),
4082 },
4083 PositionalToken {
4084 source: uws,
4085 offset: 1114,
4086 length: 1,
4087 token: Token::Special(Special::Separator(Separator::Space)),
4088 },
4089 PositionalToken {
4090 source: uws,
4091 offset: 1115,
4092 length: 10,
4093 token: Token::Word(Word::Word("делам".to_string())),
4094 },
4095 PositionalToken {
4096 source: uws,
4097 offset: 1125,
4098 length: 1,
4099 token: Token::Special(Special::Separator(Separator::Space)),
4100 },
4101 PositionalToken {
4102 source: uws,
4103 offset: 1126,
4104 length: 12,
4105 token: Token::Word(Word::Word("женщин".to_string())),
4106 },
4107 PositionalToken {
4108 source: uws,
4109 offset: 1138,
4110 length: 1,
4111 token: Token::Special(Special::Punctuation(',')),
4112 },
4113 PositionalToken {
4114 source: uws,
4115 offset: 1139,
4116 length: 1,
4117 token: Token::Special(Special::Separator(Separator::Space)),
4118 },
4119 PositionalToken {
4120 source: uws,
4121 offset: 1140,
4122 length: 10,
4123 token: Token::Word(Word::Word("семьи".to_string())),
4124 },
4125 PositionalToken {
4126 source: uws,
4127 offset: 1150,
4128 length: 1,
4129 token: Token::Special(Special::Separator(Separator::Space)),
4130 },
4131 PositionalToken {
4132 source: uws,
4133 offset: 1151,
4134 length: 2,
4135 token: Token::Word(Word::Word("и".to_string())),
4136 },
4137 PositionalToken {
4138 source: uws,
4139 offset: 1153,
4140 length: 1,
4141 token: Token::Special(Special::Separator(Separator::Space)),
4142 },
4143 PositionalToken {
4144 source: uws,
4145 offset: 1154,
4146 length: 16,
4147 token: Token::Word(Word::Word("молодежи".to_string())),
4148 },
4149 PositionalToken {
4150 source: uws,
4151 offset: 1170,
4152 length: 1,
4153 token: Token::Special(Special::Separator(Separator::Space)),
4154 },
4155 PositionalToken {
4156 source: uws,
4157 offset: 1171,
4158 length: 16,
4159 token: Token::Word(Word::Word("выступил".to_string())),
4160 },
4161 PositionalToken {
4162 source: uws,
4163 offset: 1187,
4164 length: 1,
4165 token: Token::Special(Special::Separator(Separator::Space)),
4166 },
4167 PositionalToken {
4168 source: uws,
4169 offset: 1188,
4170 length: 2,
4171 token: Token::Word(Word::Word("с".to_string())),
4172 },
4173 PositionalToken {
4174 source: uws,
4175 offset: 1190,
4176 length: 1,
4177 token: Token::Special(Special::Separator(Separator::Space)),
4178 },
4179 PositionalToken {
4180 source: uws,
4181 offset: 1191,
4182 length: 24,
4183 token: Token::Word(Word::Word("предложением".to_string())),
4184 },
4185 PositionalToken {
4186 source: uws,
4187 offset: 1215,
4188 length: 1,
4189 token: Token::Special(Special::Separator(Separator::Space)),
4190 },
4191 PositionalToken {
4192 source: uws,
4193 offset: 1216,
4194 length: 16,
4195 token: Token::Word(Word::Word("учредить".to_string())),
4196 },
4197 PositionalToken {
4198 source: uws,
4199 offset: 1232,
4200 length: 1,
4201 token: Token::Special(Special::Separator(Separator::Space)),
4202 },
4203 PositionalToken {
4204 source: uws,
4205 offset: 1233,
4206 length: 2,
4207 token: Token::Special(Special::Punctuation('«')),
4208 },
4209 PositionalToken {
4210 source: uws,
4211 offset: 1235,
4212 length: 8,
4213 token: Token::Word(Word::Word("День".to_string())),
4214 },
4215 PositionalToken {
4216 source: uws,
4217 offset: 1243,
4218 length: 1,
4219 token: Token::Special(Special::Separator(Separator::Space)),
4220 },
4221 PositionalToken {
4222 source: uws,
4223 offset: 1244,
4224 length: 8,
4225 token: Token::Word(Word::Word("мамы".to_string())),
4226 },
4227 PositionalToken {
4228 source: uws,
4229 offset: 1252,
4230 length: 2,
4231 token: Token::Special(Special::Punctuation('»')),
4232 },
4233 PositionalToken {
4234 source: uws,
4235 offset: 1254,
4236 length: 1,
4237 token: Token::Special(Special::Punctuation(',')),
4238 },
4239 PositionalToken {
4240 source: uws,
4241 offset: 1255,
4242 length: 1,
4243 token: Token::Special(Special::Separator(Separator::Space)),
4244 },
4245 PositionalToken {
4246 source: uws,
4247 offset: 1256,
4248 length: 2,
4249 token: Token::Word(Word::Word("а".to_string())),
4250 },
4251 PositionalToken {
4252 source: uws,
4253 offset: 1258,
4254 length: 1,
4255 token: Token::Special(Special::Separator(Separator::Space)),
4256 },
4257 PositionalToken {
4258 source: uws,
4259 offset: 1259,
4260 length: 6,
4261 token: Token::Word(Word::Word("сам".to_string())),
4262 },
4263 PositionalToken {
4264 source: uws,
4265 offset: 1265,
4266 length: 1,
4267 token: Token::Special(Special::Separator(Separator::Space)),
4268 },
4269 PositionalToken {
4270 source: uws,
4271 offset: 1266,
4272 length: 12,
4273 token: Token::Word(Word::Word("приказ".to_string())),
4274 },
4275 PositionalToken {
4276 source: uws,
4277 offset: 1278,
4278 length: 1,
4279 token: Token::Special(Special::Separator(Separator::Space)),
4280 },
4281 PositionalToken {
4282 source: uws,
4283 offset: 1279,
4284 length: 6,
4285 token: Token::Word(Word::Word("был".to_string())),
4286 },
4287 PositionalToken {
4288 source: uws,
4289 offset: 1285,
4290 length: 1,
4291 token: Token::Special(Special::Separator(Separator::Space)),
4292 },
4293 PositionalToken {
4294 source: uws,
4295 offset: 1286,
4296 length: 16,
4297 token: Token::Word(Word::Word("подписан".to_string())),
4298 },
4299 PositionalToken {
4300 source: uws,
4301 offset: 1302,
4302 length: 1,
4303 token: Token::Special(Special::Separator(Separator::Space)),
4304 },
4305 PositionalToken {
4306 source: uws,
4307 offset: 1303,
4308 length: 6,
4309 token: Token::Word(Word::Word("уже".to_string())),
4310 },
4311 PositionalToken {
4312 source: uws,
4313 offset: 1309,
4314 length: 1,
4315 token: Token::Special(Special::Separator(Separator::Space)),
4316 },
4317 PositionalToken {
4318 source: uws,
4319 offset: 1310,
4320 length: 2,
4321 token: Token::Word(Word::Number(Number::Integer(30))),
4322 },
4323 PositionalToken {
4324 source: uws,
4325 offset: 1312,
4326 length: 1,
4327 token: Token::Special(Special::Separator(Separator::Space)),
4328 },
4329 PositionalToken {
4330 source: uws,
4331 offset: 1313,
4332 length: 12,
4333 token: Token::Word(Word::Word("января".to_string())),
4334 },
4335 PositionalToken {
4336 source: uws,
4337 offset: 1325,
4338 length: 1,
4339 token: Token::Special(Special::Separator(Separator::Space)),
4340 },
4341 PositionalToken {
4342 source: uws,
4343 offset: 1326,
4344 length: 4,
4345 token: Token::Word(Word::Number(Number::Integer(1988))),
4346 },
4347 PositionalToken {
4348 source: uws,
4349 offset: 1330,
4350 length: 1,
4351 token: Token::Special(Special::Separator(Separator::Space)),
4352 },
4353 PositionalToken {
4354 source: uws,
4355 offset: 1331,
4356 length: 8,
4357 token: Token::Word(Word::Word("года".to_string())),
4358 },
4359 PositionalToken {
4360 source: uws,
4361 offset: 1339,
4362 length: 1,
4363 token: Token::Special(Special::Separator(Separator::Space)),
4364 },
4365 PositionalToken {
4366 source: uws,
4367 offset: 1340,
4368 length: 14,
4369 token: Token::Word(Word::Word("Борисом".to_string())),
4370 },
4371 PositionalToken {
4372 source: uws,
4373 offset: 1354,
4374 length: 1,
4375 token: Token::Special(Special::Separator(Separator::Space)),
4376 },
4377 PositionalToken {
4378 source: uws,
4379 offset: 1355,
4380 length: 16,
4381 token: Token::Word(Word::Word("Ельциным".to_string())),
4382 },
4383 PositionalToken {
4384 source: uws,
4385 offset: 1371,
4386 length: 1,
4387 token: Token::Special(Special::Punctuation('.')),
4388 },
4389 PositionalToken {
4390 source: uws,
4391 offset: 1372,
4392 length: 1,
4393 token: Token::Special(Special::Separator(Separator::Space)),
4394 },
4395 PositionalToken {
4396 source: uws,
4397 offset: 1373,
4398 length: 8,
4399 token: Token::Word(Word::Word("Было".to_string())),
4400 },
4401 PositionalToken {
4402 source: uws,
4403 offset: 1381,
4404 length: 1,
4405 token: Token::Special(Special::Separator(Separator::Space)),
4406 },
4407 PositionalToken {
4408 source: uws,
4409 offset: 1382,
4410 length: 12,
4411 token: Token::Word(Word::Word("решено".to_string())),
4412 },
4413 PositionalToken {
4414 source: uws,
4415 offset: 1394,
4416 length: 1,
4417 token: Token::Special(Special::Punctuation(',')),
4418 },
4419 PositionalToken {
4420 source: uws,
4421 offset: 1395,
4422 length: 1,
4423 token: Token::Special(Special::Separator(Separator::Space)),
4424 },
4425 PositionalToken {
4426 source: uws,
4427 offset: 1396,
4428 length: 6,
4429 token: Token::Word(Word::Word("что".to_string())),
4430 },
4431 PositionalToken {
4432 source: uws,
4433 offset: 1402,
4434 length: 1,
4435 token: Token::Special(Special::Separator(Separator::Space)),
4436 },
4437 PositionalToken {
4438 source: uws,
4439 offset: 1403,
4440 length: 16,
4441 token: Token::Word(Word::Word("ежегодно".to_string())),
4442 },
4443 PositionalToken {
4444 source: uws,
4445 offset: 1419,
4446 length: 1,
4447 token: Token::Special(Special::Separator(Separator::Space)),
4448 },
4449 PositionalToken {
4450 source: uws,
4451 offset: 1420,
4452 length: 2,
4453 token: Token::Word(Word::Word("в".to_string())),
4454 },
4455 PositionalToken {
4456 source: uws,
4457 offset: 1422,
4458 length: 1,
4459 token: Token::Special(Special::Separator(Separator::Space)),
4460 },
4461 PositionalToken {
4462 source: uws,
4463 offset: 1423,
4464 length: 12,
4465 token: Token::Word(Word::Word("России".to_string())),
4466 },
4467 PositionalToken {
4468 source: uws,
4469 offset: 1435,
4470 length: 1,
4471 token: Token::Special(Special::Separator(Separator::Space)),
4472 },
4473 PositionalToken {
4474 source: uws,
4475 offset: 1436,
4476 length: 22,
4477 token: Token::Word(Word::Word("празднество".to_string())),
4478 },
4479 PositionalToken {
4480 source: uws,
4481 offset: 1458,
4482 length: 1,
4483 token: Token::Special(Special::Separator(Separator::Space)),
4484 },
4485 PositionalToken {
4486 source: uws,
4487 offset: 1459,
4488 length: 6,
4489 token: Token::Word(Word::Word("дня".to_string())),
4490 },
4491 PositionalToken {
4492 source: uws,
4493 offset: 1465,
4494 length: 1,
4495 token: Token::Special(Special::Separator(Separator::Space)),
4496 },
4497 PositionalToken {
4498 source: uws,
4499 offset: 1466,
4500 length: 8,
4501 token: Token::Word(Word::Word("мамы".to_string())),
4502 },
4503 PositionalToken {
4504 source: uws,
4505 offset: 1474,
4506 length: 1,
4507 token: Token::Special(Special::Separator(Separator::Space)),
4508 },
4509 PositionalToken {
4510 source: uws,
4511 offset: 1475,
4512 length: 10,
4513 token: Token::Word(Word::Word("будет".to_string())),
4514 },
4515 PositionalToken {
4516 source: uws,
4517 offset: 1485,
4518 length: 1,
4519 token: Token::Special(Special::Separator(Separator::Space)),
4520 },
4521 PositionalToken {
4522 source: uws,
4523 offset: 1486,
4524 length: 16,
4525 token: Token::Word(Word::Word("выпадать".to_string())),
4526 },
4527 PositionalToken {
4528 source: uws,
4529 offset: 1502,
4530 length: 1,
4531 token: Token::Special(Special::Separator(Separator::Space)),
4532 },
4533 PositionalToken {
4534 source: uws,
4535 offset: 1503,
4536 length: 4,
4537 token: Token::Word(Word::Word("на".to_string())),
4538 },
4539 PositionalToken {
4540 source: uws,
4541 offset: 1507,
4542 length: 1,
4543 token: Token::Special(Special::Separator(Separator::Space)),
4544 },
4545 PositionalToken {
4546 source: uws,
4547 offset: 1508,
4548 length: 18,
4549 token: Token::Word(Word::Word("последнее".to_string())),
4550 },
4551 PositionalToken {
4552 source: uws,
4553 offset: 1526,
4554 length: 1,
4555 token: Token::Special(Special::Separator(Separator::Space)),
4556 },
4557 PositionalToken {
4558 source: uws,
4559 offset: 1527,
4560 length: 22,
4561 token: Token::Word(Word::Word("воскресенье".to_string())),
4562 },
4563 PositionalToken {
4564 source: uws,
4565 offset: 1549,
4566 length: 1,
4567 token: Token::Special(Special::Separator(Separator::Space)),
4568 },
4569 PositionalToken {
4570 source: uws,
4571 offset: 1550,
4572 length: 12,
4573 token: Token::Word(Word::Word("ноября".to_string())),
4574 },
4575 PositionalToken {
4576 source: uws,
4577 offset: 1562,
4578 length: 1,
4579 token: Token::Special(Special::Punctuation('.')),
4580 },
4581 PositionalToken {
4582 source: uws,
4583 offset: 1563,
4584 length: 1,
4585 token: Token::Special(Special::Separator(Separator::Space)),
4586 },
4587 PositionalToken {
4588 source: uws,
4589 offset: 1664,
4590 length: 1,
4591 token: Token::Special(Special::Separator(Separator::Newline)),
4592 },
4593 PositionalToken {
4594 source: uws,
4595 offset: 1665,
4596 length: 2,
4597 token: Token::Special(Special::Separator(Separator::Space)),
4598 },
4599 PositionalToken {
4600 source: uws,
4601 offset: 1725,
4602 length: 1,
4603 token: Token::Special(Special::Separator(Separator::Newline)),
4604 },
4605 PositionalToken {
4606 source: uws,
4607 offset: 1726,
4608 length: 4,
4609 token: Token::Special(Special::Separator(Separator::Space)),
4610 },
4611 PositionalToken {
4612 source: uws,
4613 offset: 2725,
4614 length: 1,
4615 token: Token::Special(Special::Separator(Separator::Newline)),
4616 },
4617 PositionalToken {
4618 source: uws,
4619 offset: 2726,
4620 length: 2,
4621 token: Token::Special(Special::Separator(Separator::Space)),
4622 },
4623 PositionalToken {
4624 source: uws,
4625 offset: 2888,
4626 length: 1,
4627 token: Token::Special(Special::Separator(Separator::Newline)),
4628 },
4629 PositionalToken {
4630 source: uws,
4631 offset: 2889,
4632 length: 2,
4633 token: Token::Special(Special::Separator(Separator::Space)),
4634 },
4635 PositionalToken {
4636 source: uws,
4637 offset: 2891,
4638 length: 1,
4639 token: Token::Special(Special::Separator(Separator::Newline)),
4640 },
4641 PositionalToken {
4642 source: uws,
4643 offset: 2904,
4644 length: 1,
4645 token: Token::Special(Special::Separator(Separator::Newline)),
4646 },
4647 PositionalToken {
4648 source: uws,
4649 offset: 2905,
4650 length: 4,
4651 token: Token::Special(Special::Separator(Separator::Space)),
4652 },
4653 ];
4654
4655 let text = Text::new({
4656 uws.into_source()
4657 .pipe(tagger::Builder::new().create().into_breaker())
4658 .pipe(entities::Builder::new().create().into_piped())
4659 .into_separator()
4660 })
4661 .unwrap();
4662
4663 let lib_res = text
4664 .into_tokenizer(TokenizerParams::v1())
4665 .filter_map(|tt| tt.into_original_token_1())
4666 .collect::<Vec<_>>();
4667
4668 check_results(&result, &lib_res, uws);
4669 }
4670
4671 #[test]
4722 fn numerical_no_split() {
4723 let uws = "12.02.18 31.28.34 23.11.2018 123.568.365.234.578 127.0.0.1 1st 1кг 123123афываыв 12321фвафыов234выалфо 12_123_343.4234_4234";
4724 let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
4725 let result = vec![
4727 PositionalToken {
4728 source: uws,
4729 offset: 0,
4730 length: 8,
4731 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4732 "12.02.18".to_string(),
4733 ))),
4734 },
4735 PositionalToken {
4736 source: uws,
4737 offset: 8,
4738 length: 1,
4739 token: Token::Special(Special::Separator(Separator::Space)),
4740 },
4741 PositionalToken {
4742 source: uws,
4743 offset: 9,
4744 length: 8,
4745 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4746 "31.28.34".to_string(),
4747 ))),
4748 },
4749 PositionalToken {
4750 source: uws,
4751 offset: 17,
4752 length: 1,
4753 token: Token::Special(Special::Separator(Separator::Space)),
4754 },
4755 PositionalToken {
4756 source: uws,
4757 offset: 18,
4758 length: 10,
4759 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4760 "23.11.2018".to_string(),
4761 ))),
4762 },
4763 PositionalToken {
4764 source: uws,
4765 offset: 28,
4766 length: 1,
4767 token: Token::Special(Special::Separator(Separator::Space)),
4768 },
4769 PositionalToken {
4770 source: uws,
4771 offset: 29,
4772 length: 19,
4773 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4774 "123.568.365.234.578".to_string(),
4775 ))),
4776 },
4777 PositionalToken {
4778 source: uws,
4779 offset: 48,
4780 length: 1,
4781 token: Token::Special(Special::Separator(Separator::Space)),
4782 },
4783 PositionalToken {
4784 source: uws,
4785 offset: 49,
4786 length: 9,
4787 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4788 "127.0.0.1".to_string(),
4789 ))),
4790 },
4791 PositionalToken {
4792 source: uws,
4793 offset: 58,
4794 length: 1,
4795 token: Token::Special(Special::Separator(Separator::Space)),
4796 },
4797 PositionalToken {
4798 source: uws,
4799 offset: 59,
4800 length: 3,
4801 token: Token::Word(Word::Numerical(Numerical::Measures("1st".to_string()))),
4802 },
4803 PositionalToken {
4804 source: uws,
4805 offset: 62,
4806 length: 1,
4807 token: Token::Special(Special::Separator(Separator::Space)),
4808 },
4809 PositionalToken {
4810 source: uws,
4811 offset: 63,
4812 length: 5,
4813 token: Token::Word(Word::Numerical(Numerical::Measures("1кг".to_string()))),
4814 },
4815 PositionalToken {
4816 source: uws,
4817 offset: 68,
4818 length: 1,
4819 token: Token::Special(Special::Separator(Separator::Space)),
4820 },
4821 PositionalToken {
4822 source: uws,
4823 offset: 69,
4824 length: 20,
4825 token: Token::Word(Word::Numerical(Numerical::Measures(
4826 "123123афываыв".to_string(),
4827 ))),
4828 },
4829 PositionalToken {
4830 source: uws,
4831 offset: 89,
4832 length: 1,
4833 token: Token::Special(Special::Separator(Separator::Space)),
4834 },
4835 PositionalToken {
4836 source: uws,
4837 offset: 90,
4838 length: 34,
4839 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
4840 "12321фвафыов234выалфо".to_string(),
4841 ))),
4842 },
4843 PositionalToken {
4844 source: uws,
4845 offset: 124,
4846 length: 1,
4847 token: Token::Special(Special::Separator(Separator::Space)),
4848 },
4849 PositionalToken {
4850 source: uws,
4851 offset: 125,
4852 length: 20,
4853 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
4854 "12_123_343.4234_4234".to_string(),
4855 ))),
4856 },
4857 ];
4858 check_results(&result, &lib_res, uws);
4859 }
4860
4861 #[test]
4862 fn numerical_default() {
4863 let uws = "12.02.18 31.28.34 23.11.2018 123.568.365.234.578 127.0.0.1 1st 1кг 123123афываыв 12321фвафыов234выалфо 12_123_343.4234_4234";
4864 let lib_res = uws
4865 .into_tokenizer(TokenizerParams::v1())
4866 .collect::<Vec<_>>();
4867 let result = vec![
4869 PositionalToken {
4870 source: uws,
4871 offset: 0,
4872 length: 2,
4873 token: Token::Word(Word::Number(Number::Integer(12))),
4874 },
4875 PositionalToken {
4876 source: uws,
4877 offset: 2,
4878 length: 1,
4879 token: Token::Special(Special::Punctuation('.')),
4880 },
4881 PositionalToken {
4882 source: uws,
4883 offset: 3,
4884 length: 2,
4885 token: Token::Word(Word::Number(Number::Integer(2))),
4886 },
4887 PositionalToken {
4888 source: uws,
4889 offset: 5,
4890 length: 1,
4891 token: Token::Special(Special::Punctuation('.')),
4892 },
4893 PositionalToken {
4894 source: uws,
4895 offset: 6,
4896 length: 2,
4897 token: Token::Word(Word::Number(Number::Integer(18))),
4898 },
4899 PositionalToken {
4900 source: uws,
4901 offset: 8,
4902 length: 1,
4903 token: Token::Special(Special::Separator(Separator::Space)),
4904 },
4905 PositionalToken {
4906 source: uws,
4907 offset: 9,
4908 length: 2,
4909 token: Token::Word(Word::Number(Number::Integer(31))),
4910 },
4911 PositionalToken {
4912 source: uws,
4913 offset: 11,
4914 length: 1,
4915 token: Token::Special(Special::Punctuation('.')),
4916 },
4917 PositionalToken {
4918 source: uws,
4919 offset: 12,
4920 length: 2,
4921 token: Token::Word(Word::Number(Number::Integer(28))),
4922 },
4923 PositionalToken {
4924 source: uws,
4925 offset: 14,
4926 length: 1,
4927 token: Token::Special(Special::Punctuation('.')),
4928 },
4929 PositionalToken {
4930 source: uws,
4931 offset: 15,
4932 length: 2,
4933 token: Token::Word(Word::Number(Number::Integer(34))),
4934 },
4935 PositionalToken {
4936 source: uws,
4937 offset: 17,
4938 length: 1,
4939 token: Token::Special(Special::Separator(Separator::Space)),
4940 },
4941 PositionalToken {
4942 source: uws,
4943 offset: 18,
4944 length: 2,
4945 token: Token::Word(Word::Number(Number::Integer(23))),
4946 },
4947 PositionalToken {
4948 source: uws,
4949 offset: 20,
4950 length: 1,
4951 token: Token::Special(Special::Punctuation('.')),
4952 },
4953 PositionalToken {
4954 source: uws,
4955 offset: 21,
4956 length: 2,
4957 token: Token::Word(Word::Number(Number::Integer(11))),
4958 },
4959 PositionalToken {
4960 source: uws,
4961 offset: 23,
4962 length: 1,
4963 token: Token::Special(Special::Punctuation('.')),
4964 },
4965 PositionalToken {
4966 source: uws,
4967 offset: 24,
4968 length: 4,
4969 token: Token::Word(Word::Number(Number::Integer(2018))),
4970 },
4971 PositionalToken {
4972 source: uws,
4973 offset: 28,
4974 length: 1,
4975 token: Token::Special(Special::Separator(Separator::Space)),
4976 },
4977 PositionalToken {
4978 source: uws,
4979 offset: 29,
4980 length: 3,
4981 token: Token::Word(Word::Number(Number::Integer(123))),
4982 },
4983 PositionalToken {
4984 source: uws,
4985 offset: 32,
4986 length: 1,
4987 token: Token::Special(Special::Punctuation('.')),
4988 },
4989 PositionalToken {
4990 source: uws,
4991 offset: 33,
4992 length: 3,
4993 token: Token::Word(Word::Number(Number::Integer(568))),
4994 },
4995 PositionalToken {
4996 source: uws,
4997 offset: 36,
4998 length: 1,
4999 token: Token::Special(Special::Punctuation('.')),
5000 },
5001 PositionalToken {
5002 source: uws,
5003 offset: 37,
5004 length: 3,
5005 token: Token::Word(Word::Number(Number::Integer(365))),
5006 },
5007 PositionalToken {
5008 source: uws,
5009 offset: 40,
5010 length: 1,
5011 token: Token::Special(Special::Punctuation('.')),
5012 },
5013 PositionalToken {
5014 source: uws,
5015 offset: 41,
5016 length: 3,
5017 token: Token::Word(Word::Number(Number::Integer(234))),
5018 },
5019 PositionalToken {
5020 source: uws,
5021 offset: 44,
5022 length: 1,
5023 token: Token::Special(Special::Punctuation('.')),
5024 },
5025 PositionalToken {
5026 source: uws,
5027 offset: 45,
5028 length: 3,
5029 token: Token::Word(Word::Number(Number::Integer(578))),
5030 },
5031 PositionalToken {
5032 source: uws,
5033 offset: 48,
5034 length: 1,
5035 token: Token::Special(Special::Separator(Separator::Space)),
5036 },
5037 PositionalToken {
5038 source: uws,
5039 offset: 49,
5040 length: 3,
5041 token: Token::Word(Word::Number(Number::Integer(127))),
5042 },
5043 PositionalToken {
5044 source: uws,
5045 offset: 52,
5046 length: 1,
5047 token: Token::Special(Special::Punctuation('.')),
5048 },
5049 PositionalToken {
5050 source: uws,
5051 offset: 53,
5052 length: 1,
5053 token: Token::Word(Word::Number(Number::Integer(0))),
5054 },
5055 PositionalToken {
5056 source: uws,
5057 offset: 54,
5058 length: 1,
5059 token: Token::Special(Special::Punctuation('.')),
5060 },
5061 PositionalToken {
5062 source: uws,
5063 offset: 55,
5064 length: 1,
5065 token: Token::Word(Word::Number(Number::Integer(0))),
5066 },
5067 PositionalToken {
5068 source: uws,
5069 offset: 56,
5070 length: 1,
5071 token: Token::Special(Special::Punctuation('.')),
5072 },
5073 PositionalToken {
5074 source: uws,
5075 offset: 57,
5076 length: 1,
5077 token: Token::Word(Word::Number(Number::Integer(1))),
5078 },
5079 PositionalToken {
5080 source: uws,
5081 offset: 58,
5082 length: 1,
5083 token: Token::Special(Special::Separator(Separator::Space)),
5084 },
5085 PositionalToken {
5086 source: uws,
5087 offset: 59,
5088 length: 3,
5089 token: Token::Word(Word::Numerical(Numerical::Measures("1st".to_string()))),
5090 },
5091 PositionalToken {
5092 source: uws,
5093 offset: 62,
5094 length: 1,
5095 token: Token::Special(Special::Separator(Separator::Space)),
5096 },
5097 PositionalToken {
5098 source: uws,
5099 offset: 63,
5100 length: 5,
5101 token: Token::Word(Word::Numerical(Numerical::Measures("1кг".to_string()))),
5102 },
5103 PositionalToken {
5104 source: uws,
5105 offset: 68,
5106 length: 1,
5107 token: Token::Special(Special::Separator(Separator::Space)),
5108 },
5109 PositionalToken {
5110 source: uws,
5111 offset: 69,
5112 length: 20,
5113 token: Token::Word(Word::Numerical(Numerical::Measures(
5114 "123123афываыв".to_string(),
5115 ))),
5116 },
5117 PositionalToken {
5118 source: uws,
5119 offset: 89,
5120 length: 1,
5121 token: Token::Special(Special::Separator(Separator::Space)),
5122 },
5123 PositionalToken {
5124 source: uws,
5125 offset: 90,
5126 length: 34,
5127 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
5128 "12321фвафыов234выалфо".to_string(),
5129 ))),
5130 },
5131 PositionalToken {
5132 source: uws,
5133 offset: 124,
5134 length: 1,
5135 token: Token::Special(Special::Separator(Separator::Space)),
5136 },
5137 PositionalToken {
5138 source: uws,
5139 offset: 125,
5140 length: 2,
5141 token: Token::Word(Word::Number(Number::Integer(12))),
5142 },
5143 PositionalToken {
5144 source: uws,
5145 offset: 127,
5146 length: 1,
5147 token: Token::Special(Special::Punctuation('_')),
5148 },
5149 PositionalToken {
5150 source: uws,
5151 offset: 128,
5152 length: 3,
5153 token: Token::Word(Word::Number(Number::Integer(123))),
5154 },
5155 PositionalToken {
5156 source: uws,
5157 offset: 131,
5158 length: 1,
5159 token: Token::Special(Special::Punctuation('_')),
5160 },
5161 PositionalToken {
5162 source: uws,
5163 offset: 132,
5164 length: 3,
5165 token: Token::Word(Word::Number(Number::Integer(343))),
5166 },
5167 PositionalToken {
5168 source: uws,
5169 offset: 135,
5170 length: 1,
5171 token: Token::Special(Special::Punctuation('.')),
5172 },
5173 PositionalToken {
5174 source: uws,
5175 offset: 136,
5176 length: 4,
5177 token: Token::Word(Word::Number(Number::Integer(4234))),
5178 },
5179 PositionalToken {
5180 source: uws,
5181 offset: 140,
5182 length: 1,
5183 token: Token::Special(Special::Punctuation('_')),
5184 },
5185 PositionalToken {
5186 source: uws,
5187 offset: 141,
5188 length: 4,
5189 token: Token::Word(Word::Number(Number::Integer(4234))),
5190 },
5191 ];
5192 check_results(&result, &lib_res, uws);
5193 }
5194
5195 enum Lang {
5208 Zho,
5209 Jpn,
5210 Kor,
5211 Ara,
5212 Ell,
5213 }
5214
5215 #[test]
5216 fn test_lang_zho() {
5217 let (uws, result) = get_lang_test(Lang::Zho);
5218 let lib_res = uws
5219 .into_tokenizer(TokenizerParams::v1())
5220 .collect::<Vec<_>>();
5221 check_results(&result, &lib_res, &uws);
5222 }
5223
5224 #[test]
5225 fn test_lang_jpn() {
5226 let (uws, result) = get_lang_test(Lang::Jpn);
5227 let lib_res = uws
5228 .into_tokenizer(TokenizerParams::v1())
5229 .collect::<Vec<_>>();
5230 check_results(&result, &lib_res, &uws);
5231 }
5232
5233 #[test]
5234 fn test_lang_kor() {
5235 let (uws, result) = get_lang_test(Lang::Kor);
5236 let lib_res = uws
5237 .into_tokenizer(TokenizerParams::v1())
5238 .collect::<Vec<_>>();
5239 check_results(&result, &lib_res, &uws);
5240 }
5241
5242 #[test]
5243 fn test_lang_ara() {
5244 let (uws, result) = get_lang_test(Lang::Ara);
5245 let lib_res = uws
5246 .into_tokenizer(TokenizerParams::v1())
5247 .collect::<Vec<_>>();
5248 check_results(&result, &lib_res, &uws);
5249 }
5250
5251 #[test]
5252 fn test_lang_ell() {
5253 let (uws, result) = get_lang_test(Lang::Ell);
5254 let lib_res = uws
5255 .into_tokenizer(TokenizerParams::v1())
5256 .collect::<Vec<_>>();
5257 check_results(&result, &lib_res, &uws);
5258 }
5259
5260 fn get_lang_test(lng: Lang) -> (String, Vec<PositionalToken>) {
5261 let uws = match lng {
5262 Lang::Zho => "美国电视连续剧《超人前传》的第一集《试播集》于2001年10月16日在電視網首播,剧集主创人阿尔弗雷德·高夫和迈尔斯·米勒編劇,大卫·努特尔执导。这一试播首次向观众引荐了克拉克·肯特一角,他是位拥有超能力的外星孤儿,与家人和朋友一起在堪薩斯州虚构小镇斯莫维尔生活。在这一集里,肯特首度得知自己的来历,同时还需要阻止一位学生试图杀死镇上高中多名学生的报复之举。本集节目里引入了多个之后将贯穿全季甚至整部剧集的主题元素,例如几位主要角色之间的三角恋情。电视剧在加拿大溫哥華取景,旨在选用其“美国中产阶级”景观,主创人花了5个月的时间专门用于为主角物色合适的演员。试播集在所有演员选好4天后正式开拍。由于时间上的限制,剧组无法搭建好实体外景,因此只能使用计算机绘图技术将数字化的外景插入到镜头中。节目一经上映就打破了电视网的多项收视纪录,并且获得了评论员的普遍好评和多个奖项提名,并在其中两项上胜出",
5263 Lang::Kor => "플레이스테이션 은 소니 컴퓨터 엔터테인먼트가 개발한 세 번째 가정용 게임기이다. 마이크로소프트의 엑스박스 360, 닌텐도의 Wii와 경쟁하고 있다. 이전 제품에서 온라인 플레이 기능을 비디오 게임 개발사에 전적으로 의존하던 것과 달리 통합 온라인 게임 서비스인 플레이스테이션 네트워크 서비스를 발매와 함께 시작해 제공하고 있으며, 탄탄한 멀티미디어 재생 기능, 플레이스테이션 포터블과의 연결, 고화질 광학 디스크 포맷인 블루레이 디스크 재생 기능 등의 기능을 갖추고 있다. 2006년 11월 11일에 일본에서 처음으로 출시했으며, 11월 17일에는 북미 지역, 2007년 3월 23일에는 유럽과 오세아니아 지역에서, 대한민국의 경우 6월 5일부터 일주일간 예약판매를 실시해, 매일 준비한 수량이 동이 나는 등 많은 관심을 받았으며 6월 16일에 정식 출시 행사를 열었다",
5264 Lang::Jpn => "熊野三山本願所は、15世紀末以降における熊野三山(熊野本宮、熊野新宮、熊野那智)の造営・修造のための勧進を担った組織の総称。 熊野三山を含めて、日本における古代から中世前半にかけての寺社の造営は、寺社領経営のような恒常的財源、幕府や朝廷などからの一時的な造営料所の寄進、あるいは公権力からの臨時の保護によって行われていた。しかしながら、熊野三山では、これらの財源はすべて15世紀半ばまでに実効性を失った",
5265 Lang::Ara => "لشکرکشیهای روسهای وارنگی به دریای خزر مجموعهای از حملات نظامی در بین سالهای ۸۶۴ تا ۱۰۴۱ میلادی به سواحل دریای خزر بودهاست. روسهای وارنگی ابتدا در قرن نهم میلادی به عنوان بازرگانان پوست، عسل و برده در سرزمینهای اسلامی(سرکلند) ظاهر شدند. این بازرگانان در مسیر تجاری ولگا به خرید و فروش میپرداختند. نخستین حملهٔ آنان در فاصله سالهای ۸۶۴ تا ۸۸۴ میلادی در مقیاسی کوچک علیه علویان طبرستان رخ داد. نخستین یورش بزرگ روسها در سال ۹۱۳ رخ داد و آنان با ۵۰۰ فروند درازکشتی شهر گرگان و اطراف آن را غارت کردند. آنها در این حمله مقداری کالا و برده را به تاراج بردند و در راه بازگشتن به سمت شمال، در دلتای ولگا، مورد حملهٔ خزرهای مسلمان قرار گرفتند و بعضی از آنان موفق به فرار شدند، ولی در میانهٔ ولگا به قتل رسیدند. دومین هجوم بزرگ روسها به دریای خزر در سال ۹۴۳ به وقوع پیوست. در این دوره ایگور یکم، حاکم روس کیف، رهبری روسها را در دست داشت. روسها پس از توافق با دولت خزرها برای عبور امن از منطقه، تا رود کورا و اعماق قفقاز پیش رفتند و در سال ۹۴۳ موفق شدند بندر بردعه، پایتخت اران (جمهوری آذربایجان کنونی)، را تصرف کنند. روسها در آنجا به مدت چند ماه ماندند و بسیاری از ساکنان شهر را کشتند و از راه غارتگری اموالی را به تاراج بردند. تنها دلیل بازگشت آنان ",
5266 Lang::Ell => "Το Πρόγραμμα υλοποιείται εξ ολοκλήρου από απόσταση και μπορεί να συμμετέχει κάθε εμπλεκόμενος στη ή/και ενδιαφερόμενος για τη διδασκαλία της Ελληνικής ως δεύτερης/ξένης γλώσσας στην Ελλάδα και στο εξωτερικό, αρκεί να είναι απόφοιτος ελληνικής φιλολογίας, ξένων φιλολογιών, παιδαγωγικών τμημάτων, θεολογικών σχολών ή άλλων πανεπιστημιακών τμημάτων ελληνικών ή ισότιμων ξένων πανεπιστημίων. Υπό όρους γίνονται δεκτοί υποψήφιοι που δεν έχουν ολοκληρώσει σπουδές τριτοβάθμιας εκπαίδευσης.",
5267 };
5268 let tokens = match lng {
5269 Lang::Zho => vec![
5270 PositionalToken {
5271 source: uws,
5272 offset: 0,
5273 length: 3,
5274 token: Token::Word(Word::Word("美".to_string())),
5275 },
5276 PositionalToken {
5277 source: uws,
5278 offset: 3,
5279 length: 3,
5280 token: Token::Word(Word::Word("国".to_string())),
5281 },
5282 PositionalToken {
5283 source: uws,
5284 offset: 6,
5285 length: 3,
5286 token: Token::Word(Word::Word("电".to_string())),
5287 },
5288 PositionalToken {
5289 source: uws,
5290 offset: 9,
5291 length: 3,
5292 token: Token::Word(Word::Word("视".to_string())),
5293 },
5294 PositionalToken {
5295 source: uws,
5296 offset: 12,
5297 length: 3,
5298 token: Token::Word(Word::Word("连".to_string())),
5299 },
5300 PositionalToken {
5301 source: uws,
5302 offset: 15,
5303 length: 3,
5304 token: Token::Word(Word::Word("续".to_string())),
5305 },
5306 PositionalToken {
5307 source: uws,
5308 offset: 18,
5309 length: 3,
5310 token: Token::Word(Word::Word("剧".to_string())),
5311 },
5312 PositionalToken {
5313 source: uws,
5314 offset: 21,
5315 length: 3,
5316 token: Token::Special(Special::Punctuation('《')),
5317 },
5318 PositionalToken {
5319 source: uws,
5320 offset: 24,
5321 length: 3,
5322 token: Token::Word(Word::Word("超".to_string())),
5323 },
5324 PositionalToken {
5325 source: uws,
5326 offset: 27,
5327 length: 3,
5328 token: Token::Word(Word::Word("人".to_string())),
5329 },
5330 PositionalToken {
5331 source: uws,
5332 offset: 30,
5333 length: 3,
5334 token: Token::Word(Word::Word("前".to_string())),
5335 },
5336 PositionalToken {
5337 source: uws,
5338 offset: 33,
5339 length: 3,
5340 token: Token::Word(Word::Word("传".to_string())),
5341 },
5342 PositionalToken {
5343 source: uws,
5344 offset: 36,
5345 length: 3,
5346 token: Token::Special(Special::Punctuation('》')),
5347 },
5348 PositionalToken {
5349 source: uws,
5350 offset: 39,
5351 length: 3,
5352 token: Token::Word(Word::Word("的".to_string())),
5353 },
5354 PositionalToken {
5355 source: uws,
5356 offset: 42,
5357 length: 3,
5358 token: Token::Word(Word::Word("第".to_string())),
5359 },
5360 PositionalToken {
5361 source: uws,
5362 offset: 45,
5363 length: 3,
5364 token: Token::Word(Word::Word("一".to_string())),
5365 },
5366 PositionalToken {
5367 source: uws,
5368 offset: 48,
5369 length: 3,
5370 token: Token::Word(Word::Word("集".to_string())),
5371 },
5372 PositionalToken {
5373 source: uws,
5374 offset: 51,
5375 length: 3,
5376 token: Token::Special(Special::Punctuation('《')),
5377 },
5378 PositionalToken {
5379 source: uws,
5380 offset: 54,
5381 length: 3,
5382 token: Token::Word(Word::Word("试".to_string())),
5383 },
5384 PositionalToken {
5385 source: uws,
5386 offset: 57,
5387 length: 3,
5388 token: Token::Word(Word::Word("播".to_string())),
5389 },
5390 PositionalToken {
5391 source: uws,
5392 offset: 60,
5393 length: 3,
5394 token: Token::Word(Word::Word("集".to_string())),
5395 },
5396 PositionalToken {
5397 source: uws,
5398 offset: 63,
5399 length: 3,
5400 token: Token::Special(Special::Punctuation('》')),
5401 },
5402 PositionalToken {
5403 source: uws,
5404 offset: 66,
5405 length: 3,
5406 token: Token::Word(Word::Word("于".to_string())),
5407 },
5408 PositionalToken {
5409 source: uws,
5410 offset: 69,
5411 length: 4,
5412 token: Token::Word(Word::Number(Number::Integer(2001))),
5413 },
5414 PositionalToken {
5415 source: uws,
5416 offset: 73,
5417 length: 3,
5418 token: Token::Word(Word::Word("年".to_string())),
5419 },
5420 PositionalToken {
5421 source: uws,
5422 offset: 76,
5423 length: 2,
5424 token: Token::Word(Word::Number(Number::Integer(10))),
5425 },
5426 PositionalToken {
5427 source: uws,
5428 offset: 78,
5429 length: 3,
5430 token: Token::Word(Word::Word("月".to_string())),
5431 },
5432 PositionalToken {
5433 source: uws,
5434 offset: 81,
5435 length: 2,
5436 token: Token::Word(Word::Number(Number::Integer(16))),
5437 },
5438 PositionalToken {
5439 source: uws,
5440 offset: 83,
5441 length: 3,
5442 token: Token::Word(Word::Word("日".to_string())),
5443 },
5444 PositionalToken {
5445 source: uws,
5446 offset: 86,
5447 length: 3,
5448 token: Token::Word(Word::Word("在".to_string())),
5449 },
5450 PositionalToken {
5451 source: uws,
5452 offset: 89,
5453 length: 3,
5454 token: Token::Word(Word::Word("電".to_string())),
5455 },
5456 PositionalToken {
5457 source: uws,
5458 offset: 92,
5459 length: 3,
5460 token: Token::Word(Word::Word("視".to_string())),
5461 },
5462 PositionalToken {
5463 source: uws,
5464 offset: 95,
5465 length: 3,
5466 token: Token::Word(Word::Word("網".to_string())),
5467 },
5468 PositionalToken {
5469 source: uws,
5470 offset: 98,
5471 length: 3,
5472 token: Token::Word(Word::Word("首".to_string())),
5473 },
5474 PositionalToken {
5475 source: uws,
5476 offset: 101,
5477 length: 3,
5478 token: Token::Word(Word::Word("播".to_string())),
5479 },
5480 PositionalToken {
5481 source: uws,
5482 offset: 104,
5483 length: 3,
5484 token: Token::Special(Special::Punctuation(',')),
5485 },
5486 PositionalToken {
5487 source: uws,
5488 offset: 107,
5489 length: 3,
5490 token: Token::Word(Word::Word("剧".to_string())),
5491 },
5492 PositionalToken {
5493 source: uws,
5494 offset: 110,
5495 length: 3,
5496 token: Token::Word(Word::Word("集".to_string())),
5497 },
5498 PositionalToken {
5499 source: uws,
5500 offset: 113,
5501 length: 3,
5502 token: Token::Word(Word::Word("主".to_string())),
5503 },
5504 PositionalToken {
5505 source: uws,
5506 offset: 116,
5507 length: 3,
5508 token: Token::Word(Word::Word("创".to_string())),
5509 },
5510 PositionalToken {
5511 source: uws,
5512 offset: 119,
5513 length: 3,
5514 token: Token::Word(Word::Word("人".to_string())),
5515 },
5516 PositionalToken {
5517 source: uws,
5518 offset: 122,
5519 length: 3,
5520 token: Token::Word(Word::Word("阿".to_string())),
5521 },
5522 PositionalToken {
5523 source: uws,
5524 offset: 125,
5525 length: 3,
5526 token: Token::Word(Word::Word("尔".to_string())),
5527 },
5528 PositionalToken {
5529 source: uws,
5530 offset: 128,
5531 length: 3,
5532 token: Token::Word(Word::Word("弗".to_string())),
5533 },
5534 PositionalToken {
5535 source: uws,
5536 offset: 131,
5537 length: 3,
5538 token: Token::Word(Word::Word("雷".to_string())),
5539 },
5540 PositionalToken {
5541 source: uws,
5542 offset: 134,
5543 length: 3,
5544 token: Token::Word(Word::Word("德".to_string())),
5545 },
5546 PositionalToken {
5547 source: uws,
5548 offset: 137,
5549 length: 2,
5550 token: Token::Special(Special::Punctuation('·')),
5551 },
5552 PositionalToken {
5553 source: uws,
5554 offset: 139,
5555 length: 3,
5556 token: Token::Word(Word::Word("高".to_string())),
5557 },
5558 PositionalToken {
5559 source: uws,
5560 offset: 142,
5561 length: 3,
5562 token: Token::Word(Word::Word("夫".to_string())),
5563 },
5564 PositionalToken {
5565 source: uws,
5566 offset: 145,
5567 length: 3,
5568 token: Token::Word(Word::Word("和".to_string())),
5569 },
5570 PositionalToken {
5571 source: uws,
5572 offset: 148,
5573 length: 3,
5574 token: Token::Word(Word::Word("迈".to_string())),
5575 },
5576 PositionalToken {
5577 source: uws,
5578 offset: 151,
5579 length: 3,
5580 token: Token::Word(Word::Word("尔".to_string())),
5581 },
5582 PositionalToken {
5583 source: uws,
5584 offset: 154,
5585 length: 3,
5586 token: Token::Word(Word::Word("斯".to_string())),
5587 },
5588 PositionalToken {
5589 source: uws,
5590 offset: 157,
5591 length: 2,
5592 token: Token::Special(Special::Punctuation('·')),
5593 },
5594 PositionalToken {
5595 source: uws,
5596 offset: 159,
5597 length: 3,
5598 token: Token::Word(Word::Word("米".to_string())),
5599 },
5600 PositionalToken {
5601 source: uws,
5602 offset: 162,
5603 length: 3,
5604 token: Token::Word(Word::Word("勒".to_string())),
5605 },
5606 PositionalToken {
5607 source: uws,
5608 offset: 165,
5609 length: 3,
5610 token: Token::Word(Word::Word("編".to_string())),
5611 },
5612 PositionalToken {
5613 source: uws,
5614 offset: 168,
5615 length: 3,
5616 token: Token::Word(Word::Word("劇".to_string())),
5617 },
5618 PositionalToken {
5619 source: uws,
5620 offset: 171,
5621 length: 3,
5622 token: Token::Special(Special::Punctuation(',')),
5623 },
5624 PositionalToken {
5625 source: uws,
5626 offset: 174,
5627 length: 3,
5628 token: Token::Word(Word::Word("大".to_string())),
5629 },
5630 PositionalToken {
5631 source: uws,
5632 offset: 177,
5633 length: 3,
5634 token: Token::Word(Word::Word("卫".to_string())),
5635 },
5636 PositionalToken {
5637 source: uws,
5638 offset: 180,
5639 length: 2,
5640 token: Token::Special(Special::Punctuation('·')),
5641 },
5642 PositionalToken {
5643 source: uws,
5644 offset: 182,
5645 length: 3,
5646 token: Token::Word(Word::Word("努".to_string())),
5647 },
5648 PositionalToken {
5649 source: uws,
5650 offset: 185,
5651 length: 3,
5652 token: Token::Word(Word::Word("特".to_string())),
5653 },
5654 PositionalToken {
5655 source: uws,
5656 offset: 188,
5657 length: 3,
5658 token: Token::Word(Word::Word("尔".to_string())),
5659 },
5660 PositionalToken {
5661 source: uws,
5662 offset: 191,
5663 length: 3,
5664 token: Token::Word(Word::Word("执".to_string())),
5665 },
5666 PositionalToken {
5667 source: uws,
5668 offset: 194,
5669 length: 3,
5670 token: Token::Word(Word::Word("导".to_string())),
5671 },
5672 PositionalToken {
5673 source: uws,
5674 offset: 197,
5675 length: 3,
5676 token: Token::Special(Special::Punctuation('。')),
5677 },
5678 PositionalToken {
5679 source: uws,
5680 offset: 200,
5681 length: 3,
5682 token: Token::Word(Word::Word("这".to_string())),
5683 },
5684 PositionalToken {
5685 source: uws,
5686 offset: 203,
5687 length: 3,
5688 token: Token::Word(Word::Word("一".to_string())),
5689 },
5690 PositionalToken {
5691 source: uws,
5692 offset: 206,
5693 length: 3,
5694 token: Token::Word(Word::Word("试".to_string())),
5695 },
5696 PositionalToken {
5697 source: uws,
5698 offset: 209,
5699 length: 3,
5700 token: Token::Word(Word::Word("播".to_string())),
5701 },
5702 PositionalToken {
5703 source: uws,
5704 offset: 212,
5705 length: 3,
5706 token: Token::Word(Word::Word("首".to_string())),
5707 },
5708 PositionalToken {
5709 source: uws,
5710 offset: 215,
5711 length: 3,
5712 token: Token::Word(Word::Word("次".to_string())),
5713 },
5714 PositionalToken {
5715 source: uws,
5716 offset: 218,
5717 length: 3,
5718 token: Token::Word(Word::Word("向".to_string())),
5719 },
5720 PositionalToken {
5721 source: uws,
5722 offset: 221,
5723 length: 3,
5724 token: Token::Word(Word::Word("观".to_string())),
5725 },
5726 PositionalToken {
5727 source: uws,
5728 offset: 224,
5729 length: 3,
5730 token: Token::Word(Word::Word("众".to_string())),
5731 },
5732 PositionalToken {
5733 source: uws,
5734 offset: 227,
5735 length: 3,
5736 token: Token::Word(Word::Word("引".to_string())),
5737 },
5738 PositionalToken {
5739 source: uws,
5740 offset: 230,
5741 length: 3,
5742 token: Token::Word(Word::Word("荐".to_string())),
5743 },
5744 PositionalToken {
5745 source: uws,
5746 offset: 233,
5747 length: 3,
5748 token: Token::Word(Word::Word("了".to_string())),
5749 },
5750 PositionalToken {
5751 source: uws,
5752 offset: 236,
5753 length: 3,
5754 token: Token::Word(Word::Word("克".to_string())),
5755 },
5756 PositionalToken {
5757 source: uws,
5758 offset: 239,
5759 length: 3,
5760 token: Token::Word(Word::Word("拉".to_string())),
5761 },
5762 PositionalToken {
5763 source: uws,
5764 offset: 242,
5765 length: 3,
5766 token: Token::Word(Word::Word("克".to_string())),
5767 },
5768 PositionalToken {
5769 source: uws,
5770 offset: 245,
5771 length: 2,
5772 token: Token::Special(Special::Punctuation('·')),
5773 },
5774 PositionalToken {
5775 source: uws,
5776 offset: 247,
5777 length: 3,
5778 token: Token::Word(Word::Word("肯".to_string())),
5779 },
5780 PositionalToken {
5781 source: uws,
5782 offset: 250,
5783 length: 3,
5784 token: Token::Word(Word::Word("特".to_string())),
5785 },
5786 PositionalToken {
5787 source: uws,
5788 offset: 253,
5789 length: 3,
5790 token: Token::Word(Word::Word("一".to_string())),
5791 },
5792 PositionalToken {
5793 source: uws,
5794 offset: 256,
5795 length: 3,
5796 token: Token::Word(Word::Word("角".to_string())),
5797 },
5798 PositionalToken {
5799 source: uws,
5800 offset: 259,
5801 length: 3,
5802 token: Token::Special(Special::Punctuation(',')),
5803 },
5804 PositionalToken {
5805 source: uws,
5806 offset: 262,
5807 length: 3,
5808 token: Token::Word(Word::Word("他".to_string())),
5809 },
5810 PositionalToken {
5811 source: uws,
5812 offset: 265,
5813 length: 3,
5814 token: Token::Word(Word::Word("是".to_string())),
5815 },
5816 PositionalToken {
5817 source: uws,
5818 offset: 268,
5819 length: 3,
5820 token: Token::Word(Word::Word("位".to_string())),
5821 },
5822 PositionalToken {
5823 source: uws,
5824 offset: 271,
5825 length: 3,
5826 token: Token::Word(Word::Word("拥".to_string())),
5827 },
5828 PositionalToken {
5829 source: uws,
5830 offset: 274,
5831 length: 3,
5832 token: Token::Word(Word::Word("有".to_string())),
5833 },
5834 PositionalToken {
5835 source: uws,
5836 offset: 277,
5837 length: 3,
5838 token: Token::Word(Word::Word("超".to_string())),
5839 },
5840 ],
5841 Lang::Jpn => vec![
5842 PositionalToken {
5843 source: uws,
5844 offset: 0,
5845 length: 3,
5846 token: Token::Word(Word::Word("熊".to_string())),
5847 },
5848 PositionalToken {
5849 source: uws,
5850 offset: 3,
5851 length: 3,
5852 token: Token::Word(Word::Word("野".to_string())),
5853 },
5854 PositionalToken {
5855 source: uws,
5856 offset: 6,
5857 length: 3,
5858 token: Token::Word(Word::Word("三".to_string())),
5859 },
5860 PositionalToken {
5861 source: uws,
5862 offset: 9,
5863 length: 3,
5864 token: Token::Word(Word::Word("山".to_string())),
5865 },
5866 PositionalToken {
5867 source: uws,
5868 offset: 12,
5869 length: 3,
5870 token: Token::Word(Word::Word("本".to_string())),
5871 },
5872 PositionalToken {
5873 source: uws,
5874 offset: 15,
5875 length: 3,
5876 token: Token::Word(Word::Word("願".to_string())),
5877 },
5878 PositionalToken {
5879 source: uws,
5880 offset: 18,
5881 length: 3,
5882 token: Token::Word(Word::Word("所".to_string())),
5883 },
5884 PositionalToken {
5885 source: uws,
5886 offset: 21,
5887 length: 3,
5888 token: Token::Word(Word::Word("は".to_string())),
5889 },
5890 PositionalToken {
5891 source: uws,
5892 offset: 24,
5893 length: 3,
5894 token: Token::Special(Special::Punctuation('、')),
5895 },
5896 PositionalToken {
5897 source: uws,
5898 offset: 27,
5899 length: 2,
5900 token: Token::Word(Word::Number(Number::Integer(15))),
5901 },
5902 PositionalToken {
5903 source: uws,
5904 offset: 29,
5905 length: 3,
5906 token: Token::Word(Word::Word("世".to_string())),
5907 },
5908 PositionalToken {
5909 source: uws,
5910 offset: 32,
5911 length: 3,
5912 token: Token::Word(Word::Word("紀".to_string())),
5913 },
5914 PositionalToken {
5915 source: uws,
5916 offset: 35,
5917 length: 3,
5918 token: Token::Word(Word::Word("末".to_string())),
5919 },
5920 PositionalToken {
5921 source: uws,
5922 offset: 38,
5923 length: 3,
5924 token: Token::Word(Word::Word("以".to_string())),
5925 },
5926 PositionalToken {
5927 source: uws,
5928 offset: 41,
5929 length: 3,
5930 token: Token::Word(Word::Word("降".to_string())),
5931 },
5932 PositionalToken {
5933 source: uws,
5934 offset: 44,
5935 length: 3,
5936 token: Token::Word(Word::Word("に".to_string())),
5937 },
5938 PositionalToken {
5939 source: uws,
5940 offset: 47,
5941 length: 3,
5942 token: Token::Word(Word::Word("お".to_string())),
5943 },
5944 PositionalToken {
5945 source: uws,
5946 offset: 50,
5947 length: 3,
5948 token: Token::Word(Word::Word("け".to_string())),
5949 },
5950 PositionalToken {
5951 source: uws,
5952 offset: 53,
5953 length: 3,
5954 token: Token::Word(Word::Word("る".to_string())),
5955 },
5956 PositionalToken {
5957 source: uws,
5958 offset: 56,
5959 length: 3,
5960 token: Token::Word(Word::Word("熊".to_string())),
5961 },
5962 PositionalToken {
5963 source: uws,
5964 offset: 59,
5965 length: 3,
5966 token: Token::Word(Word::Word("野".to_string())),
5967 },
5968 PositionalToken {
5969 source: uws,
5970 offset: 62,
5971 length: 3,
5972 token: Token::Word(Word::Word("三".to_string())),
5973 },
5974 PositionalToken {
5975 source: uws,
5976 offset: 65,
5977 length: 3,
5978 token: Token::Word(Word::Word("山".to_string())),
5979 },
5980 PositionalToken {
5981 source: uws,
5982 offset: 68,
5983 length: 3,
5984 token: Token::Special(Special::Punctuation('(')),
5985 },
5986 PositionalToken {
5987 source: uws,
5988 offset: 71,
5989 length: 3,
5990 token: Token::Word(Word::Word("熊".to_string())),
5991 },
5992 PositionalToken {
5993 source: uws,
5994 offset: 74,
5995 length: 3,
5996 token: Token::Word(Word::Word("野".to_string())),
5997 },
5998 PositionalToken {
5999 source: uws,
6000 offset: 77,
6001 length: 3,
6002 token: Token::Word(Word::Word("本".to_string())),
6003 },
6004 PositionalToken {
6005 source: uws,
6006 offset: 80,
6007 length: 3,
6008 token: Token::Word(Word::Word("宮".to_string())),
6009 },
6010 PositionalToken {
6011 source: uws,
6012 offset: 83,
6013 length: 3,
6014 token: Token::Special(Special::Punctuation('、')),
6015 },
6016 PositionalToken {
6017 source: uws,
6018 offset: 86,
6019 length: 3,
6020 token: Token::Word(Word::Word("熊".to_string())),
6021 },
6022 PositionalToken {
6023 source: uws,
6024 offset: 89,
6025 length: 3,
6026 token: Token::Word(Word::Word("野".to_string())),
6027 },
6028 PositionalToken {
6029 source: uws,
6030 offset: 92,
6031 length: 3,
6032 token: Token::Word(Word::Word("新".to_string())),
6033 },
6034 PositionalToken {
6035 source: uws,
6036 offset: 95,
6037 length: 3,
6038 token: Token::Word(Word::Word("宮".to_string())),
6039 },
6040 PositionalToken {
6041 source: uws,
6042 offset: 98,
6043 length: 3,
6044 token: Token::Special(Special::Punctuation('、')),
6045 },
6046 PositionalToken {
6047 source: uws,
6048 offset: 101,
6049 length: 3,
6050 token: Token::Word(Word::Word("熊".to_string())),
6051 },
6052 PositionalToken {
6053 source: uws,
6054 offset: 104,
6055 length: 3,
6056 token: Token::Word(Word::Word("野".to_string())),
6057 },
6058 PositionalToken {
6059 source: uws,
6060 offset: 107,
6061 length: 3,
6062 token: Token::Word(Word::Word("那".to_string())),
6063 },
6064 PositionalToken {
6065 source: uws,
6066 offset: 110,
6067 length: 3,
6068 token: Token::Word(Word::Word("智".to_string())),
6069 },
6070 PositionalToken {
6071 source: uws,
6072 offset: 113,
6073 length: 3,
6074 token: Token::Special(Special::Punctuation(')')),
6075 },
6076 PositionalToken {
6077 source: uws,
6078 offset: 116,
6079 length: 3,
6080 token: Token::Word(Word::Word("の".to_string())),
6081 },
6082 PositionalToken {
6083 source: uws,
6084 offset: 119,
6085 length: 3,
6086 token: Token::Word(Word::Word("造".to_string())),
6087 },
6088 PositionalToken {
6089 source: uws,
6090 offset: 122,
6091 length: 3,
6092 token: Token::Word(Word::Word("営".to_string())),
6093 },
6094 PositionalToken {
6095 source: uws,
6096 offset: 125,
6097 length: 3,
6098 token: Token::Special(Special::Punctuation('・')),
6099 },
6100 PositionalToken {
6101 source: uws,
6102 offset: 128,
6103 length: 3,
6104 token: Token::Word(Word::Word("修".to_string())),
6105 },
6106 PositionalToken {
6107 source: uws,
6108 offset: 131,
6109 length: 3,
6110 token: Token::Word(Word::Word("造".to_string())),
6111 },
6112 PositionalToken {
6113 source: uws,
6114 offset: 134,
6115 length: 3,
6116 token: Token::Word(Word::Word("の".to_string())),
6117 },
6118 PositionalToken {
6119 source: uws,
6120 offset: 137,
6121 length: 3,
6122 token: Token::Word(Word::Word("た".to_string())),
6123 },
6124 PositionalToken {
6125 source: uws,
6126 offset: 140,
6127 length: 3,
6128 token: Token::Word(Word::Word("め".to_string())),
6129 },
6130 PositionalToken {
6131 source: uws,
6132 offset: 143,
6133 length: 3,
6134 token: Token::Word(Word::Word("の".to_string())),
6135 },
6136 PositionalToken {
6137 source: uws,
6138 offset: 146,
6139 length: 3,
6140 token: Token::Word(Word::Word("勧".to_string())),
6141 },
6142 PositionalToken {
6143 source: uws,
6144 offset: 149,
6145 length: 3,
6146 token: Token::Word(Word::Word("進".to_string())),
6147 },
6148 PositionalToken {
6149 source: uws,
6150 offset: 152,
6151 length: 3,
6152 token: Token::Word(Word::Word("を".to_string())),
6153 },
6154 PositionalToken {
6155 source: uws,
6156 offset: 155,
6157 length: 3,
6158 token: Token::Word(Word::Word("担".to_string())),
6159 },
6160 PositionalToken {
6161 source: uws,
6162 offset: 158,
6163 length: 3,
6164 token: Token::Word(Word::Word("っ".to_string())),
6165 },
6166 PositionalToken {
6167 source: uws,
6168 offset: 161,
6169 length: 3,
6170 token: Token::Word(Word::Word("た".to_string())),
6171 },
6172 PositionalToken {
6173 source: uws,
6174 offset: 164,
6175 length: 3,
6176 token: Token::Word(Word::Word("組".to_string())),
6177 },
6178 PositionalToken {
6179 source: uws,
6180 offset: 167,
6181 length: 3,
6182 token: Token::Word(Word::Word("織".to_string())),
6183 },
6184 PositionalToken {
6185 source: uws,
6186 offset: 170,
6187 length: 3,
6188 token: Token::Word(Word::Word("の".to_string())),
6189 },
6190 PositionalToken {
6191 source: uws,
6192 offset: 173,
6193 length: 3,
6194 token: Token::Word(Word::Word("総".to_string())),
6195 },
6196 PositionalToken {
6197 source: uws,
6198 offset: 176,
6199 length: 3,
6200 token: Token::Word(Word::Word("称".to_string())),
6201 },
6202 PositionalToken {
6203 source: uws,
6204 offset: 179,
6205 length: 3,
6206 token: Token::Special(Special::Punctuation('。')),
6207 },
6208 PositionalToken {
6209 source: uws,
6210 offset: 182,
6211 length: 1,
6212 token: Token::Special(Special::Separator(Separator::Space)),
6213 },
6214 PositionalToken {
6215 source: uws,
6216 offset: 183,
6217 length: 3,
6218 token: Token::Word(Word::Word("熊".to_string())),
6219 },
6220 PositionalToken {
6221 source: uws,
6222 offset: 186,
6223 length: 3,
6224 token: Token::Word(Word::Word("野".to_string())),
6225 },
6226 PositionalToken {
6227 source: uws,
6228 offset: 189,
6229 length: 3,
6230 token: Token::Word(Word::Word("三".to_string())),
6231 },
6232 PositionalToken {
6233 source: uws,
6234 offset: 192,
6235 length: 3,
6236 token: Token::Word(Word::Word("山".to_string())),
6237 },
6238 PositionalToken {
6239 source: uws,
6240 offset: 195,
6241 length: 3,
6242 token: Token::Word(Word::Word("を".to_string())),
6243 },
6244 PositionalToken {
6245 source: uws,
6246 offset: 198,
6247 length: 3,
6248 token: Token::Word(Word::Word("含".to_string())),
6249 },
6250 PositionalToken {
6251 source: uws,
6252 offset: 201,
6253 length: 3,
6254 token: Token::Word(Word::Word("め".to_string())),
6255 },
6256 PositionalToken {
6257 source: uws,
6258 offset: 204,
6259 length: 3,
6260 token: Token::Word(Word::Word("て".to_string())),
6261 },
6262 PositionalToken {
6263 source: uws,
6264 offset: 207,
6265 length: 3,
6266 token: Token::Special(Special::Punctuation('、')),
6267 },
6268 PositionalToken {
6269 source: uws,
6270 offset: 210,
6271 length: 3,
6272 token: Token::Word(Word::Word("日".to_string())),
6273 },
6274 PositionalToken {
6275 source: uws,
6276 offset: 213,
6277 length: 3,
6278 token: Token::Word(Word::Word("本".to_string())),
6279 },
6280 PositionalToken {
6281 source: uws,
6282 offset: 216,
6283 length: 3,
6284 token: Token::Word(Word::Word("に".to_string())),
6285 },
6286 PositionalToken {
6287 source: uws,
6288 offset: 219,
6289 length: 3,
6290 token: Token::Word(Word::Word("お".to_string())),
6291 },
6292 PositionalToken {
6293 source: uws,
6294 offset: 222,
6295 length: 3,
6296 token: Token::Word(Word::Word("け".to_string())),
6297 },
6298 PositionalToken {
6299 source: uws,
6300 offset: 225,
6301 length: 3,
6302 token: Token::Word(Word::Word("る".to_string())),
6303 },
6304 PositionalToken {
6305 source: uws,
6306 offset: 228,
6307 length: 3,
6308 token: Token::Word(Word::Word("古".to_string())),
6309 },
6310 PositionalToken {
6311 source: uws,
6312 offset: 231,
6313 length: 3,
6314 token: Token::Word(Word::Word("代".to_string())),
6315 },
6316 PositionalToken {
6317 source: uws,
6318 offset: 234,
6319 length: 3,
6320 token: Token::Word(Word::Word("か".to_string())),
6321 },
6322 PositionalToken {
6323 source: uws,
6324 offset: 237,
6325 length: 3,
6326 token: Token::Word(Word::Word("ら".to_string())),
6327 },
6328 PositionalToken {
6329 source: uws,
6330 offset: 240,
6331 length: 3,
6332 token: Token::Word(Word::Word("中".to_string())),
6333 },
6334 PositionalToken {
6335 source: uws,
6336 offset: 243,
6337 length: 3,
6338 token: Token::Word(Word::Word("世".to_string())),
6339 },
6340 PositionalToken {
6341 source: uws,
6342 offset: 246,
6343 length: 3,
6344 token: Token::Word(Word::Word("前".to_string())),
6345 },
6346 PositionalToken {
6347 source: uws,
6348 offset: 249,
6349 length: 3,
6350 token: Token::Word(Word::Word("半".to_string())),
6351 },
6352 PositionalToken {
6353 source: uws,
6354 offset: 252,
6355 length: 3,
6356 token: Token::Word(Word::Word("に".to_string())),
6357 },
6358 PositionalToken {
6359 source: uws,
6360 offset: 255,
6361 length: 3,
6362 token: Token::Word(Word::Word("か".to_string())),
6363 },
6364 PositionalToken {
6365 source: uws,
6366 offset: 258,
6367 length: 3,
6368 token: Token::Word(Word::Word("け".to_string())),
6369 },
6370 PositionalToken {
6371 source: uws,
6372 offset: 261,
6373 length: 3,
6374 token: Token::Word(Word::Word("て".to_string())),
6375 },
6376 PositionalToken {
6377 source: uws,
6378 offset: 264,
6379 length: 3,
6380 token: Token::Word(Word::Word("の".to_string())),
6381 },
6382 PositionalToken {
6383 source: uws,
6384 offset: 267,
6385 length: 3,
6386 token: Token::Word(Word::Word("寺".to_string())),
6387 },
6388 PositionalToken {
6389 source: uws,
6390 offset: 270,
6391 length: 3,
6392 token: Token::Word(Word::Word("社".to_string())),
6393 },
6394 PositionalToken {
6395 source: uws,
6396 offset: 273,
6397 length: 3,
6398 token: Token::Word(Word::Word("の".to_string())),
6399 },
6400 PositionalToken {
6401 source: uws,
6402 offset: 276,
6403 length: 3,
6404 token: Token::Word(Word::Word("造".to_string())),
6405 },
6406 PositionalToken {
6407 source: uws,
6408 offset: 279,
6409 length: 3,
6410 token: Token::Word(Word::Word("営".to_string())),
6411 },
6412 PositionalToken {
6413 source: uws,
6414 offset: 282,
6415 length: 3,
6416 token: Token::Word(Word::Word("は".to_string())),
6417 },
6418 PositionalToken {
6419 source: uws,
6420 offset: 285,
6421 length: 3,
6422 token: Token::Special(Special::Punctuation('、')),
6423 },
6424 PositionalToken {
6425 source: uws,
6426 offset: 288,
6427 length: 3,
6428 token: Token::Word(Word::Word("寺".to_string())),
6429 },
6430 PositionalToken {
6431 source: uws,
6432 offset: 291,
6433 length: 3,
6434 token: Token::Word(Word::Word("社".to_string())),
6435 },
6436 ],
6437 Lang::Kor => vec![
6438 PositionalToken {
6439 source: uws,
6440 offset: 0,
6441 length: 21,
6442 token: Token::Word(Word::Word("플레이스테이션".to_string())),
6443 },
6444 PositionalToken {
6445 source: uws,
6446 offset: 21,
6447 length: 1,
6448 token: Token::Special(Special::Separator(Separator::Space)),
6449 },
6450 PositionalToken {
6451 source: uws,
6452 offset: 22,
6453 length: 3,
6454 token: Token::Word(Word::Word("은".to_string())),
6455 },
6456 PositionalToken {
6457 source: uws,
6458 offset: 25,
6459 length: 1,
6460 token: Token::Special(Special::Separator(Separator::Space)),
6461 },
6462 PositionalToken {
6463 source: uws,
6464 offset: 26,
6465 length: 6,
6466 token: Token::Word(Word::Word("소니".to_string())),
6467 },
6468 PositionalToken {
6469 source: uws,
6470 offset: 32,
6471 length: 1,
6472 token: Token::Special(Special::Separator(Separator::Space)),
6473 },
6474 PositionalToken {
6475 source: uws,
6476 offset: 33,
6477 length: 9,
6478 token: Token::Word(Word::Word("컴퓨터".to_string())),
6479 },
6480 PositionalToken {
6481 source: uws,
6482 offset: 42,
6483 length: 1,
6484 token: Token::Special(Special::Separator(Separator::Space)),
6485 },
6486 PositionalToken {
6487 source: uws,
6488 offset: 43,
6489 length: 21,
6490 token: Token::Word(Word::Word("엔터테인먼트가".to_string())),
6491 },
6492 PositionalToken {
6493 source: uws,
6494 offset: 64,
6495 length: 1,
6496 token: Token::Special(Special::Separator(Separator::Space)),
6497 },
6498 PositionalToken {
6499 source: uws,
6500 offset: 65,
6501 length: 9,
6502 token: Token::Word(Word::Word("개발한".to_string())),
6503 },
6504 PositionalToken {
6505 source: uws,
6506 offset: 74,
6507 length: 1,
6508 token: Token::Special(Special::Separator(Separator::Space)),
6509 },
6510 PositionalToken {
6511 source: uws,
6512 offset: 75,
6513 length: 3,
6514 token: Token::Word(Word::Word("세".to_string())),
6515 },
6516 PositionalToken {
6517 source: uws,
6518 offset: 78,
6519 length: 1,
6520 token: Token::Special(Special::Separator(Separator::Space)),
6521 },
6522 PositionalToken {
6523 source: uws,
6524 offset: 79,
6525 length: 6,
6526 token: Token::Word(Word::Word("번째".to_string())),
6527 },
6528 PositionalToken {
6529 source: uws,
6530 offset: 85,
6531 length: 1,
6532 token: Token::Special(Special::Separator(Separator::Space)),
6533 },
6534 PositionalToken {
6535 source: uws,
6536 offset: 86,
6537 length: 9,
6538 token: Token::Word(Word::Word("가정용".to_string())),
6539 },
6540 PositionalToken {
6541 source: uws,
6542 offset: 95,
6543 length: 1,
6544 token: Token::Special(Special::Separator(Separator::Space)),
6545 },
6546 PositionalToken {
6547 source: uws,
6548 offset: 96,
6549 length: 15,
6550 token: Token::Word(Word::Word("게임기이다".to_string())),
6551 },
6552 PositionalToken {
6553 source: uws,
6554 offset: 111,
6555 length: 1,
6556 token: Token::Special(Special::Punctuation('.')),
6557 },
6558 PositionalToken {
6559 source: uws,
6560 offset: 112,
6561 length: 1,
6562 token: Token::Special(Special::Separator(Separator::Space)),
6563 },
6564 PositionalToken {
6565 source: uws,
6566 offset: 113,
6567 length: 24,
6568 token: Token::Word(Word::Word("마이크로소프트의".to_string())),
6569 },
6570 PositionalToken {
6571 source: uws,
6572 offset: 137,
6573 length: 1,
6574 token: Token::Special(Special::Separator(Separator::Space)),
6575 },
6576 PositionalToken {
6577 source: uws,
6578 offset: 138,
6579 length: 12,
6580 token: Token::Word(Word::Word("엑스박스".to_string())),
6581 },
6582 PositionalToken {
6583 source: uws,
6584 offset: 150,
6585 length: 1,
6586 token: Token::Special(Special::Separator(Separator::Space)),
6587 },
6588 PositionalToken {
6589 source: uws,
6590 offset: 151,
6591 length: 3,
6592 token: Token::Word(Word::Number(Number::Integer(360))),
6593 },
6594 PositionalToken {
6595 source: uws,
6596 offset: 154,
6597 length: 1,
6598 token: Token::Special(Special::Punctuation(',')),
6599 },
6600 PositionalToken {
6601 source: uws,
6602 offset: 155,
6603 length: 1,
6604 token: Token::Special(Special::Separator(Separator::Space)),
6605 },
6606 PositionalToken {
6607 source: uws,
6608 offset: 156,
6609 length: 12,
6610 token: Token::Word(Word::Word("닌텐도의".to_string())),
6611 },
6612 PositionalToken {
6613 source: uws,
6614 offset: 168,
6615 length: 1,
6616 token: Token::Special(Special::Separator(Separator::Space)),
6617 },
6618 PositionalToken {
6619 source: uws,
6620 offset: 169,
6621 length: 6,
6622 token: Token::Word(Word::Word("Wii와".to_string())),
6623 },
6624 PositionalToken {
6625 source: uws,
6626 offset: 175,
6627 length: 1,
6628 token: Token::Special(Special::Separator(Separator::Space)),
6629 },
6630 PositionalToken {
6631 source: uws,
6632 offset: 176,
6633 length: 12,
6634 token: Token::Word(Word::Word("경쟁하고".to_string())),
6635 },
6636 PositionalToken {
6637 source: uws,
6638 offset: 188,
6639 length: 1,
6640 token: Token::Special(Special::Separator(Separator::Space)),
6641 },
6642 PositionalToken {
6643 source: uws,
6644 offset: 189,
6645 length: 6,
6646 token: Token::Word(Word::Word("있다".to_string())),
6647 },
6648 PositionalToken {
6649 source: uws,
6650 offset: 195,
6651 length: 1,
6652 token: Token::Special(Special::Punctuation('.')),
6653 },
6654 PositionalToken {
6655 source: uws,
6656 offset: 196,
6657 length: 1,
6658 token: Token::Special(Special::Separator(Separator::Space)),
6659 },
6660 PositionalToken {
6661 source: uws,
6662 offset: 197,
6663 length: 6,
6664 token: Token::Word(Word::Word("이전".to_string())),
6665 },
6666 PositionalToken {
6667 source: uws,
6668 offset: 203,
6669 length: 1,
6670 token: Token::Special(Special::Separator(Separator::Space)),
6671 },
6672 PositionalToken {
6673 source: uws,
6674 offset: 204,
6675 length: 12,
6676 token: Token::Word(Word::Word("제품에서".to_string())),
6677 },
6678 PositionalToken {
6679 source: uws,
6680 offset: 216,
6681 length: 1,
6682 token: Token::Special(Special::Separator(Separator::Space)),
6683 },
6684 PositionalToken {
6685 source: uws,
6686 offset: 217,
6687 length: 9,
6688 token: Token::Word(Word::Word("온라인".to_string())),
6689 },
6690 PositionalToken {
6691 source: uws,
6692 offset: 226,
6693 length: 1,
6694 token: Token::Special(Special::Separator(Separator::Space)),
6695 },
6696 PositionalToken {
6697 source: uws,
6698 offset: 227,
6699 length: 9,
6700 token: Token::Word(Word::Word("플레이".to_string())),
6701 },
6702 PositionalToken {
6703 source: uws,
6704 offset: 236,
6705 length: 1,
6706 token: Token::Special(Special::Separator(Separator::Space)),
6707 },
6708 PositionalToken {
6709 source: uws,
6710 offset: 237,
6711 length: 3,
6712 token: Token::Word(Word::Word("기".to_string())),
6713 },
6714 ],
6715 Lang::Ara => vec![
6716 PositionalToken {
6717 source: uws,
6718 offset: 0,
6719 length: 14,
6720 token: Token::Word(Word::Word("لشکرکشی".to_string())),
6721 },
6722 PositionalToken {
6723 source: uws,
6724 offset: 14,
6725 length: 3,
6726 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6727 },
6728 PositionalToken {
6729 source: uws,
6730 offset: 17,
6731 length: 6,
6732 token: Token::Word(Word::Word("های".to_string())),
6733 },
6734 PositionalToken {
6735 source: uws,
6736 offset: 23,
6737 length: 1,
6738 token: Token::Special(Special::Separator(Separator::Space)),
6739 },
6740 PositionalToken {
6741 source: uws,
6742 offset: 24,
6743 length: 6,
6744 token: Token::Word(Word::Word("روس".to_string())),
6745 },
6746 PositionalToken {
6747 source: uws,
6748 offset: 30,
6749 length: 3,
6750 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6751 },
6752 PositionalToken {
6753 source: uws,
6754 offset: 33,
6755 length: 6,
6756 token: Token::Word(Word::Word("های".to_string())),
6757 },
6758 PositionalToken {
6759 source: uws,
6760 offset: 39,
6761 length: 1,
6762 token: Token::Special(Special::Separator(Separator::Space)),
6763 },
6764 PositionalToken {
6765 source: uws,
6766 offset: 40,
6767 length: 12,
6768 token: Token::Word(Word::Word("وارنگی".to_string())),
6769 },
6770 PositionalToken {
6771 source: uws,
6772 offset: 52,
6773 length: 1,
6774 token: Token::Special(Special::Separator(Separator::Space)),
6775 },
6776 PositionalToken {
6777 source: uws,
6778 offset: 53,
6779 length: 4,
6780 token: Token::Word(Word::Word("به".to_string())),
6781 },
6782 PositionalToken {
6783 source: uws,
6784 offset: 57,
6785 length: 1,
6786 token: Token::Special(Special::Separator(Separator::Space)),
6787 },
6788 PositionalToken {
6789 source: uws,
6790 offset: 58,
6791 length: 10,
6792 token: Token::Word(Word::Word("دریای".to_string())),
6793 },
6794 PositionalToken {
6795 source: uws,
6796 offset: 68,
6797 length: 1,
6798 token: Token::Special(Special::Separator(Separator::Space)),
6799 },
6800 PositionalToken {
6801 source: uws,
6802 offset: 69,
6803 length: 6,
6804 token: Token::Word(Word::Word("خزر".to_string())),
6805 },
6806 PositionalToken {
6807 source: uws,
6808 offset: 75,
6809 length: 1,
6810 token: Token::Special(Special::Separator(Separator::Space)),
6811 },
6812 PositionalToken {
6813 source: uws,
6814 offset: 76,
6815 length: 12,
6816 token: Token::Word(Word::Word("مجموعه".to_string())),
6817 },
6818 PositionalToken {
6819 source: uws,
6820 offset: 88,
6821 length: 3,
6822 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6823 },
6824 PositionalToken {
6825 source: uws,
6826 offset: 91,
6827 length: 4,
6828 token: Token::Word(Word::Word("ای".to_string())),
6829 },
6830 PositionalToken {
6831 source: uws,
6832 offset: 95,
6833 length: 1,
6834 token: Token::Special(Special::Separator(Separator::Space)),
6835 },
6836 PositionalToken {
6837 source: uws,
6838 offset: 96,
6839 length: 4,
6840 token: Token::Word(Word::Word("از".to_string())),
6841 },
6842 PositionalToken {
6843 source: uws,
6844 offset: 100,
6845 length: 1,
6846 token: Token::Special(Special::Separator(Separator::Space)),
6847 },
6848 PositionalToken {
6849 source: uws,
6850 offset: 101,
6851 length: 10,
6852 token: Token::Word(Word::Word("حملات".to_string())),
6853 },
6854 PositionalToken {
6855 source: uws,
6856 offset: 111,
6857 length: 1,
6858 token: Token::Special(Special::Separator(Separator::Space)),
6859 },
6860 PositionalToken {
6861 source: uws,
6862 offset: 112,
6863 length: 10,
6864 token: Token::Word(Word::Word("نظامی".to_string())),
6865 },
6866 PositionalToken {
6867 source: uws,
6868 offset: 122,
6869 length: 1,
6870 token: Token::Special(Special::Separator(Separator::Space)),
6871 },
6872 PositionalToken {
6873 source: uws,
6874 offset: 123,
6875 length: 4,
6876 token: Token::Word(Word::Word("در".to_string())),
6877 },
6878 PositionalToken {
6879 source: uws,
6880 offset: 127,
6881 length: 1,
6882 token: Token::Special(Special::Separator(Separator::Space)),
6883 },
6884 PositionalToken {
6885 source: uws,
6886 offset: 128,
6887 length: 6,
6888 token: Token::Word(Word::Word("بین".to_string())),
6889 },
6890 PositionalToken {
6891 source: uws,
6892 offset: 134,
6893 length: 1,
6894 token: Token::Special(Special::Separator(Separator::Space)),
6895 },
6896 PositionalToken {
6897 source: uws,
6898 offset: 135,
6899 length: 6,
6900 token: Token::Word(Word::Word("سال".to_string())),
6901 },
6902 PositionalToken {
6903 source: uws,
6904 offset: 141,
6905 length: 3,
6906 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6907 },
6908 PositionalToken {
6909 source: uws,
6910 offset: 144,
6911 length: 6,
6912 token: Token::Word(Word::Word("های".to_string())),
6913 },
6914 PositionalToken {
6915 source: uws,
6916 offset: 150,
6917 length: 1,
6918 token: Token::Special(Special::Separator(Separator::Space)),
6919 },
6920 PositionalToken {
6921 source: uws,
6922 offset: 151,
6923 length: 6,
6924 token: Token::Word(Word::StrangeWord("۸۶۴".to_string())),
6925 },
6926 PositionalToken {
6927 source: uws,
6928 offset: 157,
6929 length: 1,
6930 token: Token::Special(Special::Separator(Separator::Space)),
6931 },
6932 PositionalToken {
6933 source: uws,
6934 offset: 158,
6935 length: 4,
6936 token: Token::Word(Word::Word("تا".to_string())),
6937 },
6938 PositionalToken {
6939 source: uws,
6940 offset: 162,
6941 length: 1,
6942 token: Token::Special(Special::Separator(Separator::Space)),
6943 },
6944 PositionalToken {
6945 source: uws,
6946 offset: 163,
6947 length: 8,
6948 token: Token::Word(Word::StrangeWord("۱۰۴۱".to_string())),
6949 },
6950 PositionalToken {
6951 source: uws,
6952 offset: 171,
6953 length: 1,
6954 token: Token::Special(Special::Separator(Separator::Space)),
6955 },
6956 PositionalToken {
6957 source: uws,
6958 offset: 172,
6959 length: 12,
6960 token: Token::Word(Word::Word("میلادی".to_string())),
6961 },
6962 PositionalToken {
6963 source: uws,
6964 offset: 184,
6965 length: 1,
6966 token: Token::Special(Special::Separator(Separator::Space)),
6967 },
6968 PositionalToken {
6969 source: uws,
6970 offset: 185,
6971 length: 2,
6972 token: Token::Word(Word::Word("ب".to_string())),
6973 },
6974 ],
6975 Lang::Ell => vec![
6976 PositionalToken {
6977 source: uws,
6978 offset: 0,
6979 length: 4,
6980 token: Token::Word(Word::Word("Το".to_string())),
6981 },
6982 PositionalToken {
6983 source: uws,
6984 offset: 4,
6985 length: 1,
6986 token: Token::Special(Special::Separator(Separator::Space)),
6987 },
6988 PositionalToken {
6989 source: uws,
6990 offset: 5,
6991 length: 18,
6992 token: Token::Word(Word::Word("Πρόγραμμα".to_string())),
6993 },
6994 PositionalToken {
6995 source: uws,
6996 offset: 23,
6997 length: 1,
6998 token: Token::Special(Special::Separator(Separator::Space)),
6999 },
7000 PositionalToken {
7001 source: uws,
7002 offset: 24,
7003 length: 22,
7004 token: Token::Word(Word::Word("υλοποιείται".to_string())),
7005 },
7006 PositionalToken {
7007 source: uws,
7008 offset: 46,
7009 length: 1,
7010 token: Token::Special(Special::Separator(Separator::Space)),
7011 },
7012 PositionalToken {
7013 source: uws,
7014 offset: 47,
7015 length: 4,
7016 token: Token::Word(Word::Word("εξ".to_string())),
7017 },
7018 PositionalToken {
7019 source: uws,
7020 offset: 51,
7021 length: 1,
7022 token: Token::Special(Special::Separator(Separator::Space)),
7023 },
7024 PositionalToken {
7025 source: uws,
7026 offset: 52,
7027 length: 18,
7028 token: Token::Word(Word::Word("ολοκλήρου".to_string())),
7029 },
7030 PositionalToken {
7031 source: uws,
7032 offset: 70,
7033 length: 1,
7034 token: Token::Special(Special::Separator(Separator::Space)),
7035 },
7036 PositionalToken {
7037 source: uws,
7038 offset: 71,
7039 length: 6,
7040 token: Token::Word(Word::Word("από".to_string())),
7041 },
7042 PositionalToken {
7043 source: uws,
7044 offset: 77,
7045 length: 1,
7046 token: Token::Special(Special::Separator(Separator::Space)),
7047 },
7048 PositionalToken {
7049 source: uws,
7050 offset: 78,
7051 length: 16,
7052 token: Token::Word(Word::Word("απόσταση".to_string())),
7053 },
7054 PositionalToken {
7055 source: uws,
7056 offset: 94,
7057 length: 1,
7058 token: Token::Special(Special::Separator(Separator::Space)),
7059 },
7060 PositionalToken {
7061 source: uws,
7062 offset: 95,
7063 length: 6,
7064 token: Token::Word(Word::Word("και".to_string())),
7065 },
7066 PositionalToken {
7067 source: uws,
7068 offset: 101,
7069 length: 1,
7070 token: Token::Special(Special::Separator(Separator::Space)),
7071 },
7072 PositionalToken {
7073 source: uws,
7074 offset: 102,
7075 length: 12,
7076 token: Token::Word(Word::Word("μπορεί".to_string())),
7077 },
7078 PositionalToken {
7079 source: uws,
7080 offset: 114,
7081 length: 1,
7082 token: Token::Special(Special::Separator(Separator::Space)),
7083 },
7084 PositionalToken {
7085 source: uws,
7086 offset: 115,
7087 length: 4,
7088 token: Token::Word(Word::Word("να".to_string())),
7089 },
7090 PositionalToken {
7091 source: uws,
7092 offset: 119,
7093 length: 1,
7094 token: Token::Special(Special::Separator(Separator::Space)),
7095 },
7096 PositionalToken {
7097 source: uws,
7098 offset: 120,
7099 length: 20,
7100 token: Token::Word(Word::Word("συμμετέχει".to_string())),
7101 },
7102 PositionalToken {
7103 source: uws,
7104 offset: 140,
7105 length: 1,
7106 token: Token::Special(Special::Separator(Separator::Space)),
7107 },
7108 PositionalToken {
7109 source: uws,
7110 offset: 141,
7111 length: 8,
7112 token: Token::Word(Word::Word("κάθε".to_string())),
7113 },
7114 PositionalToken {
7115 source: uws,
7116 offset: 149,
7117 length: 1,
7118 token: Token::Special(Special::Separator(Separator::Space)),
7119 },
7120 PositionalToken {
7121 source: uws,
7122 offset: 150,
7123 length: 24,
7124 token: Token::Word(Word::Word("εμπλεκόμενος".to_string())),
7125 },
7126 PositionalToken {
7127 source: uws,
7128 offset: 174,
7129 length: 1,
7130 token: Token::Special(Special::Separator(Separator::Space)),
7131 },
7132 PositionalToken {
7133 source: uws,
7134 offset: 175,
7135 length: 6,
7136 token: Token::Word(Word::Word("στη".to_string())),
7137 },
7138 PositionalToken {
7139 source: uws,
7140 offset: 181,
7141 length: 1,
7142 token: Token::Special(Special::Separator(Separator::Space)),
7143 },
7144 PositionalToken {
7145 source: uws,
7146 offset: 182,
7147 length: 2,
7148 token: Token::Word(Word::Word("ή".to_string())),
7149 },
7150 PositionalToken {
7151 source: uws,
7152 offset: 184,
7153 length: 1,
7154 token: Token::Special(Special::Punctuation('/')),
7155 },
7156 ],
7157 };
7158 (
7159 uws.chars()
7160 .take(100)
7161 .fold(String::new(), |acc, c| acc + &format!("{}", c)),
7162 tokens,
7163 )
7164 }
7165}