1use std::sync::Arc;
2use text_parsing::{Breaker, IntoSource, Local, Localize, Snip, Source, SourceEvent};
3
4mod emoji;
5pub use emoji::EMOJIMAP;
6
7mod breakers;
8pub use breakers::{SentenceBreaker, UnicodeSentenceBreaker};
9
10mod wordbreaker;
11
12mod options;
13pub use options::{IntoTokenizer, TokenizerOptions, TokenizerParams};
14
15mod tokens;
16pub use tokens::Tokens;
17
18mod text_tokens;
19use text_tokens::InnerBound;
20pub use text_tokens::TextTokens;
21
/// Library-level error type.
#[derive(Debug)]
pub enum Error {
    /// Wraps an error reported by the underlying `text_parsing` source.
    TextParser(text_parsing::Error),
}

/// Tolerance used when comparing `Number` values as `f64` (see `impl Ord for Number`).
const EPS: f64 = 1e-8;
28
/// A parsed numeric token (feature `strings`).
///
/// `Copy` removed from the derive list: `ZeroInteger` carries a `String`
/// payload, and an enum with a non-`Copy` field cannot derive `Copy` (E0204).
/// The payload-free `cfg(not(feature = "strings"))` counterpart below remains `Copy`.
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd)]
pub enum Number {
    Integer(i64),
    Float(f64),
    /// An integer written with leading zeros; `s` preserves the original spelling.
    ZeroInteger { i: i64, s: String },
}
37
/// A parsed numeric token (payload-free build).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
pub enum Number {
    Integer(i64),
    Float(f64),
    /// An integer written with leading zeros (spelling not retained in this build).
    ZeroInteger { i: i64 },
}
45
46impl Number {
47 pub fn as_f64(&self) -> f64 {
48 match self {
49 Number::Integer(i) => *i as f64,
50 Number::Float(f) => *f,
51 Number::ZeroInteger { i, .. } => *i as f64,
52 }
53 }
54}
55impl Ord for Number {
56 fn cmp(&self, other: &Number) -> std::cmp::Ordering {
57 let s = self.as_f64();
58 let o = other.as_f64();
59 let d = s - o;
60 match d.abs() < EPS {
61 true => std::cmp::Ordering::Equal,
62 false => {
63 if d > 0.0 {
64 return std::cmp::Ordering::Greater;
65 }
66 if d < 0.0 {
67 return std::cmp::Ordering::Less;
68 }
69 std::cmp::Ordering::Equal
70 }
71 }
72 }
73}
74impl Eq for Number {}
75
/// Whitespace-like separator classes.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
pub enum Separator {
    Space,
    Tab,
    Newline,
    /// Any other separator character, kept verbatim.
    Char(char),
}
83
/// Non-printing formatting characters recognized by the tokenizer.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
pub enum Formatter {
    /// A formatting character, kept verbatim.
    Char(char),
    /// Joiner formatter — presumably the zero-width joiner; confirm in `wordbreaker`.
    Joiner,
}
89
/// Non-word tokens: punctuation, symbols, and separators.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Ord, PartialOrd)]
pub enum Special {
    Punctuation(char),
    Symbol(char),
    Separator(Separator),
}
96
/// Word-like tokens carrying their text (feature `strings`).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Word {
    /// A plain word.
    Word(String),
    /// A word that doesn't fully match the plain-word rules (e.g. "L’Oreal"
    /// or soft-hyphenated Cyrillic — see the tests below).
    StrangeWord(String),
    /// A numeric-flavored compound (see `Numerical`).
    Numerical(Numerical),
    /// A fully parsed number.
    Number(Number),
    /// An emoji, as a static string — presumably a value from `EMOJIMAP`; confirm there.
    Emoji(&'static str),
}
106
/// Numeric-flavored compound tokens (feature `strings`).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub enum Numerical {
    /// Presumably dot-separated digit groups (version-like strings) — confirm in tokenizer.
    DotSeparated(String),
    /// Digits followed by letters, e.g. "4pda" (see the `char_tokens` test).
    Measures(String),
    /// Mixed letters and digits, e.g. "hashtag2" (see the `hashtags` test).
    Alphanumeric(String),
}
117
/// Structured social-text tokens (feature `strings`).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Struct {
    /// Hashtag content without the leading '#' (see the `char_tokens` test: "#36.6" → "36.6").
    Hashtag(String),
    /// Mention content — presumably without the leading '@'; confirm in tokenizer.
    Mention(String),
}
125
/// Remaining unicode content (feature `strings`).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Unicode {
    /// A run of otherwise unclassified unicode characters.
    String(String),
    /// A formatting-character token.
    Formatter(Formatter),
}
132
/// Payload-free counterpart of the strings-build `Word`.
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Word {
    Word,
    StrangeWord,
    Numerical(Numerical),
    Number(Number),
    /// Emoji name is static, so it is retained even in this build.
    Emoji(&'static str),
}
142
/// Payload-free counterpart of the strings-build `Numerical`.
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Numerical {
    DotSeparated,
    Measures,
    Alphanumeric,
}
153
/// Payload-free counterpart of the strings-build `Struct`.
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Struct {
    Hashtag,
    Mention,
}
161
/// Payload-free counterpart of the strings-build `Unicode`.
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Unicode {
    String,
    Formatter(Formatter),
}
168
/// Top-level token produced by the tokenizer (feature `strings`).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),
}
177
/// Top-level token produced by the tokenizer (payload-free build).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),
}
186
/// Borrowed analogue of `Text`: keeps a `&str` instead of owning a buffer.
#[derive(Debug)]
pub struct TextStr<'s> {
    // The analyzed slice itself serves as the buffer.
    buffer: &'s str,
    // Per-char buffer→original mapping, shared via Arc.
    localities: Arc<Vec<TextLocality>>,
    // Word/sentence-level bounds discovered while scanning.
    breakers: Arc<Vec<InnerBound>>,
}
206impl<'s> TextStr<'s> {
207 pub fn new<'a>(s: &'a str) -> Result<TextStr<'a>, Error> {
208 let text = inner_new(s.into_source(), false)?;
209 Ok(TextStr {
210 buffer: s,
211 localities: text.localities,
212 breakers: text.breakers,
213 })
214 }
215}
216
/// Shared constructor behind `Text::new`, `TextStr::new` and the `TryFrom` impls.
///
/// Walks `source` char by char and builds:
/// * `localities` — one entry per kept char, pairing its buffer position
///   (chars + bytes) with its position in the original source;
/// * `breakers` — word/sentence/paragraph/section bounds with their buffer snips;
/// * the normalized buffer itself, but only when `with_buffer` is true.
fn inner_new<S: Source>(mut source: S, with_buffer: bool) -> Result<Text, Error> {
    let mut buffer = String::new();
    let mut localities = Vec::new();
    let mut breakers = Vec::new();
    // Running byte length of the (possibly unmaterialized) buffer; tracked even
    // when `with_buffer` is false so byte snips stay consistent.
    let mut buffer_len = 0;

    while let Some(local_se) = source.next_char().map_err(Error::TextParser)? {
        let (local, se) = local_se.into_inner();
        let c = match se {
            SourceEvent::Char(c) => match c {
                // Normalize backtick (U+0060) to apostrophe (U+0027).
                '\u{0060}' => '\u{0027}',
                _ => c,
            },
            SourceEvent::Breaker(b) => {
                // Each breaker maps to a placeholder char; only word- and
                // sentence-level breakers are additionally recorded as bounds.
                let (c, opt_b) = match b {
                    Breaker::None => continue,
                    Breaker::Space => (' ', None),
                    Breaker::Line => ('\n', None),
                    Breaker::Word => ('\u{200B}', Some(b)),
                    Breaker::Sentence | Breaker::Paragraph | Breaker::Section => ('\n', Some(b)),
                };
                if let Some(b) = opt_b {
                    let br = InnerBound {
                        bytes: Snip {
                            offset: buffer_len,
                            length: c.len_utf8(),
                        },
                        chars: Snip {
                            offset: localities.len(),
                            length: 1,
                        },
                        breaker: b,
                        original: Some(local),
                    };
                    breakers.push(br);
                }
                c
            }
        };

        // Buffer-side locality of this char: chars snip first, bytes snip second.
        let buf_local = ().localize(
            Snip {
                offset: localities.len(),
                length: 1,
            },
            Snip {
                offset: buffer_len,
                length: c.len_utf8(),
            },
        );
        if with_buffer {
            buffer.push(c);
        }
        buffer_len += c.len_utf8();
        localities.push(TextLocality {
            buffer: buf_local,
            original: local,
        });
    }
    Ok(Text {
        buffer: Arc::new(buffer),
        localities: Arc::new(localities),
        breakers: Arc::new(breakers),
    })
}
285
/// Position of a single char both in the normalized buffer and in the original source.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub struct TextLocality {
    /// Where the char sits in the normalized buffer.
    pub buffer: Local<()>,
    /// Where the char came from in the original source.
    pub original: Local<()>,
}
291
/// Owned, normalized text plus the metadata needed to map tokens back to the
/// original source. All fields are `Arc`-shared so copies are cheap (`shared_text`).
#[derive(Debug)]
pub struct Text {
    buffer: Arc<String>,
    localities: Arc<Vec<TextLocality>>,
    breakers: Arc<Vec<InnerBound>>,
}
298impl Text {
299 pub fn new<S: Source>(source: S) -> Result<Text, Error> {
300 inner_new(source, true)
301 }
302 pub fn token_text<'s>(&'s self, token: &TextToken) -> &'s str {
303 let Snip {
304 offset: begin,
305 length: len,
306 } = token.locality.bytes();
307 let end = begin + len;
308 &self.buffer[begin..end]
309 }
310 pub fn text(&self) -> &str {
311 self.buffer.as_ref()
312 }
313 pub fn original_locality(&self, idx: usize) -> Option<Local<()>> {
314 self.localities.get(idx).map(|tl| tl.original)
315 }
316 pub fn localities(&self) -> &Vec<TextLocality> {
317 self.localities.as_ref()
318 }
319 pub fn shared_text(&self) -> Text {
320 Text {
321 buffer: self.buffer.clone(),
322 localities: self.localities.clone(),
323 breakers: self.breakers.clone(),
324 }
325 }
326}
327
impl TryFrom<String> for Text {
    type Error = Error;

    /// Builds a `Text` that reuses `s` as its buffer instead of re-materializing it
    /// (`inner_new` is run with `with_buffer = false`, so its empty buffer is replaced).
    // NOTE(review): `inner_new` maps '`' to '\'' while computing localities, but the
    // buffer stored here is the untouched input — so `text()` can differ from
    // `Text::new(s.into_source())` for backtick-containing input. Confirm this
    // asymmetry is intended (byte offsets still line up, both chars are 1 byte).
    fn try_from(s: String) -> Result<Text, Error> {
        let mut text = inner_new((&s).into_source(), false)?;
        text.buffer = Arc::new(s);
        Ok(text)
    }
}
337
338impl TryFrom<&str> for Text {
339 type Error = Error;
340
341 fn try_from(s: &str) -> Result<Text, Error> {
342 Text::new(s.into_source())
343 }
344}
345
/// Sentence-level bounds surfaced as tokens (see `Token2::Bound`).
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Bound {
    Sentence,
    Paragraph,
    Section,
}
352
/// A token tied to its position in the normalized buffer and, when available,
/// in the original source (feature `strings`).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub struct TextToken {
    // Position within the normalized `Text` buffer.
    locality: Local<()>,
    // Position within the original source, if the token maps back to one.
    original: Option<Local<()>>,
    pub token: Token2,
}
360
/// Payload-free counterpart of the strings-build `TextToken` (adds `Copy`).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub struct TextToken {
    // Position within the normalized `Text` buffer.
    locality: Local<()>,
    // Position within the original source, if the token maps back to one.
    original: Option<Local<()>>,
    pub token: Token2,
}
368
#[cfg(test)]
impl TextToken {
    /// Test-only helper: recover the original-located plain `Token`, dropping
    /// bound markers and tokens with no original location.
    fn into_original_token_1(self) -> Option<Local<Token>> {
        let original = self.original?;
        let token = self.token.into_token()?;
        Some(original.local(token))
    }
}
378
379impl TextToken {
380 pub fn local(&self) -> Local<()> {
381 self.locality
382 }
383 pub fn original(&self) -> Option<Local<()>> {
384 self.original
385 }
386 pub fn into_position(mut self) -> TextToken {
387 self.locality = self.locality.into_position();
388 self.original = self.original.map(|or| or.into_position());
389 self
390 }
391 pub fn try_as_token(&self) -> Result<Token, Bound> {
392 self.token.try_as_token()
393 }
394 pub fn as_original_token(&self) -> Option<Local<&Token2>> {
395 self.original.map(|original| original.local(&self.token))
396 }
397 pub fn into_original_token(self) -> Option<Local<Token2>> {
398 self.original.map(|original| original.local(self.token))
399 }
400 pub fn original_str<'s>(&self, original: &'s str) -> Result<&'s str, OriginalError> {
401 match self.original {
402 Some(local) => {
403 let Snip {
404 offset: begin,
405 length: len,
406 } = local.bytes();
407 let end = begin + len;
408 match original.get(begin..end) {
409 Some(s) => Ok(s),
410 None => Err(OriginalError::InvalidSnip),
411 }
412 }
413 None => Err(OriginalError::NoOriginal),
414 }
415 }
416
417 pub fn test_token(lt: Local<Token2>) -> TextToken {
418 let (local, token) = lt.into_inner();
419 TextToken {
420 locality: local,
421 original: Some(local.local(())),
422 token,
423 }
424 }
425 pub fn test_new(token: Token2, local: Local<()>, original: Option<Local<()>>) -> TextToken {
426 TextToken {
427 locality: local,
428 original,
429 token,
430 }
431 }
432}
433
/// Failure modes of `TextToken::original_str`.
#[derive(Debug)]
pub enum OriginalError {
    /// The token has no original-source location.
    NoOriginal,
    /// The recorded snip does not address a valid range of the given string.
    InvalidSnip,
}
465
/// `Token` extended with sentence-level bounds (feature `strings`).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token2 {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),

    /// A sentence/paragraph/section bound, not a real token.
    Bound(Bound),
}
/// `Token` extended with sentence-level bounds (payload-free build, `Copy`).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token2 {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),

    /// A sentence/paragraph/section bound, not a real token.
    Bound(Bound),
}
/// Plain tokens embed into `Token2` unchanged; `Token2` only adds `Bound` on top.
impl From<Token> for Token2 {
    fn from(t: Token) -> Token2 {
        match t {
            Token::Word(w) => Token2::Word(w),
            Token::Struct(s) => Token2::Struct(s),
            Token::Special(s) => Token2::Special(s),
            Token::Unicode(u) => Token2::Unicode(u),
        }
    }
}
impl Token2 {
    /// Borrowing conversion; the payload-free build can simply copy itself.
    #[cfg(not(feature = "strings"))]
    fn try_as_token(&self) -> Result<Token, Bound> {
        (*self).try_into_token()
    }

    /// Borrowing conversion; `String` payloads force a clone in this build.
    #[cfg(feature = "strings")]
    fn try_as_token(&self) -> Result<Token, Bound> {
        self.clone().try_into_token()
    }

    /// Splits a `Token2` into either the plain token or the bound it carries.
    fn try_into_token(self) -> Result<Token, Bound> {
        match self {
            Token2::Word(w) => Ok(Token::Word(w)),
            Token2::Struct(s) => Ok(Token::Struct(s)),
            Token2::Special(s) => Ok(Token::Special(s)),
            Token2::Unicode(u) => Ok(Token::Unicode(u)),
            Token2::Bound(b) => Err(b),
        }
    }
}
#[cfg(test)]
impl Token2 {
    /// Test helper: the `Option` view of `try_into_token` — bounds map to `None`.
    fn into_token(self) -> Option<Token> {
        self.try_into_token().ok()
    }
}
536
#[cfg(test)]
mod test_v0_5 {
    use super::*;
    use text_parsing::{entities, tagger, IntoPipeParser, IntoSource, ParserExt, SourceExt};

    // NOTE(review): this is a manual debugging harness, not a real test — it has
    // no `#[test]` attribute (so the harness never runs it) and unconditionally
    // ends in `panic!()`. It prints every token produced from an HTML-ish,
    // mixed-language sample. Confirm whether it should be deleted or promoted.
    fn basic() {
        let uws = "<p>Oxana Putan shared the quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc.</p><p> qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово</p>";
        // Strip HTML tags (as breakers) and decode entities before building the text.
        let text = Text::new({
            uws.into_source()
                .pipe(tagger::Builder::new().create().into_breaker())
                .pipe(entities::Builder::new().create().into_piped())
                .into_separator()
        })
        .unwrap();
        let lib_res = text
            .into_tokenizer({
                TokenizerParams::default()
                    .add_option(TokenizerOptions::SplitDot)
                    .add_option(TokenizerOptions::SplitUnderscore)
                    .add_option(TokenizerOptions::SplitColon)
                    .with_default_sentences()
            })
            .collect::<Vec<_>>();

        // Dump chars/bytes localities, the token, and the original slice it maps to.
        for tok in lib_res {
            println!(
                "C{:?}, B{:?}, {:?} -> {:?}",
                tok.original.map(|loc| loc.chars()),
                tok.original.map(|loc| loc.bytes()),
                tok.token,
                tok.original_str(uws)
            );
        }

        panic!()
    }
}
589
590#[cfg(test)]
591#[cfg(feature = "strings")]
592mod test {
593 use super::*;
594 use text_parsing::{
595 entities, tagger, IntoPipeParser, IntoSource, Localize, ParserExt, SourceExt,
596 };
597
    /// Expected token with explicit byte AND char coordinates, for sources where
    /// the two diverge (multi-byte chars).
    #[derive(Debug, Clone)]
    struct CharToken {
        byte_offset: usize,
        byte_length: usize,
        char_offset: usize,
        char_length: usize,
        token: Token,
    }
666 impl Into<Local<Token>> for CharToken {
667 fn into(self) -> Local<Token> {
668 self.token.localize(
669 Snip {
670 offset: self.char_offset,
671 length: self.char_length,
672 },
673 Snip {
674 offset: self.byte_offset,
675 length: self.byte_length,
676 },
677 )
678 }
679 }
680
    /// Expected token given only byte coordinates; char coordinates are derived
    /// from `source` on conversion.
    #[derive(Debug, Clone)]
    struct PositionalToken {
        source: &'static str,
        offset: usize,
        length: usize,
        token: Token,
    }
688 impl Into<Local<Token>> for PositionalToken {
689 fn into(self) -> Local<Token> {
690 self.token.localize(
691 Snip {
692 offset: self.source[..self.offset].chars().count(),
693 length: self.source[self.offset..self.offset + self.length]
694 .chars()
695 .count(),
696 },
697 Snip {
698 offset: self.offset,
699 length: self.length,
700 },
701 )
702 }
703 }
704
705 fn check_results(result: &Vec<PositionalToken>, lib_res: &Vec<Local<Token>>, _uws: &str) {
706 assert_eq!(result.len(), lib_res.len());
707 for i in 0..result.len() {
708 let res: Local<Token> = result[i].clone().into();
709 assert_eq!(res, lib_res[i]);
710 }
711 }
712
713 fn check_cresults(result: &Vec<CharToken>, lib_res: &Vec<Local<Token>>, _uws: &str) {
714 assert_eq!(result.len(), lib_res.len());
715 for i in 0..result.len() {
716 let res: Local<Token> = result[i].clone().into();
717 assert_eq!(res, lib_res[i]);
718 }
719 }
720
721 fn check<T: Clone + std::fmt::Debug + Into<Local<Token>>>(
722 res: &Vec<T>,
723 lib: &Vec<Local<Token>>,
724 _uws: &str,
725 ) {
726 let mut lib = lib.iter();
727 let mut res = res.iter().map(|r| {
728 let res: Local<Token> = r.clone().into();
729 res
730 });
731 let mut diff = Vec::new();
732 loop {
733 match (lib.next(), res.next()) {
734 (Some(lw), Some(rw)) => {
735 if *lw != rw {
736 diff.push(format!("LIB: {:?}", lw));
737 diff.push(format!("TEST: {:?}", rw));
738 diff.push("".to_string())
739 }
740 }
741 (Some(lw), None) => {
742 diff.push(format!("LIB: {:?}", lw));
743 diff.push("TEST: ----".to_string());
744 diff.push("".to_string())
745 }
746 (None, Some(rw)) => {
747 diff.push("LIB: ----".to_string());
748 diff.push(format!("TEST: {:?}", rw));
749 diff.push("".to_string())
750 }
751 (None, None) => break,
752 }
753 }
754 if diff.len() > 0 {
755 for ln in &diff {
756 println!("{}", ln);
757 }
758 panic!("Diff count: {}", diff.len() / 3);
759 }
760 }
761
762 #[test]
763 fn spaces() {
764 let uws = " spaces too many apces ";
765 let result = vec![
766 PositionalToken {
767 source: uws,
768 offset: 0,
769 length: 4,
770 token: Token::Special(Special::Separator(Separator::Space)),
771 },
772 PositionalToken {
773 source: uws,
774 offset: 4,
775 length: 6,
776 token: Token::Word(Word::Word("spaces".to_string())),
777 },
778 PositionalToken {
779 source: uws,
780 offset: 10,
781 length: 4,
782 token: Token::Special(Special::Separator(Separator::Space)),
783 },
784 PositionalToken {
785 source: uws,
786 offset: 14,
787 length: 3,
788 token: Token::Word(Word::Word("too".to_string())),
789 },
790 PositionalToken {
791 source: uws,
792 offset: 17,
793 length: 3,
794 token: Token::Special(Special::Separator(Separator::Space)),
795 },
796 PositionalToken {
797 source: uws,
798 offset: 20,
799 length: 4,
800 token: Token::Word(Word::Word("many".to_string())),
801 },
802 PositionalToken {
803 source: uws,
804 offset: 24,
805 length: 3,
806 token: Token::Special(Special::Separator(Separator::Space)),
807 },
808 PositionalToken {
809 source: uws,
810 offset: 27,
811 length: 5,
812 token: Token::Word(Word::Word("apces".to_string())),
813 },
814 PositionalToken {
815 source: uws,
816 offset: 32,
817 length: 3,
818 token: Token::Special(Special::Separator(Separator::Space)),
819 },
820 ];
821 let lib_res = uws
822 .into_tokenizer(TokenizerParams::v1())
823 .collect::<Vec<_>>();
824 check_results(&result, &lib_res, uws);
825 }
827
828 #[test]
829 fn numbers() {
830 let uws = "(() -2\n() -2";
831 let result = vec![
832 PositionalToken {
833 source: uws,
834 offset: 0,
835 length: 1,
836 token: Token::Special(Special::Punctuation('(')),
837 },
838 PositionalToken {
839 source: uws,
840 offset: 1,
841 length: 1,
842 token: Token::Special(Special::Punctuation('(')),
843 },
844 PositionalToken {
845 source: uws,
846 offset: 2,
847 length: 1,
848 token: Token::Special(Special::Punctuation(')')),
849 },
850 PositionalToken {
851 source: uws,
852 offset: 3,
853 length: 1,
854 token: Token::Special(Special::Separator(Separator::Space)),
855 },
856 PositionalToken {
857 source: uws,
858 offset: 4,
859 length: 2,
860 token: Token::Word(Word::Number(Number::Integer(-2))),
861 },
862 PositionalToken {
863 source: uws,
864 offset: 6,
865 length: 1,
866 token: Token::Special(Special::Separator(Separator::Newline)),
867 },
868 PositionalToken {
869 source: uws,
870 offset: 7,
871 length: 1,
872 token: Token::Special(Special::Punctuation('(')),
873 },
874 PositionalToken {
875 source: uws,
876 offset: 8,
877 length: 1,
878 token: Token::Special(Special::Punctuation(')')),
879 },
880 PositionalToken {
881 source: uws,
882 offset: 9,
883 length: 2,
884 token: Token::Special(Special::Separator(Separator::Space)),
885 },
886 PositionalToken {
887 source: uws,
888 offset: 11,
889 length: 2,
890 token: Token::Word(Word::Number(Number::Integer(-2))),
891 },
892 ];
893 let lib_res = uws
894 .into_tokenizer({
895 TokenizerParams::default()
896 .add_option(TokenizerOptions::SplitDot)
897 .add_option(TokenizerOptions::SplitUnderscore)
898 .add_option(TokenizerOptions::SplitColon)
899 .add_option(TokenizerOptions::MergeWhites)
900 })
901 .collect::<Vec<_>>();
902 check_results(&result, &lib_res, uws);
903 }
904
    #[test]
    fn word_with_inner_hyphens() {
        // NOTE(review): the byte lengths below (14 for 12-byte "Опросы", 28 for
        // 20-byte "показывают") imply the literal contains invisible 2-byte chars
        // (presumably U+00AD soft hyphens, per the test name) that do not survive
        // rendering/copy-paste — verify the literal's raw bytes before editing.
        let uws = "Опросы показывают";
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 14,
                token: Token::Word(Word::StrangeWord("Опросы".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 14,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 15,
                length: 28,
                token: Token::Word(Word::StrangeWord("показывают".to_string())),
            },
        ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
933
934 #[test]
935 fn mixed_but_word() {
936 let uws = "L’Oreal";
937 let result = vec![PositionalToken {
938 source: uws,
939 offset: 0,
940 length: 9,
941 token: Token::Word(Word::StrangeWord("L’Oreal".to_string())),
942 }];
943 let lib_res = uws
944 .into_tokenizer(TokenizerParams::v1())
945 .collect::<Vec<_>>();
946 check_results(&result, &lib_res, uws);
947 }
948
949 #[test]
950 fn hashtags() {
951 let uws = "#hashtag#hashtag2";
952 let result = vec![
953 PositionalToken {
954 source: uws,
955 offset: 0,
956 length: 1,
957 token: Token::Special(Special::Punctuation('#')),
958 },
959 PositionalToken {
960 source: uws,
961 offset: 1,
962 length: 7,
963 token: Token::Word(Word::Word("hashtag".to_string())),
964 },
965 PositionalToken {
966 source: uws,
967 offset: 8,
968 length: 1,
969 token: Token::Special(Special::Punctuation('#')),
970 },
971 PositionalToken {
972 source: uws,
973 offset: 9,
974 length: 8,
975 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
976 "hashtag2".to_string(),
977 ))),
978 },
979 ];
980 let lib_res = uws
981 .into_tokenizer(TokenizerParams::v1())
982 .collect::<Vec<_>>();
983 check_results(&result, &lib_res, uws);
984 }
985
986 #[test]
987 fn apostrophe() {
988 let uws = "l'oreal; l\u{0060}oreal";
989 let result = vec![
990 PositionalToken {
991 source: uws,
992 offset: 0,
993 length: 7,
994 token: Token::Word(Word::Word("l'oreal".to_string())),
995 },
996 PositionalToken {
997 source: uws,
998 offset: 7,
999 length: 1,
1000 token: Token::Special(Special::Punctuation(';')),
1001 },
1002 PositionalToken {
1003 source: uws,
1004 offset: 8,
1005 length: 1,
1006 token: Token::Special(Special::Separator(Separator::Space)),
1007 },
1008 PositionalToken {
1009 source: uws,
1010 offset: 9,
1011 length: 7,
1012 token: Token::Word(Word::Word("l'oreal".to_string())),
1013 },
1014 ];
1015 let text = Text::new(uws.into_source()).unwrap();
1016 let lib_res = text
1017 .into_tokenizer(TokenizerParams::v1())
1018 .filter_map(|tt| tt.into_original_token_1())
1019 .collect::<Vec<_>>();
1020 check_results(&result, &lib_res, uws);
1021 }
1022
1023 #[test]
1024 fn char_tokens() {
1025 let uws = "[Oxana Putan|1712640565] shared the quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово";
1026 let result = vec![
1027 CharToken {
1028 byte_offset: 0,
1029 byte_length: 1,
1030 char_offset: 0,
1031 char_length: 1,
1032 token: Token::Special(Special::Punctuation('[')),
1033 },
1034 CharToken {
1035 byte_offset: 1,
1036 byte_length: 5,
1037 char_offset: 1,
1038 char_length: 5,
1039 token: Token::Word(Word::Word("Oxana".to_string())),
1040 },
1041 CharToken {
1042 byte_offset: 6,
1043 byte_length: 1,
1044 char_offset: 6,
1045 char_length: 1,
1046 token: Token::Special(Special::Separator(Separator::Space)),
1047 },
1048 CharToken {
1049 byte_offset: 7,
1050 byte_length: 5,
1051 char_offset: 7,
1052 char_length: 5,
1053 token: Token::Word(Word::Word("Putan".to_string())),
1054 },
1055 CharToken {
1056 byte_offset: 12,
1057 byte_length: 1,
1058 char_offset: 12,
1059 char_length: 1,
1060 token: Token::Special(Special::Punctuation('|')),
1061 },
1062 CharToken {
1063 byte_offset: 13,
1064 byte_length: 10,
1065 char_offset: 13,
1066 char_length: 10,
1067 token: Token::Word(Word::Number(Number::Integer(1712640565))),
1068 },
1069 CharToken {
1070 byte_offset: 23,
1071 byte_length: 1,
1072 char_offset: 23,
1073 char_length: 1,
1074 token: Token::Special(Special::Punctuation(']')),
1075 },
1076 CharToken {
1084 byte_offset: 24,
1085 byte_length: 1,
1086 char_offset: 24,
1087 char_length: 1,
1088 token: Token::Special(Special::Separator(Separator::Space)),
1089 },
1090 CharToken {
1091 byte_offset: 25,
1092 byte_length: 6,
1093 char_offset: 25,
1094 char_length: 6,
1095 token: Token::Word(Word::Word("shared".to_string())),
1096 },
1097 CharToken {
1098 byte_offset: 31,
1099 byte_length: 1,
1100 char_offset: 31,
1101 char_length: 1,
1102 token: Token::Special(Special::Separator(Separator::Space)),
1103 },
1104 CharToken {
1105 byte_offset: 32,
1106 byte_length: 3,
1107 char_offset: 32,
1108 char_length: 3,
1109 token: Token::Word(Word::Word("the".to_string())),
1110 },
1111 CharToken {
1112 byte_offset: 35,
1113 byte_length: 1,
1114 char_offset: 35,
1115 char_length: 1,
1116 token: Token::Special(Special::Separator(Separator::Space)),
1117 },
1118 CharToken {
1119 byte_offset: 36,
1120 byte_length: 5,
1121 char_offset: 36,
1122 char_length: 5,
1123 token: Token::Word(Word::Word("quick".to_string())),
1124 },
1125 CharToken {
1126 byte_offset: 41,
1127 byte_length: 1,
1128 char_offset: 41,
1129 char_length: 1,
1130 token: Token::Special(Special::Separator(Separator::Space)),
1131 },
1132 CharToken {
1133 byte_offset: 42,
1134 byte_length: 1,
1135 char_offset: 42,
1136 char_length: 1,
1137 token: Token::Special(Special::Punctuation('(')),
1138 },
1139 CharToken {
1140 byte_offset: 43,
1141 byte_length: 1,
1142 char_offset: 43,
1143 char_length: 1,
1144 token: Token::Special(Special::Punctuation('"')),
1145 },
1146 CharToken {
1147 byte_offset: 44,
1148 byte_length: 5,
1149 char_offset: 44,
1150 char_length: 5,
1151 token: Token::Word(Word::Word("brown".to_string())),
1152 },
1153 CharToken {
1154 byte_offset: 49,
1155 byte_length: 1,
1156 char_offset: 49,
1157 char_length: 1,
1158 token: Token::Special(Special::Punctuation('"')),
1159 },
1160 CharToken {
1161 byte_offset: 50,
1162 byte_length: 1,
1163 char_offset: 50,
1164 char_length: 1,
1165 token: Token::Special(Special::Punctuation(')')),
1166 },
1167 CharToken {
1168 byte_offset: 51,
1169 byte_length: 1,
1170 char_offset: 51,
1171 char_length: 1,
1172 token: Token::Special(Special::Separator(Separator::Space)),
1173 },
1174 CharToken {
1175 byte_offset: 52,
1176 byte_length: 3,
1177 char_offset: 52,
1178 char_length: 3,
1179 token: Token::Word(Word::Word("fox".to_string())),
1180 },
1181 CharToken {
1182 byte_offset: 55,
1183 byte_length: 1,
1184 char_offset: 55,
1185 char_length: 1,
1186 token: Token::Special(Special::Separator(Separator::Space)),
1187 },
1188 CharToken {
1189 byte_offset: 56,
1190 byte_length: 5,
1191 char_offset: 56,
1192 char_length: 5,
1193 token: Token::Word(Word::Word("can\'t".to_string())),
1194 },
1195 CharToken {
1196 byte_offset: 61,
1197 byte_length: 1,
1198 char_offset: 61,
1199 char_length: 1,
1200 token: Token::Special(Special::Separator(Separator::Space)),
1201 },
1202 CharToken {
1203 byte_offset: 62,
1204 byte_length: 4,
1205 char_offset: 62,
1206 char_length: 4,
1207 token: Token::Word(Word::Word("jump".to_string())),
1208 },
1209 CharToken {
1210 byte_offset: 66,
1211 byte_length: 1,
1212 char_offset: 66,
1213 char_length: 1,
1214 token: Token::Special(Special::Separator(Separator::Space)),
1215 },
1216 CharToken {
1217 byte_offset: 67,
1218 byte_length: 4,
1219 char_offset: 67,
1220 char_length: 4,
1221 token: Token::Word(Word::Number(Number::Float(32.3))),
1222 },
1223 CharToken {
1224 byte_offset: 71,
1225 byte_length: 1,
1226 char_offset: 71,
1227 char_length: 1,
1228 token: Token::Special(Special::Separator(Separator::Space)),
1229 },
1230 CharToken {
1231 byte_offset: 72,
1232 byte_length: 4,
1233 char_offset: 72,
1234 char_length: 4,
1235 token: Token::Word(Word::Word("feet".to_string())),
1236 },
1237 CharToken {
1238 byte_offset: 76,
1239 byte_length: 1,
1240 char_offset: 76,
1241 char_length: 1,
1242 token: Token::Special(Special::Punctuation(',')),
1243 },
1244 CharToken {
1245 byte_offset: 77,
1246 byte_length: 1,
1247 char_offset: 77,
1248 char_length: 1,
1249 token: Token::Special(Special::Separator(Separator::Space)),
1250 },
1251 CharToken {
1252 byte_offset: 78,
1253 byte_length: 5,
1254 char_offset: 78,
1255 char_length: 5,
1256 token: Token::Word(Word::Word("right".to_string())),
1257 },
1258 CharToken {
1259 byte_offset: 83,
1260 byte_length: 1,
1261 char_offset: 83,
1262 char_length: 1,
1263 token: Token::Special(Special::Punctuation('?')),
1264 },
1265 CharToken {
1266 byte_offset: 84,
1267 byte_length: 1,
1268 char_offset: 84,
1269 char_length: 1,
1270 token: Token::Special(Special::Separator(Separator::Space)),
1271 },
1272 CharToken {
1273 byte_offset: 85,
1274 byte_length: 4,
1275 char_offset: 85,
1276 char_length: 4,
1277 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
1278 },
1279 CharToken {
1280 byte_offset: 89,
1281 byte_length: 1,
1282 char_offset: 89,
1283 char_length: 1,
1284 token: Token::Special(Special::Separator(Separator::Space)),
1285 },
1286 CharToken {
1287 byte_offset: 90,
1288 byte_length: 3,
1289 char_offset: 90,
1290 char_length: 3,
1291 token: Token::Word(Word::Word("etc".to_string())),
1292 },
1293 CharToken {
1294 byte_offset: 93,
1295 byte_length: 1,
1296 char_offset: 93,
1297 char_length: 1,
1298 token: Token::Special(Special::Punctuation('.')),
1299 },
1300 CharToken {
1301 byte_offset: 94,
1302 byte_length: 1,
1303 char_offset: 94,
1304 char_length: 1,
1305 token: Token::Special(Special::Separator(Separator::Space)),
1306 },
1307 CharToken {
1308 byte_offset: 95,
1309 byte_length: 3,
1310 char_offset: 95,
1311 char_length: 3,
1312 token: Token::Word(Word::Word("qeq".to_string())),
1313 },
1314 CharToken {
1315 byte_offset: 98,
1316 byte_length: 1,
1317 char_offset: 98,
1318 char_length: 1,
1319 token: Token::Special(Special::Separator(Separator::Space)),
1320 },
1321 CharToken {
1322 byte_offset: 99,
1323 byte_length: 5,
1324 char_offset: 99,
1325 char_length: 5,
1326 token: Token::Word(Word::Word("U.S.A".to_string())),
1327 },
1328 CharToken {
1329 byte_offset: 104,
1330 byte_length: 2,
1331 char_offset: 104,
1332 char_length: 2,
1333 token: Token::Special(Special::Separator(Separator::Space)),
1334 },
1335 CharToken {
1336 byte_offset: 106,
1337 byte_length: 3,
1338 char_offset: 106,
1339 char_length: 3,
1340 token: Token::Word(Word::Word("asd".to_string())),
1341 },
1342 CharToken {
1343 byte_offset: 109,
1344 byte_length: 3,
1345 char_offset: 109,
1346 char_length: 3,
1347 token: Token::Special(Special::Separator(Separator::Newline)),
1348 },
1349 CharToken {
1350 byte_offset: 112,
1351 byte_length: 3,
1352 char_offset: 112,
1353 char_length: 3,
1354 token: Token::Word(Word::Word("Brr".to_string())),
1355 },
1356 CharToken {
1357 byte_offset: 115,
1358 byte_length: 1,
1359 char_offset: 115,
1360 char_length: 1,
1361 token: Token::Special(Special::Punctuation(',')),
1362 },
1363 CharToken {
1364 byte_offset: 116,
1365 byte_length: 1,
1366 char_offset: 116,
1367 char_length: 1,
1368 token: Token::Special(Special::Separator(Separator::Space)),
1369 },
1370 CharToken {
1371 byte_offset: 117,
1372 byte_length: 4,
1373 char_offset: 117,
1374 char_length: 4,
1375 token: Token::Word(Word::Word("it\'s".to_string())),
1376 },
1377 CharToken {
1378 byte_offset: 121,
1379 byte_length: 1,
1380 char_offset: 121,
1381 char_length: 1,
1382 token: Token::Special(Special::Separator(Separator::Space)),
1383 },
1384 CharToken {
1385 byte_offset: 122,
1386 byte_length: 4,
1387 char_offset: 122,
1388 char_length: 4,
1389 token: Token::Word(Word::Number(Number::Float(29.3))),
1390 },
1391 CharToken {
1392 byte_offset: 126,
1393 byte_length: 2,
1394 char_offset: 126,
1395 char_length: 1,
1396 token: Token::Special(Special::Symbol('°')),
1397 },
1398 CharToken {
1399 byte_offset: 128,
1400 byte_length: 1,
1401 char_offset: 127,
1402 char_length: 1,
1403 token: Token::Word(Word::Word("F".to_string())),
1404 },
1405 CharToken {
1406 byte_offset: 129,
1407 byte_length: 1,
1408 char_offset: 128,
1409 char_length: 1,
1410 token: Token::Special(Special::Punctuation('!')),
1411 },
1412 CharToken {
1413 byte_offset: 130,
1414 byte_length: 1,
1415 char_offset: 129,
1416 char_length: 1,
1417 token: Token::Special(Special::Separator(Separator::Newline)),
1418 },
1419 CharToken {
1420 byte_offset: 131,
1421 byte_length: 1,
1422 char_offset: 130,
1423 char_length: 1,
1424 token: Token::Special(Special::Separator(Separator::Space)),
1425 },
1426 CharToken {
1427 byte_offset: 132,
1428 byte_length: 14,
1429 char_offset: 131,
1430 char_length: 7,
1431 token: Token::Word(Word::Word("Русское".to_string())),
1432 },
1433 CharToken {
1434 byte_offset: 146,
1435 byte_length: 1,
1436 char_offset: 138,
1437 char_length: 1,
1438 token: Token::Special(Special::Separator(Separator::Space)),
1439 },
1440 CharToken {
1441 byte_offset: 147,
1442 byte_length: 22,
1443 char_offset: 139,
1444 char_length: 11,
1445 token: Token::Word(Word::Word("предложение".to_string())),
1446 },
1447 CharToken {
1448 byte_offset: 169,
1449 byte_length: 1,
1450 char_offset: 150,
1451 char_length: 1,
1452 token: Token::Special(Special::Separator(Separator::Space)),
1453 },
1454 CharToken {
1455 byte_offset: 170,
1456 byte_length: 5,
1457 char_offset: 151,
1458 char_length: 5,
1459 token: Token::Struct(Struct::Hashtag("36.6".to_string())),
1460 },
1461 CharToken {
1462 byte_offset: 175,
1463 byte_length: 1,
1464 char_offset: 156,
1465 char_length: 1,
1466 token: Token::Special(Special::Separator(Separator::Space)),
1467 },
1468 CharToken {
1469 byte_offset: 176,
1470 byte_length: 6,
1471 char_offset: 157,
1472 char_length: 3,
1473 token: Token::Word(Word::Word("для".to_string())),
1474 },
1475 CharToken {
1476 byte_offset: 182,
1477 byte_length: 1,
1478 char_offset: 160,
1479 char_length: 1,
1480 token: Token::Special(Special::Separator(Separator::Space)),
1481 },
1482 CharToken {
1483 byte_offset: 183,
1484 byte_length: 24,
1485 char_offset: 161,
1486 char_length: 12,
1487 token: Token::Word(Word::Word("тестирования".to_string())),
1488 },
1489 CharToken {
1490 byte_offset: 207,
1491 byte_length: 1,
1492 char_offset: 173,
1493 char_length: 1,
1494 token: Token::Special(Special::Separator(Separator::Space)),
1495 },
1496 CharToken {
1497 byte_offset: 208,
1498 byte_length: 14,
1499 char_offset: 174,
1500 char_length: 7,
1501 token: Token::Word(Word::Word("деления".to_string())),
1502 },
1503 CharToken {
1504 byte_offset: 222,
1505 byte_length: 1,
1506 char_offset: 181,
1507 char_length: 1,
1508 token: Token::Special(Special::Separator(Separator::Space)),
1509 },
1510 CharToken {
1511 byte_offset: 223,
1512 byte_length: 4,
1513 char_offset: 182,
1514 char_length: 2,
1515 token: Token::Word(Word::Word("по".to_string())),
1516 },
1517 CharToken {
1518 byte_offset: 227,
1519 byte_length: 1,
1520 char_offset: 184,
1521 char_length: 1,
1522 token: Token::Special(Special::Separator(Separator::Space)),
1523 },
1524 CharToken {
1525 byte_offset: 228,
1526 byte_length: 12,
1527 char_offset: 185,
1528 char_length: 6,
1529 token: Token::Word(Word::Word("юникод".to_string())),
1530 },
1531 CharToken {
1532 byte_offset: 240,
1533 byte_length: 1,
1534 char_offset: 191,
1535 char_length: 1,
1536 token: Token::Special(Special::Punctuation('-')),
1537 },
1538 CharToken {
1539 byte_offset: 241,
1540 byte_length: 12,
1541 char_offset: 192,
1542 char_length: 6,
1543 token: Token::Word(Word::Word("словам".to_string())),
1544 },
1545 CharToken {
1546 byte_offset: 253,
1547 byte_length: 3,
1548 char_offset: 198,
1549 char_length: 3,
1550 token: Token::Special(Special::Punctuation('.')),
1551 },
1552 CharToken {
1553 byte_offset: 256,
1554 byte_length: 1,
1555 char_offset: 201,
1556 char_length: 1,
1557 token: Token::Special(Special::Separator(Separator::Newline)),
1558 },
1559 CharToken {
1560 byte_offset: 257,
1561 byte_length: 8,
1562 char_offset: 202,
1563 char_length: 2,
1564 token: Token::Word(Word::Emoji("russia")),
1565 },
1566 CharToken {
1567 byte_offset: 265,
1568 byte_length: 1,
1569 char_offset: 204,
1570 char_length: 1,
1571 token: Token::Special(Special::Separator(Separator::Space)),
1572 },
1573 CharToken {
1574 byte_offset: 266,
1575 byte_length: 8,
1576 char_offset: 205,
1577 char_length: 2,
1578 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
1579 },
1580 CharToken {
1581 byte_offset: 274,
1582 byte_length: 1,
1583 char_offset: 207,
1584 char_length: 1,
1585 token: Token::Special(Special::Separator(Separator::Newline)),
1586 },
1587 CharToken {
1588 byte_offset: 275,
1589 byte_length: 8,
1590 char_offset: 208,
1591 char_length: 2,
1592 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
1593 },
1594 CharToken {
1595 byte_offset: 283,
1596 byte_length: 8,
1597 char_offset: 210,
1598 char_length: 2,
1599 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
1600 },
1601 CharToken {
1602 byte_offset: 291,
1603 byte_length: 8,
1604 char_offset: 212,
1605 char_length: 2,
1606 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
1607 },
1608 CharToken {
1609 byte_offset: 299,
1610 byte_length: 1,
1611 char_offset: 214,
1612 char_length: 1,
1613 token: Token::Special(Special::Separator(Separator::Newline)),
1614 },
1615 CharToken {
1616 byte_offset: 300,
1617 byte_length: 1,
1618 char_offset: 215,
1619 char_length: 1,
1620 token: Token::Special(Special::Punctuation('+')),
1621 },
1622 CharToken {
1623 byte_offset: 301,
1624 byte_length: 4,
1625 char_offset: 216,
1626 char_length: 4,
1627 token: Token::Word(Word::Word("Done".to_string())),
1628 },
1629 CharToken {
1630 byte_offset: 305,
1631 byte_length: 1,
1632 char_offset: 220,
1633 char_length: 1,
1634 token: Token::Special(Special::Punctuation('!')),
1635 },
1636 CharToken {
1637 byte_offset: 306,
1638 byte_length: 1,
1639 char_offset: 221,
1640 char_length: 1,
1641 token: Token::Special(Special::Separator(Separator::Space)),
1642 },
1643 CharToken {
1644 byte_offset: 307,
1645 byte_length: 12,
1646 char_offset: 222,
1647 char_length: 6,
1648 token: Token::Word(Word::Word("Готово".to_string())),
1649 },
1650 ];
1651
1652 let lib_res = uws
1653 .into_tokenizer(TokenizerParams::complex())
1654 .collect::<Vec<_>>();
1655
1656 check_cresults(&result, &lib_res, uws);
1658 }
1659
    #[test]
    fn general_default() {
        // End-to-end expectations for the `v1()` tokenizer parameters on a
        // mixed-script sample: English words, quoted text, a contraction,
        // floats, a measure-like token ("4pda"), the abbreviation "U.S.A",
        // the '°' symbol, and Russian words joined by '-' and ending in "...".
        //
        // Offsets/lengths below are in BYTES: each Cyrillic letter is 2 bytes
        // in UTF-8 (e.g. the 7-char "Русское" has length 14), and '°' is
        // 2 bytes. The literal contains two spaces between "U.S.A" and "asd"
        // and three consecutive newlines before "Brr" — the expected tokens
        // rely on that. Under `v1()`, runs of the same separator and of
        // repeated '.' collapse into one token (see offsets 72, 77, 221),
        // and "U.S.A" is split at the dots while "#36.6" stays punctuation
        // plus number (no hashtag token).
        let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A  asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 3,
                token: Token::Word(Word::Word("The".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 3,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 4,
                length: 5,
                token: Token::Word(Word::Word("quick".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 9,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 10,
                length: 1,
                token: Token::Special(Special::Punctuation('(')),
            },
            PositionalToken {
                source: uws,
                offset: 11,
                length: 1,
                token: Token::Special(Special::Punctuation('"')),
            },
            PositionalToken {
                source: uws,
                offset: 12,
                length: 5,
                token: Token::Word(Word::Word("brown".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 17,
                length: 1,
                token: Token::Special(Special::Punctuation('"')),
            },
            PositionalToken {
                source: uws,
                offset: 18,
                length: 1,
                token: Token::Special(Special::Punctuation(')')),
            },
            PositionalToken {
                source: uws,
                offset: 19,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 20,
                length: 3,
                token: Token::Word(Word::Word("fox".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 23,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // The apostrophe stays inside the word: "can't" is one token.
            PositionalToken {
                source: uws,
                offset: 24,
                length: 5,
                token: Token::Word(Word::Word("can\'t".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 29,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 30,
                length: 4,
                token: Token::Word(Word::Word("jump".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 34,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 35,
                length: 4,
                token: Token::Word(Word::Number(Number::Float(32.3))),
            },
            PositionalToken {
                source: uws,
                offset: 39,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 40,
                length: 4,
                token: Token::Word(Word::Word("feet".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 44,
                length: 1,
                token: Token::Special(Special::Punctuation(',')),
            },
            PositionalToken {
                source: uws,
                offset: 45,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 46,
                length: 5,
                token: Token::Word(Word::Word("right".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 51,
                length: 1,
                token: Token::Special(Special::Punctuation('?')),
            },
            PositionalToken {
                source: uws,
                offset: 52,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // Digits followed by letters classify as a "measure" numerical.
            PositionalToken {
                source: uws,
                offset: 53,
                length: 4,
                token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
            },
            PositionalToken {
                source: uws,
                offset: 57,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 58,
                length: 3,
                token: Token::Word(Word::Word("etc".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 61,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 62,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 63,
                length: 3,
                token: Token::Word(Word::Word("qeq".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 66,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // "U.S.A" is split into single-letter words and dot punctuation.
            PositionalToken {
                source: uws,
                offset: 67,
                length: 1,
                token: Token::Word(Word::Word("U".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 68,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 69,
                length: 1,
                token: Token::Word(Word::Word("S".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 70,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 71,
                length: 1,
                token: Token::Word(Word::Word("A".to_string())),
            },
            // Two adjacent spaces merge into one length-2 Space token.
            PositionalToken {
                source: uws,
                offset: 72,
                length: 2,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 74,
                length: 3,
                token: Token::Word(Word::Word("asd".to_string())),
            },
            // The "\n\n\n" run merges into one length-3 Newline token.
            PositionalToken {
                source: uws,
                offset: 77,
                length: 3,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
            PositionalToken {
                source: uws,
                offset: 80,
                length: 3,
                token: Token::Word(Word::Word("Brr".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 83,
                length: 1,
                token: Token::Special(Special::Punctuation(',')),
            },
            PositionalToken {
                source: uws,
                offset: 84,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 85,
                length: 4,
                token: Token::Word(Word::Word("it\'s".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 89,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 90,
                length: 4,
                token: Token::Word(Word::Number(Number::Float(29.3))),
            },
            // '°' is 2 bytes in UTF-8, hence length 2.
            PositionalToken {
                source: uws,
                offset: 94,
                length: 2,
                token: Token::Special(Special::Symbol('°')),
            },
            PositionalToken {
                source: uws,
                offset: 96,
                length: 1,
                token: Token::Word(Word::Word("F".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 97,
                length: 1,
                token: Token::Special(Special::Punctuation('!')),
            },
            PositionalToken {
                source: uws,
                offset: 98,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
            PositionalToken {
                source: uws,
                offset: 99,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // Cyrillic words: byte length = 2 × char count.
            PositionalToken {
                source: uws,
                offset: 100,
                length: 14,
                token: Token::Word(Word::Word("Русское".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 114,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 115,
                length: 22,
                token: Token::Word(Word::Word("предложение".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 137,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // Unlike `complex()`, v1 does not form a hashtag: '#' and the
            // number stay separate tokens.
            PositionalToken {
                source: uws,
                offset: 138,
                length: 1,
                token: Token::Special(Special::Punctuation('#')),
            },
            PositionalToken {
                source: uws,
                offset: 139,
                length: 4,
                token: Token::Word(Word::Number(Number::Float(36.6))),
            },
            PositionalToken {
                source: uws,
                offset: 143,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 144,
                length: 6,
                token: Token::Word(Word::Word("для".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 150,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 151,
                length: 24,
                token: Token::Word(Word::Word("тестирования".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 175,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 176,
                length: 14,
                token: Token::Word(Word::Word("деления".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 190,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 191,
                length: 4,
                token: Token::Word(Word::Word("по".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 195,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // Hyphenated compound splits into word / '-' / word.
            PositionalToken {
                source: uws,
                offset: 196,
                length: 12,
                token: Token::Word(Word::Word("юникод".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 208,
                length: 1,
                token: Token::Special(Special::Punctuation('-')),
            },
            PositionalToken {
                source: uws,
                offset: 209,
                length: 12,
                token: Token::Word(Word::Word("словам".to_string())),
            },
            // "..." merges into a single length-3 '.' punctuation token.
            PositionalToken {
                source: uws,
                offset: 221,
                length: 3,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 224,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
        ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        // Helper (defined elsewhere in this test module) compares expected
        // vs. produced tokens and reports mismatches against `uws`.
        check_results(&result, &lib_res, uws);
    }
2090
    #[test]
    fn general_no_split() {
        // Same input text as `general_default`, but tokenized with
        // `Default::default()` parameters. Differences visible in the
        // expectations below:
        //   * "U.S.A" is kept as a single 5-byte word (not split at dots);
        //   * runs of identical separators are NOT merged — the two spaces
        //     at 72–73 and the three newlines at 77–79 each appear as
        //     separate length-1 tokens;
        //   * the trailing "..." is three separate '.' tokens (221–223).
        // Offsets/lengths are in BYTES (Cyrillic letters and '°' are 2 bytes
        // each in UTF-8). The literal contains two spaces between "U.S.A"
        // and "asd" and three consecutive newlines before "Brr".
        let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A  asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 3,
                token: Token::Word(Word::Word("The".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 3,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 4,
                length: 5,
                token: Token::Word(Word::Word("quick".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 9,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 10,
                length: 1,
                token: Token::Special(Special::Punctuation('(')),
            },
            PositionalToken {
                source: uws,
                offset: 11,
                length: 1,
                token: Token::Special(Special::Punctuation('"')),
            },
            PositionalToken {
                source: uws,
                offset: 12,
                length: 5,
                token: Token::Word(Word::Word("brown".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 17,
                length: 1,
                token: Token::Special(Special::Punctuation('"')),
            },
            PositionalToken {
                source: uws,
                offset: 18,
                length: 1,
                token: Token::Special(Special::Punctuation(')')),
            },
            PositionalToken {
                source: uws,
                offset: 19,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 20,
                length: 3,
                token: Token::Word(Word::Word("fox".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 23,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // The apostrophe stays inside the word: "can't" is one token.
            PositionalToken {
                source: uws,
                offset: 24,
                length: 5,
                token: Token::Word(Word::Word("can\'t".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 29,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 30,
                length: 4,
                token: Token::Word(Word::Word("jump".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 34,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 35,
                length: 4,
                token: Token::Word(Word::Number(Number::Float(32.3))),
            },
            PositionalToken {
                source: uws,
                offset: 39,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 40,
                length: 4,
                token: Token::Word(Word::Word("feet".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 44,
                length: 1,
                token: Token::Special(Special::Punctuation(',')),
            },
            PositionalToken {
                source: uws,
                offset: 45,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 46,
                length: 5,
                token: Token::Word(Word::Word("right".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 51,
                length: 1,
                token: Token::Special(Special::Punctuation('?')),
            },
            PositionalToken {
                source: uws,
                offset: 52,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // Digits followed by letters classify as a "measure" numerical.
            PositionalToken {
                source: uws,
                offset: 53,
                length: 4,
                token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
            },
            PositionalToken {
                source: uws,
                offset: 57,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 58,
                length: 3,
                token: Token::Word(Word::Word("etc".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 61,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 62,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 63,
                length: 3,
                token: Token::Word(Word::Word("qeq".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 66,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // No dot-splitting here: "U.S.A" survives as a single word.
            PositionalToken {
                source: uws,
                offset: 67,
                length: 5,
                token: Token::Word(Word::Word("U.S.A".to_string())),
            },
            // Two adjacent spaces remain two separate tokens.
            PositionalToken {
                source: uws,
                offset: 72,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 73,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 74,
                length: 3,
                token: Token::Word(Word::Word("asd".to_string())),
            },
            // Three newlines remain three separate tokens (no merging).
            PositionalToken {
                source: uws,
                offset: 77,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
            PositionalToken {
                source: uws,
                offset: 78,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
            PositionalToken {
                source: uws,
                offset: 79,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
            PositionalToken {
                source: uws,
                offset: 80,
                length: 3,
                token: Token::Word(Word::Word("Brr".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 83,
                length: 1,
                token: Token::Special(Special::Punctuation(',')),
            },
            PositionalToken {
                source: uws,
                offset: 84,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 85,
                length: 4,
                token: Token::Word(Word::Word("it\'s".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 89,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 90,
                length: 4,
                token: Token::Word(Word::Number(Number::Float(29.3))),
            },
            // '°' is 2 bytes in UTF-8, hence length 2.
            PositionalToken {
                source: uws,
                offset: 94,
                length: 2,
                token: Token::Special(Special::Symbol('°')),
            },
            PositionalToken {
                source: uws,
                offset: 96,
                length: 1,
                token: Token::Word(Word::Word("F".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 97,
                length: 1,
                token: Token::Special(Special::Punctuation('!')),
            },
            PositionalToken {
                source: uws,
                offset: 98,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
            PositionalToken {
                source: uws,
                offset: 99,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // Cyrillic words: byte length = 2 × char count.
            PositionalToken {
                source: uws,
                offset: 100,
                length: 14,
                token: Token::Word(Word::Word("Русское".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 114,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 115,
                length: 22,
                token: Token::Word(Word::Word("предложение".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 137,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // No hashtag token with default params: '#' and the number are
            // separate (compare with the `complex()` expectations).
            PositionalToken {
                source: uws,
                offset: 138,
                length: 1,
                token: Token::Special(Special::Punctuation('#')),
            },
            PositionalToken {
                source: uws,
                offset: 139,
                length: 4,
                token: Token::Word(Word::Number(Number::Float(36.6))),
            },
            PositionalToken {
                source: uws,
                offset: 143,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 144,
                length: 6,
                token: Token::Word(Word::Word("для".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 150,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 151,
                length: 24,
                token: Token::Word(Word::Word("тестирования".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 175,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 176,
                length: 14,
                token: Token::Word(Word::Word("деления".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 190,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 191,
                length: 4,
                token: Token::Word(Word::Word("по".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 195,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // Hyphenated compound splits into word / '-' / word.
            PositionalToken {
                source: uws,
                offset: 196,
                length: 12,
                token: Token::Word(Word::Word("юникод".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 208,
                length: 1,
                token: Token::Special(Special::Punctuation('-')),
            },
            PositionalToken {
                source: uws,
                offset: 209,
                length: 12,
                token: Token::Word(Word::Word("словам".to_string())),
            },
            // "..." stays as three separate '.' tokens (no merging).
            PositionalToken {
                source: uws,
                offset: 221,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 222,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 223,
                length: 1,
                token: Token::Special(Special::Punctuation('.')),
            },
            PositionalToken {
                source: uws,
                offset: 224,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
        ];
        let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
        // Helper (defined elsewhere in this test module) compares expected
        // vs. produced tokens and reports mismatches against `uws`.
        check_results(&result, &lib_res, uws);
    }
2525
2526 #[test]
2527 fn general_complex() {
2528 let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
2529 let result = vec![
2530 PositionalToken {
2531 source: uws,
2532 offset: 0,
2533 length: 3,
2534 token: Token::Word(Word::Word("The".to_string())),
2535 },
2536 PositionalToken {
2537 source: uws,
2538 offset: 3,
2539 length: 1,
2540 token: Token::Special(Special::Separator(Separator::Space)),
2541 },
2542 PositionalToken {
2543 source: uws,
2544 offset: 4,
2545 length: 5,
2546 token: Token::Word(Word::Word("quick".to_string())),
2547 },
2548 PositionalToken {
2549 source: uws,
2550 offset: 9,
2551 length: 1,
2552 token: Token::Special(Special::Separator(Separator::Space)),
2553 },
2554 PositionalToken {
2555 source: uws,
2556 offset: 10,
2557 length: 1,
2558 token: Token::Special(Special::Punctuation('(')),
2559 },
2560 PositionalToken {
2561 source: uws,
2562 offset: 11,
2563 length: 1,
2564 token: Token::Special(Special::Punctuation('"')),
2565 },
2566 PositionalToken {
2567 source: uws,
2568 offset: 12,
2569 length: 5,
2570 token: Token::Word(Word::Word("brown".to_string())),
2571 },
2572 PositionalToken {
2573 source: uws,
2574 offset: 17,
2575 length: 1,
2576 token: Token::Special(Special::Punctuation('"')),
2577 },
2578 PositionalToken {
2579 source: uws,
2580 offset: 18,
2581 length: 1,
2582 token: Token::Special(Special::Punctuation(')')),
2583 },
2584 PositionalToken {
2585 source: uws,
2586 offset: 19,
2587 length: 1,
2588 token: Token::Special(Special::Separator(Separator::Space)),
2589 },
2590 PositionalToken {
2591 source: uws,
2592 offset: 20,
2593 length: 3,
2594 token: Token::Word(Word::Word("fox".to_string())),
2595 },
2596 PositionalToken {
2597 source: uws,
2598 offset: 23,
2599 length: 1,
2600 token: Token::Special(Special::Separator(Separator::Space)),
2601 },
2602 PositionalToken {
2603 source: uws,
2604 offset: 24,
2605 length: 5,
2606 token: Token::Word(Word::Word("can\'t".to_string())),
2607 },
2608 PositionalToken {
2609 source: uws,
2610 offset: 29,
2611 length: 1,
2612 token: Token::Special(Special::Separator(Separator::Space)),
2613 },
2614 PositionalToken {
2615 source: uws,
2616 offset: 30,
2617 length: 4,
2618 token: Token::Word(Word::Word("jump".to_string())),
2619 },
2620 PositionalToken {
2621 source: uws,
2622 offset: 34,
2623 length: 1,
2624 token: Token::Special(Special::Separator(Separator::Space)),
2625 },
2626 PositionalToken {
2627 source: uws,
2628 offset: 35,
2629 length: 4,
2630 token: Token::Word(Word::Number(Number::Float(32.3))),
2631 },
2632 PositionalToken {
2633 source: uws,
2634 offset: 39,
2635 length: 1,
2636 token: Token::Special(Special::Separator(Separator::Space)),
2637 },
2638 PositionalToken {
2639 source: uws,
2640 offset: 40,
2641 length: 4,
2642 token: Token::Word(Word::Word("feet".to_string())),
2643 },
2644 PositionalToken {
2645 source: uws,
2646 offset: 44,
2647 length: 1,
2648 token: Token::Special(Special::Punctuation(',')),
2649 },
2650 PositionalToken {
2651 source: uws,
2652 offset: 45,
2653 length: 1,
2654 token: Token::Special(Special::Separator(Separator::Space)),
2655 },
2656 PositionalToken {
2657 source: uws,
2658 offset: 46,
2659 length: 5,
2660 token: Token::Word(Word::Word("right".to_string())),
2661 },
2662 PositionalToken {
2663 source: uws,
2664 offset: 51,
2665 length: 1,
2666 token: Token::Special(Special::Punctuation('?')),
2667 },
2668 PositionalToken {
2669 source: uws,
2670 offset: 52,
2671 length: 1,
2672 token: Token::Special(Special::Separator(Separator::Space)),
2673 },
2674 PositionalToken {
2675 source: uws,
2676 offset: 53,
2677 length: 4,
2678 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
2679 }, PositionalToken {
2681 source: uws,
2682 offset: 57,
2683 length: 1,
2684 token: Token::Special(Special::Separator(Separator::Space)),
2685 },
2686 PositionalToken {
2687 source: uws,
2688 offset: 58,
2689 length: 3,
2690 token: Token::Word(Word::Word("etc".to_string())),
2691 },
2692 PositionalToken {
2693 source: uws,
2694 offset: 61,
2695 length: 1,
2696 token: Token::Special(Special::Punctuation('.')),
2697 },
2698 PositionalToken {
2699 source: uws,
2700 offset: 62,
2701 length: 1,
2702 token: Token::Special(Special::Separator(Separator::Space)),
2703 },
2704 PositionalToken {
2705 source: uws,
2706 offset: 63,
2707 length: 3,
2708 token: Token::Word(Word::Word("qeq".to_string())),
2709 },
2710 PositionalToken {
2711 source: uws,
2712 offset: 66,
2713 length: 1,
2714 token: Token::Special(Special::Separator(Separator::Space)),
2715 },
2716 PositionalToken {
2717 source: uws,
2718 offset: 67,
2719 length: 5,
2720 token: Token::Word(Word::Word("U.S.A".to_string())),
2721 },
2722 PositionalToken {
2723 source: uws,
2724 offset: 72,
2725 length: 2,
2726 token: Token::Special(Special::Separator(Separator::Space)),
2727 },
2728 PositionalToken {
2729 source: uws,
2730 offset: 74,
2731 length: 3,
2732 token: Token::Word(Word::Word("asd".to_string())),
2733 },
2734 PositionalToken {
2735 source: uws,
2736 offset: 77,
2737 length: 3,
2738 token: Token::Special(Special::Separator(Separator::Newline)),
2739 },
2740 PositionalToken {
2741 source: uws,
2742 offset: 80,
2743 length: 3,
2744 token: Token::Word(Word::Word("Brr".to_string())),
2745 },
2746 PositionalToken {
2747 source: uws,
2748 offset: 83,
2749 length: 1,
2750 token: Token::Special(Special::Punctuation(',')),
2751 },
2752 PositionalToken {
2753 source: uws,
2754 offset: 84,
2755 length: 1,
2756 token: Token::Special(Special::Separator(Separator::Space)),
2757 },
2758 PositionalToken {
2759 source: uws,
2760 offset: 85,
2761 length: 4,
2762 token: Token::Word(Word::Word("it\'s".to_string())),
2763 },
2764 PositionalToken {
2765 source: uws,
2766 offset: 89,
2767 length: 1,
2768 token: Token::Special(Special::Separator(Separator::Space)),
2769 },
2770 PositionalToken {
2771 source: uws,
2772 offset: 90,
2773 length: 4,
2774 token: Token::Word(Word::Number(Number::Float(29.3))),
2775 },
2776 PositionalToken {
2777 source: uws,
2778 offset: 94,
2779 length: 2,
2780 token: Token::Special(Special::Symbol('°')),
2781 },
2782 PositionalToken {
2783 source: uws,
2784 offset: 96,
2785 length: 1,
2786 token: Token::Word(Word::Word("F".to_string())),
2787 },
2788 PositionalToken {
2789 source: uws,
2790 offset: 97,
2791 length: 1,
2792 token: Token::Special(Special::Punctuation('!')),
2793 },
2794 PositionalToken {
2795 source: uws,
2796 offset: 98,
2797 length: 1,
2798 token: Token::Special(Special::Separator(Separator::Newline)),
2799 },
2800 PositionalToken {
2801 source: uws,
2802 offset: 99,
2803 length: 1,
2804 token: Token::Special(Special::Separator(Separator::Space)),
2805 },
2806 PositionalToken {
2807 source: uws,
2808 offset: 100,
2809 length: 14,
2810 token: Token::Word(Word::Word("Русское".to_string())),
2811 },
2812 PositionalToken {
2813 source: uws,
2814 offset: 114,
2815 length: 1,
2816 token: Token::Special(Special::Separator(Separator::Space)),
2817 },
2818 PositionalToken {
2819 source: uws,
2820 offset: 115,
2821 length: 22,
2822 token: Token::Word(Word::Word("предложение".to_string())),
2823 },
2824 PositionalToken {
2825 source: uws,
2826 offset: 137,
2827 length: 1,
2828 token: Token::Special(Special::Separator(Separator::Space)),
2829 },
2830 PositionalToken {
2831 source: uws,
2832 offset: 138,
2833 length: 5,
2834 token: Token::Struct(Struct::Hashtag("36.6".to_string())),
2835 },
2836 PositionalToken {
2837 source: uws,
2838 offset: 143,
2839 length: 1,
2840 token: Token::Special(Special::Separator(Separator::Space)),
2841 },
2842 PositionalToken {
2843 source: uws,
2844 offset: 144,
2845 length: 6,
2846 token: Token::Word(Word::Word("для".to_string())),
2847 },
2848 PositionalToken {
2849 source: uws,
2850 offset: 150,
2851 length: 1,
2852 token: Token::Special(Special::Separator(Separator::Space)),
2853 },
2854 PositionalToken {
2855 source: uws,
2856 offset: 151,
2857 length: 24,
2858 token: Token::Word(Word::Word("тестирования".to_string())),
2859 },
2860 PositionalToken {
2861 source: uws,
2862 offset: 175,
2863 length: 1,
2864 token: Token::Special(Special::Separator(Separator::Space)),
2865 },
2866 PositionalToken {
2867 source: uws,
2868 offset: 176,
2869 length: 14,
2870 token: Token::Word(Word::Word("деления".to_string())),
2871 },
2872 PositionalToken {
2873 source: uws,
2874 offset: 190,
2875 length: 1,
2876 token: Token::Special(Special::Separator(Separator::Space)),
2877 },
2878 PositionalToken {
2879 source: uws,
2880 offset: 191,
2881 length: 4,
2882 token: Token::Word(Word::Word("по".to_string())),
2883 },
2884 PositionalToken {
2885 source: uws,
2886 offset: 195,
2887 length: 1,
2888 token: Token::Special(Special::Separator(Separator::Space)),
2889 },
2890 PositionalToken {
2891 source: uws,
2892 offset: 196,
2893 length: 12,
2894 token: Token::Word(Word::Word("юникод".to_string())),
2895 },
2896 PositionalToken {
2897 source: uws,
2898 offset: 208,
2899 length: 1,
2900 token: Token::Special(Special::Punctuation('-')),
2901 },
2902 PositionalToken {
2903 source: uws,
2904 offset: 209,
2905 length: 12,
2906 token: Token::Word(Word::Word("словам".to_string())),
2907 },
2908 PositionalToken {
2909 source: uws,
2910 offset: 221,
2911 length: 3,
2912 token: Token::Special(Special::Punctuation('.')),
2913 },
2914 PositionalToken {
2915 source: uws,
2916 offset: 224,
2917 length: 1,
2918 token: Token::Special(Special::Separator(Separator::Newline)),
2919 },
2920 ];
2921 let lib_res = uws
2922 .into_tokenizer(TokenizerParams::complex())
2923 .collect::<Vec<_>>();
2924 check_results(&result, &lib_res, uws);
2925 }
2926
2927 #[test]
2928 fn plus_minus() {
2929 let uws = "+23 -4.5 -34 +25.7 - 2 + 5.6";
2930 let result = vec![
2931 PositionalToken {
2932 source: uws,
2933 offset: 0,
2934 length: 3,
2935 token: Token::Word(Word::Number(Number::Integer(23))),
2936 },
2937 PositionalToken {
2938 source: uws,
2939 offset: 3,
2940 length: 1,
2941 token: Token::Special(Special::Separator(Separator::Space)),
2942 },
2943 PositionalToken {
2944 source: uws,
2945 offset: 4,
2946 length: 4,
2947 token: Token::Word(Word::Number(Number::Float(-4.5))),
2948 },
2949 PositionalToken {
2950 source: uws,
2951 offset: 8,
2952 length: 1,
2953 token: Token::Special(Special::Separator(Separator::Space)),
2954 },
2955 PositionalToken {
2956 source: uws,
2957 offset: 9,
2958 length: 3,
2959 token: Token::Word(Word::Number(Number::Integer(-34))),
2960 },
2961 PositionalToken {
2962 source: uws,
2963 offset: 12,
2964 length: 1,
2965 token: Token::Special(Special::Separator(Separator::Space)),
2966 },
2967 PositionalToken {
2968 source: uws,
2969 offset: 13,
2970 length: 5,
2971 token: Token::Word(Word::Number(Number::Float(25.7))),
2972 },
2973 PositionalToken {
2974 source: uws,
2975 offset: 18,
2976 length: 1,
2977 token: Token::Special(Special::Separator(Separator::Space)),
2978 },
2979 PositionalToken {
2980 source: uws,
2981 offset: 19,
2982 length: 1,
2983 token: Token::Special(Special::Punctuation('-')),
2984 },
2985 PositionalToken {
2986 source: uws,
2987 offset: 20,
2988 length: 1,
2989 token: Token::Special(Special::Separator(Separator::Space)),
2990 },
2991 PositionalToken {
2992 source: uws,
2993 offset: 21,
2994 length: 1,
2995 token: Token::Word(Word::Number(Number::Integer(2))),
2996 },
2997 PositionalToken {
2998 source: uws,
2999 offset: 22,
3000 length: 1,
3001 token: Token::Special(Special::Separator(Separator::Space)),
3002 },
3003 PositionalToken {
3004 source: uws,
3005 offset: 23,
3006 length: 1,
3007 token: Token::Special(Special::Punctuation('+')),
3008 },
3009 PositionalToken {
3010 source: uws,
3011 offset: 24,
3012 length: 1,
3013 token: Token::Special(Special::Separator(Separator::Space)),
3014 },
3015 PositionalToken {
3016 source: uws,
3017 offset: 25,
3018 length: 3,
3019 token: Token::Word(Word::Number(Number::Float(5.6))),
3020 },
3021 ];
3022 let lib_res = uws
3023 .into_tokenizer(TokenizerParams::v1())
3024 .collect::<Vec<_>>();
3025 check(&result, &lib_res, uws);
3026 }
3028
3029 #[test]
3030 #[ignore]
3031 fn woman_bouncing_ball() {
3032 let uws = "\u{26f9}\u{200d}\u{2640}";
3033 let result = vec![PositionalToken {
3034 source: uws,
3035 offset: 0,
3036 length: 9,
3037 token: Token::Word(Word::Emoji("woman_bouncing_ball")),
3038 }];
3039 let lib_res = uws
3040 .into_tokenizer(TokenizerParams::v1())
3041 .collect::<Vec<_>>();
3042 check_results(&result, &lib_res, uws);
3043 }
3045
3046 #[test]
3047 fn emoji_and_rusabbr_default() {
3048 let uws = "🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n👱\nС.С.С.Р.\n👨👩👦👦\n🧠\n";
3049 let result = vec![
3050 PositionalToken {
3051 source: uws,
3052 offset: 0,
3053 length: 8,
3054 token: Token::Word(Word::Emoji("russia")),
3055 },
3056 PositionalToken {
3057 source: uws,
3058 offset: 8,
3059 length: 1,
3060 token: Token::Special(Special::Separator(Separator::Space)),
3061 },
3062 PositionalToken {
3063 source: uws,
3064 offset: 9,
3065 length: 8,
3066 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
3067 },
3068 PositionalToken {
3069 source: uws,
3070 offset: 17,
3071 length: 1,
3072 token: Token::Special(Special::Separator(Separator::Newline)),
3073 },
3074 PositionalToken {
3075 source: uws,
3076 offset: 18,
3077 length: 8,
3078 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
3079 },
3080 PositionalToken {
3081 source: uws,
3082 offset: 26,
3083 length: 8,
3084 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
3085 },
3086 PositionalToken {
3087 source: uws,
3088 offset: 34,
3089 length: 8,
3090 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
3091 },
3092 PositionalToken {
3093 source: uws,
3094 offset: 42,
3095 length: 1,
3096 token: Token::Special(Special::Separator(Separator::Newline)),
3097 },
3098 PositionalToken {
3099 source: uws,
3100 offset: 43,
3101 length: 4,
3102 token: Token::Word(Word::Emoji("blond_haired_person")),
3103 },
3104 PositionalToken {
3105 source: uws,
3106 offset: 47,
3107 length: 1,
3108 token: Token::Special(Special::Separator(Separator::Newline)),
3109 },
3110 PositionalToken {
3111 source: uws,
3112 offset: 48,
3113 length: 2,
3114 token: Token::Word(Word::Word("С".to_string())),
3115 },
3116 PositionalToken {
3117 source: uws,
3118 offset: 50,
3119 length: 1,
3120 token: Token::Special(Special::Punctuation('.')),
3121 },
3122 PositionalToken {
3123 source: uws,
3124 offset: 51,
3125 length: 2,
3126 token: Token::Word(Word::Word("С".to_string())),
3127 },
3128 PositionalToken {
3129 source: uws,
3130 offset: 53,
3131 length: 1,
3132 token: Token::Special(Special::Punctuation('.')),
3133 },
3134 PositionalToken {
3135 source: uws,
3136 offset: 54,
3137 length: 2,
3138 token: Token::Word(Word::Word("С".to_string())),
3139 },
3140 PositionalToken {
3141 source: uws,
3142 offset: 56,
3143 length: 1,
3144 token: Token::Special(Special::Punctuation('.')),
3145 },
3146 PositionalToken {
3147 source: uws,
3148 offset: 57,
3149 length: 2,
3150 token: Token::Word(Word::Word("Р".to_string())),
3151 },
3152 PositionalToken {
3153 source: uws,
3154 offset: 59,
3155 length: 1,
3156 token: Token::Special(Special::Punctuation('.')),
3157 },
3158 PositionalToken {
3159 source: uws,
3160 offset: 60,
3161 length: 1,
3162 token: Token::Special(Special::Separator(Separator::Newline)),
3163 },
3164 PositionalToken {
3165 source: uws,
3166 offset: 61,
3167 length: 25,
3168 token: Token::Word(Word::Emoji("family_man_woman_boy_boy")),
3169 },
3170 PositionalToken {
3171 source: uws,
3172 offset: 86,
3173 length: 1,
3174 token: Token::Special(Special::Separator(Separator::Newline)),
3175 },
3176 PositionalToken {
3177 source: uws,
3178 offset: 87,
3179 length: 4,
3180 token: Token::Word(Word::Emoji("brain")),
3181 },
3182 PositionalToken {
3183 source: uws,
3184 offset: 91,
3185 length: 1,
3186 token: Token::Special(Special::Separator(Separator::Newline)),
3187 },
3188 ];
3189
3190 let lib_res = uws
3191 .into_tokenizer(TokenizerParams::v1())
3192 .collect::<Vec<_>>();
3193 check_results(&result, &lib_res, uws);
3194 }
3196
3197 #[test]
3198 fn emoji_and_rusabbr_no_split() {
3199 let uws = "🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n👱\nС.С.С.Р.\n👨👩👦👦\n🧠\n";
3200 let result = vec![
3201 PositionalToken {
3202 source: uws,
3203 offset: 0,
3204 length: 8,
3205 token: Token::Word(Word::Emoji("russia")),
3206 },
3207 PositionalToken {
3208 source: uws,
3209 offset: 8,
3210 length: 1,
3211 token: Token::Special(Special::Separator(Separator::Space)),
3212 },
3213 PositionalToken {
3214 source: uws,
3215 offset: 9,
3216 length: 8,
3217 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
3218 },
3219 PositionalToken {
3220 source: uws,
3221 offset: 17,
3222 length: 1,
3223 token: Token::Special(Special::Separator(Separator::Newline)),
3224 },
3225 PositionalToken {
3226 source: uws,
3227 offset: 18,
3228 length: 8,
3229 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
3230 },
3231 PositionalToken {
3232 source: uws,
3233 offset: 26,
3234 length: 8,
3235 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
3236 },
3237 PositionalToken {
3238 source: uws,
3239 offset: 34,
3240 length: 8,
3241 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
3242 },
3243 PositionalToken {
3244 source: uws,
3245 offset: 42,
3246 length: 1,
3247 token: Token::Special(Special::Separator(Separator::Newline)),
3248 },
3249 PositionalToken {
3250 source: uws,
3251 offset: 43,
3252 length: 4,
3253 token: Token::Word(Word::Emoji("blond_haired_person")),
3254 },
3255 PositionalToken {
3256 source: uws,
3257 offset: 47,
3258 length: 1,
3259 token: Token::Special(Special::Separator(Separator::Newline)),
3260 },
3261 PositionalToken {
3262 source: uws,
3263 offset: 48,
3264 length: 11,
3265 token: Token::Word(Word::Word("С.С.С.Р".to_string())),
3266 },
3267 PositionalToken {
3268 source: uws,
3269 offset: 59,
3270 length: 1,
3271 token: Token::Special(Special::Punctuation('.')),
3272 },
3273 PositionalToken {
3274 source: uws,
3275 offset: 60,
3276 length: 1,
3277 token: Token::Special(Special::Separator(Separator::Newline)),
3278 },
3279 PositionalToken {
3280 source: uws,
3281 offset: 61,
3282 length: 25,
3283 token: Token::Word(Word::Emoji("family_man_woman_boy_boy")),
3284 },
3285 PositionalToken {
3286 source: uws,
3287 offset: 86,
3288 length: 1,
3289 token: Token::Special(Special::Separator(Separator::Newline)),
3290 },
3291 PositionalToken {
3292 source: uws,
3293 offset: 87,
3294 length: 4,
3295 token: Token::Word(Word::Emoji("brain")),
3296 },
3297 PositionalToken {
3298 source: uws,
3299 offset: 91,
3300 length: 1,
3301 token: Token::Special(Special::Separator(Separator::Newline)),
3302 },
3303 ];
3304
3305 let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
3306 check_results(&result, &lib_res, uws);
3307 }
3309
3310 #[test]
3534 fn html() {
3535 let uws = "<div class=\"article article_view \" id=\"article_view_-113039156_9551\" data-article-url=\"/@chaibuket-o-chem-ne-zabyt-25-noyabrya\" data-audio-context=\"article:-113039156_9551\"><h1 class=\"article_decoration_first article_decoration_last\" >День Мамы </h1><p class=\"article_decoration_first article_decoration_last\" >День, когда поздравляют мам, бабушек, сестер и жён — это всемирный праздник, называемый «День Мамы». В настоящее время его отмечают почти в каждой стране, просто везде разные даты и способы празднования. </p><h3 class=\"article_decoration_first article_decoration_last\" ><span class='article_anchor_title'>\n <span class='article_anchor_button' id='pochemu-my-ego-prazdnuem'></span>\n <span class='article_anchor_fsymbol'>П</span>\n</span>ПОЧЕМУ МЫ ЕГО ПРАЗДНУЕМ</h3><p class=\"article_decoration_first article_decoration_last article_decoration_before\" >В 1987 году комитет госдумы по делам женщин, семьи и молодежи выступил с предложением учредить «День мамы», а сам приказ был подписан уже 30 января 1988 года Борисом Ельциным. Было решено, что ежегодно в России празднество дня мамы будет выпадать на последнее воскресенье ноября. 
</p><figure data-type=\"101\" data-mode=\"\" class=\"article_decoration_first article_decoration_last\" >\n <div class=\"article_figure_content\" style=\"width: 1125px\">\n <div class=\"article_figure_sizer_content\"><div class=\"article_object_sizer_wrap\" data-sizes=\"[{"s":["https://pp.userapi.com/c849128/v849128704/c0ffd/pcNJaBH3NDo.jpg",75,50],"m":["https://pp.userapi.com/c849128/v849128704/c0ffe/ozCLs2kHtRY.jpg",130,87],"x":["https://pp.userapi.com/c849128/v849128704/c0fff/E4KtTNDydzE.jpg",604,403],"y":["https://pp.userapi.com/c849128/v849128704/c1000/1nLxpYKavzU.jpg",807,538],"z":["https://pp.userapi.com/c849128/v849128704/c1001/IgEODe90yEk.jpg",1125,750],"o":["https://pp.userapi.com/c849128/v849128704/c1002/01faNwVZ2_E.jpg",130,87],"p":["https://pp.userapi.com/c849128/v849128704/c1003/baDFzbdRP2s.jpg",200,133],"q":["https://pp.userapi.com/c849128/v849128704/c1004/CY4khI6KJKA.jpg",320,213],"r":["https://pp.userapi.com/c849128/v849128704/c1005/NOvAJ6-VltY.jpg",510,340]}]\">\n <img class=\"article_object_sizer_inner article_object_photo__image_blur\" src=\"https://pp.userapi.com/c849128/v849128704/c0ffd/pcNJaBH3NDo.jpg\" data-baseurl=\"\"/>\n \n</div></div>\n <div class=\"article_figure_sizer\" style=\"padding-bottom: 66.666666666667%\"></div>";
3536 let result = vec![
3537 PositionalToken {
3538 source: uws,
3539 offset: 236,
3540 length: 8,
3541 token: Token::Word(Word::Word("День".to_string())),
3542 },
3543 PositionalToken {
3544 source: uws,
3545 offset: 244,
3546 length: 1,
3547 token: Token::Special(Special::Separator(Separator::Space)),
3548 },
3549 PositionalToken {
3550 source: uws,
3551 offset: 245,
3552 length: 8,
3553 token: Token::Word(Word::Word("Мамы".to_string())),
3554 },
3555 PositionalToken {
3556 source: uws,
3557 offset: 253,
3558 length: 1,
3559 token: Token::Special(Special::Separator(Separator::Space)),
3560 },
3561 PositionalToken {
3562 source: uws,
3563 offset: 321,
3564 length: 8,
3565 token: Token::Word(Word::Word("День".to_string())),
3566 },
3567 PositionalToken {
3568 source: uws,
3569 offset: 329,
3570 length: 1,
3571 token: Token::Special(Special::Punctuation(',')),
3572 },
3573 PositionalToken {
3574 source: uws,
3575 offset: 330,
3576 length: 1,
3577 token: Token::Special(Special::Separator(Separator::Space)),
3578 },
3579 PositionalToken {
3580 source: uws,
3581 offset: 331,
3582 length: 10,
3583 token: Token::Word(Word::Word("когда".to_string())),
3584 },
3585 PositionalToken {
3586 source: uws,
3587 offset: 341,
3588 length: 1,
3589 token: Token::Special(Special::Separator(Separator::Space)),
3590 },
3591 PositionalToken {
3592 source: uws,
3593 offset: 342,
3594 length: 22,
3595 token: Token::Word(Word::Word("поздравляют".to_string())),
3596 },
3597 PositionalToken {
3598 source: uws,
3599 offset: 364,
3600 length: 1,
3601 token: Token::Special(Special::Separator(Separator::Space)),
3602 },
3603 PositionalToken {
3604 source: uws,
3605 offset: 365,
3606 length: 6,
3607 token: Token::Word(Word::Word("мам".to_string())),
3608 },
3609 PositionalToken {
3610 source: uws,
3611 offset: 371,
3612 length: 1,
3613 token: Token::Special(Special::Punctuation(',')),
3614 },
3615 PositionalToken {
3616 source: uws,
3617 offset: 372,
3618 length: 1,
3619 token: Token::Special(Special::Separator(Separator::Space)),
3620 },
3621 PositionalToken {
3622 source: uws,
3623 offset: 373,
3624 length: 14,
3625 token: Token::Word(Word::Word("бабушек".to_string())),
3626 },
3627 PositionalToken {
3628 source: uws,
3629 offset: 387,
3630 length: 1,
3631 token: Token::Special(Special::Punctuation(',')),
3632 },
3633 PositionalToken {
3634 source: uws,
3635 offset: 388,
3636 length: 1,
3637 token: Token::Special(Special::Separator(Separator::Space)),
3638 },
3639 PositionalToken {
3640 source: uws,
3641 offset: 389,
3642 length: 12,
3643 token: Token::Word(Word::Word("сестер".to_string())),
3644 },
3645 PositionalToken {
3646 source: uws,
3647 offset: 401,
3648 length: 1,
3649 token: Token::Special(Special::Separator(Separator::Space)),
3650 },
3651 PositionalToken {
3652 source: uws,
3653 offset: 402,
3654 length: 2,
3655 token: Token::Word(Word::Word("и".to_string())),
3656 },
3657 PositionalToken {
3658 source: uws,
3659 offset: 404,
3660 length: 1,
3661 token: Token::Special(Special::Separator(Separator::Space)),
3662 },
3663 PositionalToken {
3664 source: uws,
3665 offset: 405,
3666 length: 6,
3667 token: Token::Word(Word::Word("жён".to_string())),
3668 },
3669 PositionalToken {
3670 source: uws,
3671 offset: 411,
3672 length: 1,
3673 token: Token::Special(Special::Separator(Separator::Space)),
3674 },
3675 PositionalToken {
3676 source: uws,
3677 offset: 412,
3678 length: 3,
3679 token: Token::Special(Special::Punctuation('—')),
3680 },
3681 PositionalToken {
3682 source: uws,
3683 offset: 415,
3684 length: 1,
3685 token: Token::Special(Special::Separator(Separator::Space)),
3686 },
3687 PositionalToken {
3688 source: uws,
3689 offset: 416,
3690 length: 6,
3691 token: Token::Word(Word::Word("это".to_string())),
3692 },
3693 PositionalToken {
3694 source: uws,
3695 offset: 422,
3696 length: 1,
3697 token: Token::Special(Special::Separator(Separator::Space)),
3698 },
3699 PositionalToken {
3700 source: uws,
3701 offset: 423,
3702 length: 18,
3703 token: Token::Word(Word::Word("всемирный".to_string())),
3704 },
3705 PositionalToken {
3706 source: uws,
3707 offset: 441,
3708 length: 1,
3709 token: Token::Special(Special::Separator(Separator::Space)),
3710 },
3711 PositionalToken {
3712 source: uws,
3713 offset: 442,
3714 length: 16,
3715 token: Token::Word(Word::Word("праздник".to_string())),
3716 },
3717 PositionalToken {
3718 source: uws,
3719 offset: 458,
3720 length: 1,
3721 token: Token::Special(Special::Punctuation(',')),
3722 },
3723 PositionalToken {
3724 source: uws,
3725 offset: 459,
3726 length: 1,
3727 token: Token::Special(Special::Separator(Separator::Space)),
3728 },
3729 PositionalToken {
3730 source: uws,
3731 offset: 460,
3732 length: 20,
3733 token: Token::Word(Word::Word("называемый".to_string())),
3734 },
3735 PositionalToken {
3736 source: uws,
3737 offset: 480,
3738 length: 1,
3739 token: Token::Special(Special::Separator(Separator::Space)),
3740 },
3741 PositionalToken {
3742 source: uws,
3743 offset: 481,
3744 length: 2,
3745 token: Token::Special(Special::Punctuation('«')),
3746 },
3747 PositionalToken {
3748 source: uws,
3749 offset: 483,
3750 length: 8,
3751 token: Token::Word(Word::Word("День".to_string())),
3752 },
3753 PositionalToken {
3754 source: uws,
3755 offset: 491,
3756 length: 1,
3757 token: Token::Special(Special::Separator(Separator::Space)),
3758 },
3759 PositionalToken {
3760 source: uws,
3761 offset: 492,
3762 length: 8,
3763 token: Token::Word(Word::Word("Мамы".to_string())),
3764 },
3765 PositionalToken {
3766 source: uws,
3767 offset: 500,
3768 length: 2,
3769 token: Token::Special(Special::Punctuation('»')),
3770 },
3771 PositionalToken {
3772 source: uws,
3773 offset: 502,
3774 length: 1,
3775 token: Token::Special(Special::Punctuation('.')),
3776 },
3777 PositionalToken {
3778 source: uws,
3779 offset: 503,
3780 length: 1,
3781 token: Token::Special(Special::Separator(Separator::Space)),
3782 },
3783 PositionalToken {
3784 source: uws,
3785 offset: 504,
3786 length: 2,
3787 token: Token::Word(Word::Word("В".to_string())),
3788 },
3789 PositionalToken {
3790 source: uws,
3791 offset: 506,
3792 length: 1,
3793 token: Token::Special(Special::Separator(Separator::Space)),
3794 },
3795 PositionalToken {
3796 source: uws,
3797 offset: 507,
3798 length: 18,
3799 token: Token::Word(Word::Word("настоящее".to_string())),
3800 },
3801 PositionalToken {
3802 source: uws,
3803 offset: 525,
3804 length: 1,
3805 token: Token::Special(Special::Separator(Separator::Space)),
3806 },
3807 PositionalToken {
3808 source: uws,
3809 offset: 526,
3810 length: 10,
3811 token: Token::Word(Word::Word("время".to_string())),
3812 },
3813 PositionalToken {
3814 source: uws,
3815 offset: 536,
3816 length: 1,
3817 token: Token::Special(Special::Separator(Separator::Space)),
3818 },
3819 PositionalToken {
3820 source: uws,
3821 offset: 537,
3822 length: 6,
3823 token: Token::Word(Word::Word("его".to_string())),
3824 },
3825 PositionalToken {
3826 source: uws,
3827 offset: 543,
3828 length: 1,
3829 token: Token::Special(Special::Separator(Separator::Space)),
3830 },
3831 PositionalToken {
3832 source: uws,
3833 offset: 544,
3834 length: 16,
3835 token: Token::Word(Word::Word("отмечают".to_string())),
3836 },
3837 PositionalToken {
3838 source: uws,
3839 offset: 560,
3840 length: 1,
3841 token: Token::Special(Special::Separator(Separator::Space)),
3842 },
3843 PositionalToken {
3844 source: uws,
3845 offset: 561,
3846 length: 10,
3847 token: Token::Word(Word::Word("почти".to_string())),
3848 },
3849 PositionalToken {
3850 source: uws,
3851 offset: 571,
3852 length: 1,
3853 token: Token::Special(Special::Separator(Separator::Space)),
3854 },
3855 PositionalToken {
3856 source: uws,
3857 offset: 572,
3858 length: 2,
3859 token: Token::Word(Word::Word("в".to_string())),
3860 },
3861 PositionalToken {
3862 source: uws,
3863 offset: 574,
3864 length: 1,
3865 token: Token::Special(Special::Separator(Separator::Space)),
3866 },
3867 PositionalToken {
3868 source: uws,
3869 offset: 575,
3870 length: 12,
3871 token: Token::Word(Word::Word("каждой".to_string())),
3872 },
3873 PositionalToken {
3874 source: uws,
3875 offset: 587,
3876 length: 1,
3877 token: Token::Special(Special::Separator(Separator::Space)),
3878 },
3879 PositionalToken {
3880 source: uws,
3881 offset: 588,
3882 length: 12,
3883 token: Token::Word(Word::Word("стране".to_string())),
3884 },
3885 PositionalToken {
3886 source: uws,
3887 offset: 600,
3888 length: 1,
3889 token: Token::Special(Special::Punctuation(',')),
3890 },
3891 PositionalToken {
3892 source: uws,
3893 offset: 601,
3894 length: 1,
3895 token: Token::Special(Special::Separator(Separator::Space)),
3896 },
3897 PositionalToken {
3898 source: uws,
3899 offset: 602,
3900 length: 12,
3901 token: Token::Word(Word::Word("просто".to_string())),
3902 },
3903 PositionalToken {
3904 source: uws,
3905 offset: 614,
3906 length: 1,
3907 token: Token::Special(Special::Separator(Separator::Space)),
3908 },
3909 PositionalToken {
3910 source: uws,
3911 offset: 615,
3912 length: 10,
3913 token: Token::Word(Word::Word("везде".to_string())),
3914 },
3915 PositionalToken {
3916 source: uws,
3917 offset: 625,
3918 length: 1,
3919 token: Token::Special(Special::Separator(Separator::Space)),
3920 },
3921 PositionalToken {
3922 source: uws,
3923 offset: 626,
3924 length: 12,
3925 token: Token::Word(Word::Word("разные".to_string())),
3926 },
3927 PositionalToken {
3928 source: uws,
3929 offset: 638,
3930 length: 1,
3931 token: Token::Special(Special::Separator(Separator::Space)),
3932 },
3933 PositionalToken {
3934 source: uws,
3935 offset: 639,
3936 length: 8,
3937 token: Token::Word(Word::Word("даты".to_string())),
3938 },
3939 PositionalToken {
3940 source: uws,
3941 offset: 647,
3942 length: 1,
3943 token: Token::Special(Special::Separator(Separator::Space)),
3944 },
3945 PositionalToken {
3946 source: uws,
3947 offset: 648,
3948 length: 2,
3949 token: Token::Word(Word::Word("и".to_string())),
3950 },
3951 PositionalToken {
3952 source: uws,
3953 offset: 650,
3954 length: 1,
3955 token: Token::Special(Special::Separator(Separator::Space)),
3956 },
3957 PositionalToken {
3958 source: uws,
3959 offset: 651,
3960 length: 14,
3961 token: Token::Word(Word::Word("способы".to_string())),
3962 },
3963 PositionalToken {
3964 source: uws,
3965 offset: 665,
3966 length: 1,
3967 token: Token::Special(Special::Separator(Separator::Space)),
3968 },
3969 PositionalToken {
3970 source: uws,
3971 offset: 666,
3972 length: 24,
3973 token: Token::Word(Word::Word("празднования".to_string())),
3974 },
3975 PositionalToken {
3976 source: uws,
3977 offset: 690,
3978 length: 1,
3979 token: Token::Special(Special::Punctuation('.')),
3980 },
3981 PositionalToken {
3982 source: uws,
3983 offset: 691,
3984 length: 1,
3985 token: Token::Special(Special::Separator(Separator::Space)),
3986 },
3987 PositionalToken {
3988 source: uws,
3989 offset: 794,
3990 length: 1,
3991 token: Token::Special(Special::Separator(Separator::Newline)),
3992 },
3993 PositionalToken {
3994 source: uws,
3995 offset: 795,
3996 length: 2,
3997 token: Token::Special(Special::Separator(Separator::Space)),
3998 },
3999 PositionalToken {
4000 source: uws,
4001 offset: 870,
4002 length: 1,
4003 token: Token::Special(Special::Separator(Separator::Newline)),
4004 },
4005 PositionalToken {
4006 source: uws,
4007 offset: 871,
4008 length: 2,
4009 token: Token::Special(Special::Separator(Separator::Space)),
4010 },
4011 PositionalToken {
4012 source: uws,
4013 offset: 910,
4014 length: 2,
4015 token: Token::Word(Word::Word("П".to_string())),
4016 },
4017 PositionalToken {
4018 source: uws,
4019 offset: 919,
4020 length: 1,
4021 token: Token::Special(Special::Separator(Separator::Newline)),
4022 },
4023 PositionalToken {
4024 source: uws,
4025 offset: 927,
4026 length: 12,
4027 token: Token::Word(Word::Word("ПОЧЕМУ".to_string())),
4028 },
4029 PositionalToken {
4030 source: uws,
4031 offset: 939,
4032 length: 1,
4033 token: Token::Special(Special::Separator(Separator::Space)),
4034 },
4035 PositionalToken {
4036 source: uws,
4037 offset: 940,
4038 length: 4,
4039 token: Token::Word(Word::Word("МЫ".to_string())),
4040 },
4041 PositionalToken {
4042 source: uws,
4043 offset: 944,
4044 length: 1,
4045 token: Token::Special(Special::Separator(Separator::Space)),
4046 },
4047 PositionalToken {
4048 source: uws,
4049 offset: 945,
4050 length: 6,
4051 token: Token::Word(Word::Word("ЕГО".to_string())),
4052 },
4053 PositionalToken {
4054 source: uws,
4055 offset: 951,
4056 length: 1,
4057 token: Token::Special(Special::Separator(Separator::Space)),
4058 },
4059 PositionalToken {
4060 source: uws,
4061 offset: 952,
4062 length: 18,
4063 token: Token::Word(Word::Word("ПРАЗДНУЕМ".to_string())),
4064 },
4065 PositionalToken {
4066 source: uws,
4067 offset: 1063,
4068 length: 2,
4069 token: Token::Word(Word::Word("В".to_string())),
4070 },
4071 PositionalToken {
4072 source: uws,
4073 offset: 1065,
4074 length: 1,
4075 token: Token::Special(Special::Separator(Separator::Space)),
4076 },
4077 PositionalToken {
4078 source: uws,
4079 offset: 1066,
4080 length: 4,
4081 token: Token::Word(Word::Number(Number::Integer(1987))),
4082 },
4083 PositionalToken {
4084 source: uws,
4085 offset: 1070,
4086 length: 1,
4087 token: Token::Special(Special::Separator(Separator::Space)),
4088 },
4089 PositionalToken {
4090 source: uws,
4091 offset: 1071,
4092 length: 8,
4093 token: Token::Word(Word::Word("году".to_string())),
4094 },
4095 PositionalToken {
4096 source: uws,
4097 offset: 1079,
4098 length: 1,
4099 token: Token::Special(Special::Separator(Separator::Space)),
4100 },
4101 PositionalToken {
4102 source: uws,
4103 offset: 1080,
4104 length: 14,
4105 token: Token::Word(Word::Word("комитет".to_string())),
4106 },
4107 PositionalToken {
4108 source: uws,
4109 offset: 1094,
4110 length: 1,
4111 token: Token::Special(Special::Separator(Separator::Space)),
4112 },
4113 PositionalToken {
4114 source: uws,
4115 offset: 1095,
4116 length: 14,
4117 token: Token::Word(Word::Word("госдумы".to_string())),
4118 },
4119 PositionalToken {
4120 source: uws,
4121 offset: 1109,
4122 length: 1,
4123 token: Token::Special(Special::Separator(Separator::Space)),
4124 },
4125 PositionalToken {
4126 source: uws,
4127 offset: 1110,
4128 length: 4,
4129 token: Token::Word(Word::Word("по".to_string())),
4130 },
4131 PositionalToken {
4132 source: uws,
4133 offset: 1114,
4134 length: 1,
4135 token: Token::Special(Special::Separator(Separator::Space)),
4136 },
4137 PositionalToken {
4138 source: uws,
4139 offset: 1115,
4140 length: 10,
4141 token: Token::Word(Word::Word("делам".to_string())),
4142 },
4143 PositionalToken {
4144 source: uws,
4145 offset: 1125,
4146 length: 1,
4147 token: Token::Special(Special::Separator(Separator::Space)),
4148 },
4149 PositionalToken {
4150 source: uws,
4151 offset: 1126,
4152 length: 12,
4153 token: Token::Word(Word::Word("женщин".to_string())),
4154 },
4155 PositionalToken {
4156 source: uws,
4157 offset: 1138,
4158 length: 1,
4159 token: Token::Special(Special::Punctuation(',')),
4160 },
4161 PositionalToken {
4162 source: uws,
4163 offset: 1139,
4164 length: 1,
4165 token: Token::Special(Special::Separator(Separator::Space)),
4166 },
4167 PositionalToken {
4168 source: uws,
4169 offset: 1140,
4170 length: 10,
4171 token: Token::Word(Word::Word("семьи".to_string())),
4172 },
4173 PositionalToken {
4174 source: uws,
4175 offset: 1150,
4176 length: 1,
4177 token: Token::Special(Special::Separator(Separator::Space)),
4178 },
4179 PositionalToken {
4180 source: uws,
4181 offset: 1151,
4182 length: 2,
4183 token: Token::Word(Word::Word("и".to_string())),
4184 },
4185 PositionalToken {
4186 source: uws,
4187 offset: 1153,
4188 length: 1,
4189 token: Token::Special(Special::Separator(Separator::Space)),
4190 },
4191 PositionalToken {
4192 source: uws,
4193 offset: 1154,
4194 length: 16,
4195 token: Token::Word(Word::Word("молодежи".to_string())),
4196 },
4197 PositionalToken {
4198 source: uws,
4199 offset: 1170,
4200 length: 1,
4201 token: Token::Special(Special::Separator(Separator::Space)),
4202 },
4203 PositionalToken {
4204 source: uws,
4205 offset: 1171,
4206 length: 16,
4207 token: Token::Word(Word::Word("выступил".to_string())),
4208 },
4209 PositionalToken {
4210 source: uws,
4211 offset: 1187,
4212 length: 1,
4213 token: Token::Special(Special::Separator(Separator::Space)),
4214 },
4215 PositionalToken {
4216 source: uws,
4217 offset: 1188,
4218 length: 2,
4219 token: Token::Word(Word::Word("с".to_string())),
4220 },
4221 PositionalToken {
4222 source: uws,
4223 offset: 1190,
4224 length: 1,
4225 token: Token::Special(Special::Separator(Separator::Space)),
4226 },
4227 PositionalToken {
4228 source: uws,
4229 offset: 1191,
4230 length: 24,
4231 token: Token::Word(Word::Word("предложением".to_string())),
4232 },
4233 PositionalToken {
4234 source: uws,
4235 offset: 1215,
4236 length: 1,
4237 token: Token::Special(Special::Separator(Separator::Space)),
4238 },
4239 PositionalToken {
4240 source: uws,
4241 offset: 1216,
4242 length: 16,
4243 token: Token::Word(Word::Word("учредить".to_string())),
4244 },
4245 PositionalToken {
4246 source: uws,
4247 offset: 1232,
4248 length: 1,
4249 token: Token::Special(Special::Separator(Separator::Space)),
4250 },
4251 PositionalToken {
4252 source: uws,
4253 offset: 1233,
4254 length: 2,
4255 token: Token::Special(Special::Punctuation('«')),
4256 },
4257 PositionalToken {
4258 source: uws,
4259 offset: 1235,
4260 length: 8,
4261 token: Token::Word(Word::Word("День".to_string())),
4262 },
4263 PositionalToken {
4264 source: uws,
4265 offset: 1243,
4266 length: 1,
4267 token: Token::Special(Special::Separator(Separator::Space)),
4268 },
4269 PositionalToken {
4270 source: uws,
4271 offset: 1244,
4272 length: 8,
4273 token: Token::Word(Word::Word("мамы".to_string())),
4274 },
4275 PositionalToken {
4276 source: uws,
4277 offset: 1252,
4278 length: 2,
4279 token: Token::Special(Special::Punctuation('»')),
4280 },
4281 PositionalToken {
4282 source: uws,
4283 offset: 1254,
4284 length: 1,
4285 token: Token::Special(Special::Punctuation(',')),
4286 },
4287 PositionalToken {
4288 source: uws,
4289 offset: 1255,
4290 length: 1,
4291 token: Token::Special(Special::Separator(Separator::Space)),
4292 },
4293 PositionalToken {
4294 source: uws,
4295 offset: 1256,
4296 length: 2,
4297 token: Token::Word(Word::Word("а".to_string())),
4298 },
4299 PositionalToken {
4300 source: uws,
4301 offset: 1258,
4302 length: 1,
4303 token: Token::Special(Special::Separator(Separator::Space)),
4304 },
4305 PositionalToken {
4306 source: uws,
4307 offset: 1259,
4308 length: 6,
4309 token: Token::Word(Word::Word("сам".to_string())),
4310 },
4311 PositionalToken {
4312 source: uws,
4313 offset: 1265,
4314 length: 1,
4315 token: Token::Special(Special::Separator(Separator::Space)),
4316 },
4317 PositionalToken {
4318 source: uws,
4319 offset: 1266,
4320 length: 12,
4321 token: Token::Word(Word::Word("приказ".to_string())),
4322 },
4323 PositionalToken {
4324 source: uws,
4325 offset: 1278,
4326 length: 1,
4327 token: Token::Special(Special::Separator(Separator::Space)),
4328 },
4329 PositionalToken {
4330 source: uws,
4331 offset: 1279,
4332 length: 6,
4333 token: Token::Word(Word::Word("был".to_string())),
4334 },
4335 PositionalToken {
4336 source: uws,
4337 offset: 1285,
4338 length: 1,
4339 token: Token::Special(Special::Separator(Separator::Space)),
4340 },
4341 PositionalToken {
4342 source: uws,
4343 offset: 1286,
4344 length: 16,
4345 token: Token::Word(Word::Word("подписан".to_string())),
4346 },
4347 PositionalToken {
4348 source: uws,
4349 offset: 1302,
4350 length: 1,
4351 token: Token::Special(Special::Separator(Separator::Space)),
4352 },
4353 PositionalToken {
4354 source: uws,
4355 offset: 1303,
4356 length: 6,
4357 token: Token::Word(Word::Word("уже".to_string())),
4358 },
4359 PositionalToken {
4360 source: uws,
4361 offset: 1309,
4362 length: 1,
4363 token: Token::Special(Special::Separator(Separator::Space)),
4364 },
4365 PositionalToken {
4366 source: uws,
4367 offset: 1310,
4368 length: 2,
4369 token: Token::Word(Word::Number(Number::Integer(30))),
4370 },
4371 PositionalToken {
4372 source: uws,
4373 offset: 1312,
4374 length: 1,
4375 token: Token::Special(Special::Separator(Separator::Space)),
4376 },
4377 PositionalToken {
4378 source: uws,
4379 offset: 1313,
4380 length: 12,
4381 token: Token::Word(Word::Word("января".to_string())),
4382 },
4383 PositionalToken {
4384 source: uws,
4385 offset: 1325,
4386 length: 1,
4387 token: Token::Special(Special::Separator(Separator::Space)),
4388 },
4389 PositionalToken {
4390 source: uws,
4391 offset: 1326,
4392 length: 4,
4393 token: Token::Word(Word::Number(Number::Integer(1988))),
4394 },
4395 PositionalToken {
4396 source: uws,
4397 offset: 1330,
4398 length: 1,
4399 token: Token::Special(Special::Separator(Separator::Space)),
4400 },
4401 PositionalToken {
4402 source: uws,
4403 offset: 1331,
4404 length: 8,
4405 token: Token::Word(Word::Word("года".to_string())),
4406 },
4407 PositionalToken {
4408 source: uws,
4409 offset: 1339,
4410 length: 1,
4411 token: Token::Special(Special::Separator(Separator::Space)),
4412 },
4413 PositionalToken {
4414 source: uws,
4415 offset: 1340,
4416 length: 14,
4417 token: Token::Word(Word::Word("Борисом".to_string())),
4418 },
4419 PositionalToken {
4420 source: uws,
4421 offset: 1354,
4422 length: 1,
4423 token: Token::Special(Special::Separator(Separator::Space)),
4424 },
4425 PositionalToken {
4426 source: uws,
4427 offset: 1355,
4428 length: 16,
4429 token: Token::Word(Word::Word("Ельциным".to_string())),
4430 },
4431 PositionalToken {
4432 source: uws,
4433 offset: 1371,
4434 length: 1,
4435 token: Token::Special(Special::Punctuation('.')),
4436 },
4437 PositionalToken {
4438 source: uws,
4439 offset: 1372,
4440 length: 1,
4441 token: Token::Special(Special::Separator(Separator::Space)),
4442 },
4443 PositionalToken {
4444 source: uws,
4445 offset: 1373,
4446 length: 8,
4447 token: Token::Word(Word::Word("Было".to_string())),
4448 },
4449 PositionalToken {
4450 source: uws,
4451 offset: 1381,
4452 length: 1,
4453 token: Token::Special(Special::Separator(Separator::Space)),
4454 },
4455 PositionalToken {
4456 source: uws,
4457 offset: 1382,
4458 length: 12,
4459 token: Token::Word(Word::Word("решено".to_string())),
4460 },
4461 PositionalToken {
4462 source: uws,
4463 offset: 1394,
4464 length: 1,
4465 token: Token::Special(Special::Punctuation(',')),
4466 },
4467 PositionalToken {
4468 source: uws,
4469 offset: 1395,
4470 length: 1,
4471 token: Token::Special(Special::Separator(Separator::Space)),
4472 },
4473 PositionalToken {
4474 source: uws,
4475 offset: 1396,
4476 length: 6,
4477 token: Token::Word(Word::Word("что".to_string())),
4478 },
4479 PositionalToken {
4480 source: uws,
4481 offset: 1402,
4482 length: 1,
4483 token: Token::Special(Special::Separator(Separator::Space)),
4484 },
4485 PositionalToken {
4486 source: uws,
4487 offset: 1403,
4488 length: 16,
4489 token: Token::Word(Word::Word("ежегодно".to_string())),
4490 },
4491 PositionalToken {
4492 source: uws,
4493 offset: 1419,
4494 length: 1,
4495 token: Token::Special(Special::Separator(Separator::Space)),
4496 },
4497 PositionalToken {
4498 source: uws,
4499 offset: 1420,
4500 length: 2,
4501 token: Token::Word(Word::Word("в".to_string())),
4502 },
4503 PositionalToken {
4504 source: uws,
4505 offset: 1422,
4506 length: 1,
4507 token: Token::Special(Special::Separator(Separator::Space)),
4508 },
4509 PositionalToken {
4510 source: uws,
4511 offset: 1423,
4512 length: 12,
4513 token: Token::Word(Word::Word("России".to_string())),
4514 },
4515 PositionalToken {
4516 source: uws,
4517 offset: 1435,
4518 length: 1,
4519 token: Token::Special(Special::Separator(Separator::Space)),
4520 },
4521 PositionalToken {
4522 source: uws,
4523 offset: 1436,
4524 length: 22,
4525 token: Token::Word(Word::Word("празднество".to_string())),
4526 },
4527 PositionalToken {
4528 source: uws,
4529 offset: 1458,
4530 length: 1,
4531 token: Token::Special(Special::Separator(Separator::Space)),
4532 },
4533 PositionalToken {
4534 source: uws,
4535 offset: 1459,
4536 length: 6,
4537 token: Token::Word(Word::Word("дня".to_string())),
4538 },
4539 PositionalToken {
4540 source: uws,
4541 offset: 1465,
4542 length: 1,
4543 token: Token::Special(Special::Separator(Separator::Space)),
4544 },
4545 PositionalToken {
4546 source: uws,
4547 offset: 1466,
4548 length: 8,
4549 token: Token::Word(Word::Word("мамы".to_string())),
4550 },
4551 PositionalToken {
4552 source: uws,
4553 offset: 1474,
4554 length: 1,
4555 token: Token::Special(Special::Separator(Separator::Space)),
4556 },
4557 PositionalToken {
4558 source: uws,
4559 offset: 1475,
4560 length: 10,
4561 token: Token::Word(Word::Word("будет".to_string())),
4562 },
4563 PositionalToken {
4564 source: uws,
4565 offset: 1485,
4566 length: 1,
4567 token: Token::Special(Special::Separator(Separator::Space)),
4568 },
4569 PositionalToken {
4570 source: uws,
4571 offset: 1486,
4572 length: 16,
4573 token: Token::Word(Word::Word("выпадать".to_string())),
4574 },
4575 PositionalToken {
4576 source: uws,
4577 offset: 1502,
4578 length: 1,
4579 token: Token::Special(Special::Separator(Separator::Space)),
4580 },
4581 PositionalToken {
4582 source: uws,
4583 offset: 1503,
4584 length: 4,
4585 token: Token::Word(Word::Word("на".to_string())),
4586 },
4587 PositionalToken {
4588 source: uws,
4589 offset: 1507,
4590 length: 1,
4591 token: Token::Special(Special::Separator(Separator::Space)),
4592 },
4593 PositionalToken {
4594 source: uws,
4595 offset: 1508,
4596 length: 18,
4597 token: Token::Word(Word::Word("последнее".to_string())),
4598 },
4599 PositionalToken {
4600 source: uws,
4601 offset: 1526,
4602 length: 1,
4603 token: Token::Special(Special::Separator(Separator::Space)),
4604 },
4605 PositionalToken {
4606 source: uws,
4607 offset: 1527,
4608 length: 22,
4609 token: Token::Word(Word::Word("воскресенье".to_string())),
4610 },
4611 PositionalToken {
4612 source: uws,
4613 offset: 1549,
4614 length: 1,
4615 token: Token::Special(Special::Separator(Separator::Space)),
4616 },
4617 PositionalToken {
4618 source: uws,
4619 offset: 1550,
4620 length: 12,
4621 token: Token::Word(Word::Word("ноября".to_string())),
4622 },
4623 PositionalToken {
4624 source: uws,
4625 offset: 1562,
4626 length: 1,
4627 token: Token::Special(Special::Punctuation('.')),
4628 },
4629 PositionalToken {
4630 source: uws,
4631 offset: 1563,
4632 length: 1,
4633 token: Token::Special(Special::Separator(Separator::Space)),
4634 },
4635 PositionalToken {
4636 source: uws,
4637 offset: 1664,
4638 length: 1,
4639 token: Token::Special(Special::Separator(Separator::Newline)),
4640 },
4641 PositionalToken {
4642 source: uws,
4643 offset: 1665,
4644 length: 2,
4645 token: Token::Special(Special::Separator(Separator::Space)),
4646 },
4647 PositionalToken {
4648 source: uws,
4649 offset: 1725,
4650 length: 1,
4651 token: Token::Special(Special::Separator(Separator::Newline)),
4652 },
4653 PositionalToken {
4654 source: uws,
4655 offset: 1726,
4656 length: 4,
4657 token: Token::Special(Special::Separator(Separator::Space)),
4658 },
4659 PositionalToken {
4660 source: uws,
4661 offset: 2725,
4662 length: 1,
4663 token: Token::Special(Special::Separator(Separator::Newline)),
4664 },
4665 PositionalToken {
4666 source: uws,
4667 offset: 2726,
4668 length: 2,
4669 token: Token::Special(Special::Separator(Separator::Space)),
4670 },
4671 PositionalToken {
4672 source: uws,
4673 offset: 2888,
4674 length: 1,
4675 token: Token::Special(Special::Separator(Separator::Newline)),
4676 },
4677 PositionalToken {
4678 source: uws,
4679 offset: 2889,
4680 length: 2,
4681 token: Token::Special(Special::Separator(Separator::Space)),
4682 },
4683 PositionalToken {
4684 source: uws,
4685 offset: 2891,
4686 length: 1,
4687 token: Token::Special(Special::Separator(Separator::Newline)),
4688 },
4689 PositionalToken {
4690 source: uws,
4691 offset: 2904,
4692 length: 1,
4693 token: Token::Special(Special::Separator(Separator::Newline)),
4694 },
4695 PositionalToken {
4696 source: uws,
4697 offset: 2905,
4698 length: 4,
4699 token: Token::Special(Special::Separator(Separator::Space)),
4700 },
4701 ];
4702
4703 let text = Text::new({
4704 uws.into_source()
4705 .pipe(tagger::Builder::new().create().into_breaker())
4706 .pipe(entities::Builder::new().create().into_piped())
4707 .into_separator()
4708 })
4709 .unwrap();
4710
4711 let lib_res = text
4712 .into_tokenizer(TokenizerParams::v1())
4713 .filter_map(|tt| tt.into_original_token_1())
4714 .collect::<Vec<_>>();
4715
4716 check_results(&result, &lib_res, uws);
4717 }
4718
4719 #[test]
4770 fn numerical_no_split() {
4771 let uws = "12.02.18 31.28.34 23.11.2018 123.568.365.234.578 127.0.0.1 1st 1кг 123123афываыв 12321фвафыов234выалфо 12_123_343.4234_4234";
4772 let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
4773 let result = vec![
4775 PositionalToken {
4776 source: uws,
4777 offset: 0,
4778 length: 8,
4779 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4780 "12.02.18".to_string(),
4781 ))),
4782 },
4783 PositionalToken {
4784 source: uws,
4785 offset: 8,
4786 length: 1,
4787 token: Token::Special(Special::Separator(Separator::Space)),
4788 },
4789 PositionalToken {
4790 source: uws,
4791 offset: 9,
4792 length: 8,
4793 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4794 "31.28.34".to_string(),
4795 ))),
4796 },
4797 PositionalToken {
4798 source: uws,
4799 offset: 17,
4800 length: 1,
4801 token: Token::Special(Special::Separator(Separator::Space)),
4802 },
4803 PositionalToken {
4804 source: uws,
4805 offset: 18,
4806 length: 10,
4807 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4808 "23.11.2018".to_string(),
4809 ))),
4810 },
4811 PositionalToken {
4812 source: uws,
4813 offset: 28,
4814 length: 1,
4815 token: Token::Special(Special::Separator(Separator::Space)),
4816 },
4817 PositionalToken {
4818 source: uws,
4819 offset: 29,
4820 length: 19,
4821 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4822 "123.568.365.234.578".to_string(),
4823 ))),
4824 },
4825 PositionalToken {
4826 source: uws,
4827 offset: 48,
4828 length: 1,
4829 token: Token::Special(Special::Separator(Separator::Space)),
4830 },
4831 PositionalToken {
4832 source: uws,
4833 offset: 49,
4834 length: 9,
4835 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4836 "127.0.0.1".to_string(),
4837 ))),
4838 },
4839 PositionalToken {
4840 source: uws,
4841 offset: 58,
4842 length: 1,
4843 token: Token::Special(Special::Separator(Separator::Space)),
4844 },
4845 PositionalToken {
4846 source: uws,
4847 offset: 59,
4848 length: 3,
4849 token: Token::Word(Word::Numerical(Numerical::Measures("1st".to_string()))),
4850 },
4851 PositionalToken {
4852 source: uws,
4853 offset: 62,
4854 length: 1,
4855 token: Token::Special(Special::Separator(Separator::Space)),
4856 },
4857 PositionalToken {
4858 source: uws,
4859 offset: 63,
4860 length: 5,
4861 token: Token::Word(Word::Numerical(Numerical::Measures("1кг".to_string()))),
4862 },
4863 PositionalToken {
4864 source: uws,
4865 offset: 68,
4866 length: 1,
4867 token: Token::Special(Special::Separator(Separator::Space)),
4868 },
4869 PositionalToken {
4870 source: uws,
4871 offset: 69,
4872 length: 20,
4873 token: Token::Word(Word::Numerical(Numerical::Measures(
4874 "123123афываыв".to_string(),
4875 ))),
4876 },
4877 PositionalToken {
4878 source: uws,
4879 offset: 89,
4880 length: 1,
4881 token: Token::Special(Special::Separator(Separator::Space)),
4882 },
4883 PositionalToken {
4884 source: uws,
4885 offset: 90,
4886 length: 34,
4887 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
4888 "12321фвафыов234выалфо".to_string(),
4889 ))),
4890 },
4891 PositionalToken {
4892 source: uws,
4893 offset: 124,
4894 length: 1,
4895 token: Token::Special(Special::Separator(Separator::Space)),
4896 },
4897 PositionalToken {
4898 source: uws,
4899 offset: 125,
4900 length: 20,
4901 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
4902 "12_123_343.4234_4234".to_string(),
4903 ))),
4904 },
4905 ];
4906 check_results(&result, &lib_res, uws);
4907 }
4908
4909 #[test]
4910 fn numerical_default() {
4911 let uws = "12.02.18 31.28.34 23.11.2018 123.568.365.234.578 127.0.0.1 1st 1кг 123123афываыв 12321фвафыов234выалфо 12_123_343.4234_4234";
4912 let lib_res = uws
4913 .into_tokenizer(TokenizerParams::v1())
4914 .collect::<Vec<_>>();
4915 let result = vec![
4917 PositionalToken {
4918 source: uws,
4919 offset: 0,
4920 length: 2,
4921 token: Token::Word(Word::Number(Number::Integer(12))),
4922 },
4923 PositionalToken {
4924 source: uws,
4925 offset: 2,
4926 length: 1,
4927 token: Token::Special(Special::Punctuation('.')),
4928 },
4929 PositionalToken {
4930 source: uws,
4931 offset: 3,
4932 length: 2,
4933 token: Token::Word(Word::Number(Number::Integer(2))),
4934 },
4935 PositionalToken {
4936 source: uws,
4937 offset: 5,
4938 length: 1,
4939 token: Token::Special(Special::Punctuation('.')),
4940 },
4941 PositionalToken {
4942 source: uws,
4943 offset: 6,
4944 length: 2,
4945 token: Token::Word(Word::Number(Number::Integer(18))),
4946 },
4947 PositionalToken {
4948 source: uws,
4949 offset: 8,
4950 length: 1,
4951 token: Token::Special(Special::Separator(Separator::Space)),
4952 },
4953 PositionalToken {
4954 source: uws,
4955 offset: 9,
4956 length: 2,
4957 token: Token::Word(Word::Number(Number::Integer(31))),
4958 },
4959 PositionalToken {
4960 source: uws,
4961 offset: 11,
4962 length: 1,
4963 token: Token::Special(Special::Punctuation('.')),
4964 },
4965 PositionalToken {
4966 source: uws,
4967 offset: 12,
4968 length: 2,
4969 token: Token::Word(Word::Number(Number::Integer(28))),
4970 },
4971 PositionalToken {
4972 source: uws,
4973 offset: 14,
4974 length: 1,
4975 token: Token::Special(Special::Punctuation('.')),
4976 },
4977 PositionalToken {
4978 source: uws,
4979 offset: 15,
4980 length: 2,
4981 token: Token::Word(Word::Number(Number::Integer(34))),
4982 },
4983 PositionalToken {
4984 source: uws,
4985 offset: 17,
4986 length: 1,
4987 token: Token::Special(Special::Separator(Separator::Space)),
4988 },
4989 PositionalToken {
4990 source: uws,
4991 offset: 18,
4992 length: 2,
4993 token: Token::Word(Word::Number(Number::Integer(23))),
4994 },
4995 PositionalToken {
4996 source: uws,
4997 offset: 20,
4998 length: 1,
4999 token: Token::Special(Special::Punctuation('.')),
5000 },
5001 PositionalToken {
5002 source: uws,
5003 offset: 21,
5004 length: 2,
5005 token: Token::Word(Word::Number(Number::Integer(11))),
5006 },
5007 PositionalToken {
5008 source: uws,
5009 offset: 23,
5010 length: 1,
5011 token: Token::Special(Special::Punctuation('.')),
5012 },
5013 PositionalToken {
5014 source: uws,
5015 offset: 24,
5016 length: 4,
5017 token: Token::Word(Word::Number(Number::Integer(2018))),
5018 },
5019 PositionalToken {
5020 source: uws,
5021 offset: 28,
5022 length: 1,
5023 token: Token::Special(Special::Separator(Separator::Space)),
5024 },
5025 PositionalToken {
5026 source: uws,
5027 offset: 29,
5028 length: 3,
5029 token: Token::Word(Word::Number(Number::Integer(123))),
5030 },
5031 PositionalToken {
5032 source: uws,
5033 offset: 32,
5034 length: 1,
5035 token: Token::Special(Special::Punctuation('.')),
5036 },
5037 PositionalToken {
5038 source: uws,
5039 offset: 33,
5040 length: 3,
5041 token: Token::Word(Word::Number(Number::Integer(568))),
5042 },
5043 PositionalToken {
5044 source: uws,
5045 offset: 36,
5046 length: 1,
5047 token: Token::Special(Special::Punctuation('.')),
5048 },
5049 PositionalToken {
5050 source: uws,
5051 offset: 37,
5052 length: 3,
5053 token: Token::Word(Word::Number(Number::Integer(365))),
5054 },
5055 PositionalToken {
5056 source: uws,
5057 offset: 40,
5058 length: 1,
5059 token: Token::Special(Special::Punctuation('.')),
5060 },
5061 PositionalToken {
5062 source: uws,
5063 offset: 41,
5064 length: 3,
5065 token: Token::Word(Word::Number(Number::Integer(234))),
5066 },
5067 PositionalToken {
5068 source: uws,
5069 offset: 44,
5070 length: 1,
5071 token: Token::Special(Special::Punctuation('.')),
5072 },
5073 PositionalToken {
5074 source: uws,
5075 offset: 45,
5076 length: 3,
5077 token: Token::Word(Word::Number(Number::Integer(578))),
5078 },
5079 PositionalToken {
5080 source: uws,
5081 offset: 48,
5082 length: 1,
5083 token: Token::Special(Special::Separator(Separator::Space)),
5084 },
5085 PositionalToken {
5086 source: uws,
5087 offset: 49,
5088 length: 3,
5089 token: Token::Word(Word::Number(Number::Integer(127))),
5090 },
5091 PositionalToken {
5092 source: uws,
5093 offset: 52,
5094 length: 1,
5095 token: Token::Special(Special::Punctuation('.')),
5096 },
5097 PositionalToken {
5098 source: uws,
5099 offset: 53,
5100 length: 1,
5101 token: Token::Word(Word::Number(Number::Integer(0))),
5102 },
5103 PositionalToken {
5104 source: uws,
5105 offset: 54,
5106 length: 1,
5107 token: Token::Special(Special::Punctuation('.')),
5108 },
5109 PositionalToken {
5110 source: uws,
5111 offset: 55,
5112 length: 1,
5113 token: Token::Word(Word::Number(Number::Integer(0))),
5114 },
5115 PositionalToken {
5116 source: uws,
5117 offset: 56,
5118 length: 1,
5119 token: Token::Special(Special::Punctuation('.')),
5120 },
5121 PositionalToken {
5122 source: uws,
5123 offset: 57,
5124 length: 1,
5125 token: Token::Word(Word::Number(Number::Integer(1))),
5126 },
5127 PositionalToken {
5128 source: uws,
5129 offset: 58,
5130 length: 1,
5131 token: Token::Special(Special::Separator(Separator::Space)),
5132 },
5133 PositionalToken {
5134 source: uws,
5135 offset: 59,
5136 length: 3,
5137 token: Token::Word(Word::Numerical(Numerical::Measures("1st".to_string()))),
5138 },
5139 PositionalToken {
5140 source: uws,
5141 offset: 62,
5142 length: 1,
5143 token: Token::Special(Special::Separator(Separator::Space)),
5144 },
5145 PositionalToken {
5146 source: uws,
5147 offset: 63,
5148 length: 5,
5149 token: Token::Word(Word::Numerical(Numerical::Measures("1кг".to_string()))),
5150 },
5151 PositionalToken {
5152 source: uws,
5153 offset: 68,
5154 length: 1,
5155 token: Token::Special(Special::Separator(Separator::Space)),
5156 },
5157 PositionalToken {
5158 source: uws,
5159 offset: 69,
5160 length: 20,
5161 token: Token::Word(Word::Numerical(Numerical::Measures(
5162 "123123афываыв".to_string(),
5163 ))),
5164 },
5165 PositionalToken {
5166 source: uws,
5167 offset: 89,
5168 length: 1,
5169 token: Token::Special(Special::Separator(Separator::Space)),
5170 },
5171 PositionalToken {
5172 source: uws,
5173 offset: 90,
5174 length: 34,
5175 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
5176 "12321фвафыов234выалфо".to_string(),
5177 ))),
5178 },
5179 PositionalToken {
5180 source: uws,
5181 offset: 124,
5182 length: 1,
5183 token: Token::Special(Special::Separator(Separator::Space)),
5184 },
5185 PositionalToken {
5186 source: uws,
5187 offset: 125,
5188 length: 2,
5189 token: Token::Word(Word::Number(Number::Integer(12))),
5190 },
5191 PositionalToken {
5192 source: uws,
5193 offset: 127,
5194 length: 1,
5195 token: Token::Special(Special::Punctuation('_')),
5196 },
5197 PositionalToken {
5198 source: uws,
5199 offset: 128,
5200 length: 3,
5201 token: Token::Word(Word::Number(Number::Integer(123))),
5202 },
5203 PositionalToken {
5204 source: uws,
5205 offset: 131,
5206 length: 1,
5207 token: Token::Special(Special::Punctuation('_')),
5208 },
5209 PositionalToken {
5210 source: uws,
5211 offset: 132,
5212 length: 3,
5213 token: Token::Word(Word::Number(Number::Integer(343))),
5214 },
5215 PositionalToken {
5216 source: uws,
5217 offset: 135,
5218 length: 1,
5219 token: Token::Special(Special::Punctuation('.')),
5220 },
5221 PositionalToken {
5222 source: uws,
5223 offset: 136,
5224 length: 4,
5225 token: Token::Word(Word::Number(Number::Integer(4234))),
5226 },
5227 PositionalToken {
5228 source: uws,
5229 offset: 140,
5230 length: 1,
5231 token: Token::Special(Special::Punctuation('_')),
5232 },
5233 PositionalToken {
5234 source: uws,
5235 offset: 141,
5236 length: 4,
5237 token: Token::Word(Word::Number(Number::Integer(4234))),
5238 },
5239 ];
5240 check_results(&result, &lib_res, uws);
5241 }
5242
    // ISO 639-3 language codes selecting the fixture used by the
    // per-language tokenizer tests below (see `get_lang_test`).
    enum Lang {
        Zho, // Chinese
        Jpn, // Japanese
        Kor, // Korean
        Ara, // Arabic script (fixture text — TODO confirm exact language of sample)
        Ell, // Greek
    }
5262
5263 #[test]
5264 fn test_lang_zho() {
5265 let (uws, result) = get_lang_test(Lang::Zho);
5266 let lib_res = uws
5267 .into_tokenizer(TokenizerParams::v1())
5268 .collect::<Vec<_>>();
5269 check_results(&result, &lib_res, &uws);
5270 }
5271
5272 #[test]
5273 fn test_lang_jpn() {
5274 let (uws, result) = get_lang_test(Lang::Jpn);
5275 let lib_res = uws
5276 .into_tokenizer(TokenizerParams::v1())
5277 .collect::<Vec<_>>();
5278 check_results(&result, &lib_res, &uws);
5279 }
5280
5281 #[test]
5282 fn test_lang_kor() {
5283 let (uws, result) = get_lang_test(Lang::Kor);
5284 let lib_res = uws
5285 .into_tokenizer(TokenizerParams::v1())
5286 .collect::<Vec<_>>();
5287 check_results(&result, &lib_res, &uws);
5288 }
5289
5290 #[test]
5291 fn test_lang_ara() {
5292 let (uws, result) = get_lang_test(Lang::Ara);
5293 let lib_res = uws
5294 .into_tokenizer(TokenizerParams::v1())
5295 .collect::<Vec<_>>();
5296 check_results(&result, &lib_res, &uws);
5297 }
5298
5299 #[test]
5300 fn test_lang_ell() {
5301 let (uws, result) = get_lang_test(Lang::Ell);
5302 let lib_res = uws
5303 .into_tokenizer(TokenizerParams::v1())
5304 .collect::<Vec<_>>();
5305 check_results(&result, &lib_res, &uws);
5306 }
5307
5308 fn get_lang_test(lng: Lang) -> (String, Vec<PositionalToken>) {
5309 let uws = match lng {
5310 Lang::Zho => "美国电视连续剧《超人前传》的第一集《试播集》于2001年10月16日在電視網首播,剧集主创人阿尔弗雷德·高夫和迈尔斯·米勒編劇,大卫·努特尔执导。这一试播首次向观众引荐了克拉克·肯特一角,他是位拥有超能力的外星孤儿,与家人和朋友一起在堪薩斯州虚构小镇斯莫维尔生活。在这一集里,肯特首度得知自己的来历,同时还需要阻止一位学生试图杀死镇上高中多名学生的报复之举。本集节目里引入了多个之后将贯穿全季甚至整部剧集的主题元素,例如几位主要角色之间的三角恋情。电视剧在加拿大溫哥華取景,旨在选用其“美国中产阶级”景观,主创人花了5个月的时间专门用于为主角物色合适的演员。试播集在所有演员选好4天后正式开拍。由于时间上的限制,剧组无法搭建好实体外景,因此只能使用计算机绘图技术将数字化的外景插入到镜头中。节目一经上映就打破了电视网的多项收视纪录,并且获得了评论员的普遍好评和多个奖项提名,并在其中两项上胜出",
5311 Lang::Kor => "플레이스테이션 은 소니 컴퓨터 엔터테인먼트가 개발한 세 번째 가정용 게임기이다. 마이크로소프트의 엑스박스 360, 닌텐도의 Wii와 경쟁하고 있다. 이전 제품에서 온라인 플레이 기능을 비디오 게임 개발사에 전적으로 의존하던 것과 달리 통합 온라인 게임 서비스인 플레이스테이션 네트워크 서비스를 발매와 함께 시작해 제공하고 있으며, 탄탄한 멀티미디어 재생 기능, 플레이스테이션 포터블과의 연결, 고화질 광학 디스크 포맷인 블루레이 디스크 재생 기능 등의 기능을 갖추고 있다. 2006년 11월 11일에 일본에서 처음으로 출시했으며, 11월 17일에는 북미 지역, 2007년 3월 23일에는 유럽과 오세아니아 지역에서, 대한민국의 경우 6월 5일부터 일주일간 예약판매를 실시해, 매일 준비한 수량이 동이 나는 등 많은 관심을 받았으며 6월 16일에 정식 출시 행사를 열었다",
5312 Lang::Jpn => "熊野三山本願所は、15世紀末以降における熊野三山(熊野本宮、熊野新宮、熊野那智)の造営・修造のための勧進を担った組織の総称。 熊野三山を含めて、日本における古代から中世前半にかけての寺社の造営は、寺社領経営のような恒常的財源、幕府や朝廷などからの一時的な造営料所の寄進、あるいは公権力からの臨時の保護によって行われていた。しかしながら、熊野三山では、これらの財源はすべて15世紀半ばまでに実効性を失った",
5313 Lang::Ara => "لشکرکشیهای روسهای وارنگی به دریای خزر مجموعهای از حملات نظامی در بین سالهای ۸۶۴ تا ۱۰۴۱ میلادی به سواحل دریای خزر بودهاست. روسهای وارنگی ابتدا در قرن نهم میلادی به عنوان بازرگانان پوست، عسل و برده در سرزمینهای اسلامی(سرکلند) ظاهر شدند. این بازرگانان در مسیر تجاری ولگا به خرید و فروش میپرداختند. نخستین حملهٔ آنان در فاصله سالهای ۸۶۴ تا ۸۸۴ میلادی در مقیاسی کوچک علیه علویان طبرستان رخ داد. نخستین یورش بزرگ روسها در سال ۹۱۳ رخ داد و آنان با ۵۰۰ فروند درازکشتی شهر گرگان و اطراف آن را غارت کردند. آنها در این حمله مقداری کالا و برده را به تاراج بردند و در راه بازگشتن به سمت شمال، در دلتای ولگا، مورد حملهٔ خزرهای مسلمان قرار گرفتند و بعضی از آنان موفق به فرار شدند، ولی در میانهٔ ولگا به قتل رسیدند. دومین هجوم بزرگ روسها به دریای خزر در سال ۹۴۳ به وقوع پیوست. در این دوره ایگور یکم، حاکم روس کیف، رهبری روسها را در دست داشت. روسها پس از توافق با دولت خزرها برای عبور امن از منطقه، تا رود کورا و اعماق قفقاز پیش رفتند و در سال ۹۴۳ موفق شدند بندر بردعه، پایتخت اران (جمهوری آذربایجان کنونی)، را تصرف کنند. روسها در آنجا به مدت چند ماه ماندند و بسیاری از ساکنان شهر را کشتند و از راه غارتگری اموالی را به تاراج بردند. تنها دلیل بازگشت آنان ",
5314 Lang::Ell => "Το Πρόγραμμα υλοποιείται εξ ολοκλήρου από απόσταση και μπορεί να συμμετέχει κάθε εμπλεκόμενος στη ή/και ενδιαφερόμενος για τη διδασκαλία της Ελληνικής ως δεύτερης/ξένης γλώσσας στην Ελλάδα και στο εξωτερικό, αρκεί να είναι απόφοιτος ελληνικής φιλολογίας, ξένων φιλολογιών, παιδαγωγικών τμημάτων, θεολογικών σχολών ή άλλων πανεπιστημιακών τμημάτων ελληνικών ή ισότιμων ξένων πανεπιστημίων. Υπό όρους γίνονται δεκτοί υποψήφιοι που δεν έχουν ολοκληρώσει σπουδές τριτοβάθμιας εκπαίδευσης.",
5315 };
5316 let tokens = match lng {
5317 Lang::Zho => vec![
5318 PositionalToken {
5319 source: uws,
5320 offset: 0,
5321 length: 3,
5322 token: Token::Word(Word::Word("美".to_string())),
5323 },
5324 PositionalToken {
5325 source: uws,
5326 offset: 3,
5327 length: 3,
5328 token: Token::Word(Word::Word("国".to_string())),
5329 },
5330 PositionalToken {
5331 source: uws,
5332 offset: 6,
5333 length: 3,
5334 token: Token::Word(Word::Word("电".to_string())),
5335 },
5336 PositionalToken {
5337 source: uws,
5338 offset: 9,
5339 length: 3,
5340 token: Token::Word(Word::Word("视".to_string())),
5341 },
5342 PositionalToken {
5343 source: uws,
5344 offset: 12,
5345 length: 3,
5346 token: Token::Word(Word::Word("连".to_string())),
5347 },
5348 PositionalToken {
5349 source: uws,
5350 offset: 15,
5351 length: 3,
5352 token: Token::Word(Word::Word("续".to_string())),
5353 },
5354 PositionalToken {
5355 source: uws,
5356 offset: 18,
5357 length: 3,
5358 token: Token::Word(Word::Word("剧".to_string())),
5359 },
5360 PositionalToken {
5361 source: uws,
5362 offset: 21,
5363 length: 3,
5364 token: Token::Special(Special::Punctuation('《')),
5365 },
5366 PositionalToken {
5367 source: uws,
5368 offset: 24,
5369 length: 3,
5370 token: Token::Word(Word::Word("超".to_string())),
5371 },
5372 PositionalToken {
5373 source: uws,
5374 offset: 27,
5375 length: 3,
5376 token: Token::Word(Word::Word("人".to_string())),
5377 },
5378 PositionalToken {
5379 source: uws,
5380 offset: 30,
5381 length: 3,
5382 token: Token::Word(Word::Word("前".to_string())),
5383 },
5384 PositionalToken {
5385 source: uws,
5386 offset: 33,
5387 length: 3,
5388 token: Token::Word(Word::Word("传".to_string())),
5389 },
5390 PositionalToken {
5391 source: uws,
5392 offset: 36,
5393 length: 3,
5394 token: Token::Special(Special::Punctuation('》')),
5395 },
5396 PositionalToken {
5397 source: uws,
5398 offset: 39,
5399 length: 3,
5400 token: Token::Word(Word::Word("的".to_string())),
5401 },
5402 PositionalToken {
5403 source: uws,
5404 offset: 42,
5405 length: 3,
5406 token: Token::Word(Word::Word("第".to_string())),
5407 },
5408 PositionalToken {
5409 source: uws,
5410 offset: 45,
5411 length: 3,
5412 token: Token::Word(Word::Word("一".to_string())),
5413 },
5414 PositionalToken {
5415 source: uws,
5416 offset: 48,
5417 length: 3,
5418 token: Token::Word(Word::Word("集".to_string())),
5419 },
5420 PositionalToken {
5421 source: uws,
5422 offset: 51,
5423 length: 3,
5424 token: Token::Special(Special::Punctuation('《')),
5425 },
5426 PositionalToken {
5427 source: uws,
5428 offset: 54,
5429 length: 3,
5430 token: Token::Word(Word::Word("试".to_string())),
5431 },
5432 PositionalToken {
5433 source: uws,
5434 offset: 57,
5435 length: 3,
5436 token: Token::Word(Word::Word("播".to_string())),
5437 },
5438 PositionalToken {
5439 source: uws,
5440 offset: 60,
5441 length: 3,
5442 token: Token::Word(Word::Word("集".to_string())),
5443 },
5444 PositionalToken {
5445 source: uws,
5446 offset: 63,
5447 length: 3,
5448 token: Token::Special(Special::Punctuation('》')),
5449 },
5450 PositionalToken {
5451 source: uws,
5452 offset: 66,
5453 length: 3,
5454 token: Token::Word(Word::Word("于".to_string())),
5455 },
5456 PositionalToken {
5457 source: uws,
5458 offset: 69,
5459 length: 4,
5460 token: Token::Word(Word::Number(Number::Integer(2001))),
5461 },
5462 PositionalToken {
5463 source: uws,
5464 offset: 73,
5465 length: 3,
5466 token: Token::Word(Word::Word("年".to_string())),
5467 },
5468 PositionalToken {
5469 source: uws,
5470 offset: 76,
5471 length: 2,
5472 token: Token::Word(Word::Number(Number::Integer(10))),
5473 },
5474 PositionalToken {
5475 source: uws,
5476 offset: 78,
5477 length: 3,
5478 token: Token::Word(Word::Word("月".to_string())),
5479 },
5480 PositionalToken {
5481 source: uws,
5482 offset: 81,
5483 length: 2,
5484 token: Token::Word(Word::Number(Number::Integer(16))),
5485 },
5486 PositionalToken {
5487 source: uws,
5488 offset: 83,
5489 length: 3,
5490 token: Token::Word(Word::Word("日".to_string())),
5491 },
5492 PositionalToken {
5493 source: uws,
5494 offset: 86,
5495 length: 3,
5496 token: Token::Word(Word::Word("在".to_string())),
5497 },
5498 PositionalToken {
5499 source: uws,
5500 offset: 89,
5501 length: 3,
5502 token: Token::Word(Word::Word("電".to_string())),
5503 },
5504 PositionalToken {
5505 source: uws,
5506 offset: 92,
5507 length: 3,
5508 token: Token::Word(Word::Word("視".to_string())),
5509 },
5510 PositionalToken {
5511 source: uws,
5512 offset: 95,
5513 length: 3,
5514 token: Token::Word(Word::Word("網".to_string())),
5515 },
5516 PositionalToken {
5517 source: uws,
5518 offset: 98,
5519 length: 3,
5520 token: Token::Word(Word::Word("首".to_string())),
5521 },
5522 PositionalToken {
5523 source: uws,
5524 offset: 101,
5525 length: 3,
5526 token: Token::Word(Word::Word("播".to_string())),
5527 },
5528 PositionalToken {
5529 source: uws,
5530 offset: 104,
5531 length: 3,
5532 token: Token::Special(Special::Punctuation(',')),
5533 },
5534 PositionalToken {
5535 source: uws,
5536 offset: 107,
5537 length: 3,
5538 token: Token::Word(Word::Word("剧".to_string())),
5539 },
5540 PositionalToken {
5541 source: uws,
5542 offset: 110,
5543 length: 3,
5544 token: Token::Word(Word::Word("集".to_string())),
5545 },
5546 PositionalToken {
5547 source: uws,
5548 offset: 113,
5549 length: 3,
5550 token: Token::Word(Word::Word("主".to_string())),
5551 },
5552 PositionalToken {
5553 source: uws,
5554 offset: 116,
5555 length: 3,
5556 token: Token::Word(Word::Word("创".to_string())),
5557 },
5558 PositionalToken {
5559 source: uws,
5560 offset: 119,
5561 length: 3,
5562 token: Token::Word(Word::Word("人".to_string())),
5563 },
5564 PositionalToken {
5565 source: uws,
5566 offset: 122,
5567 length: 3,
5568 token: Token::Word(Word::Word("阿".to_string())),
5569 },
5570 PositionalToken {
5571 source: uws,
5572 offset: 125,
5573 length: 3,
5574 token: Token::Word(Word::Word("尔".to_string())),
5575 },
5576 PositionalToken {
5577 source: uws,
5578 offset: 128,
5579 length: 3,
5580 token: Token::Word(Word::Word("弗".to_string())),
5581 },
5582 PositionalToken {
5583 source: uws,
5584 offset: 131,
5585 length: 3,
5586 token: Token::Word(Word::Word("雷".to_string())),
5587 },
5588 PositionalToken {
5589 source: uws,
5590 offset: 134,
5591 length: 3,
5592 token: Token::Word(Word::Word("德".to_string())),
5593 },
5594 PositionalToken {
5595 source: uws,
5596 offset: 137,
5597 length: 2,
5598 token: Token::Special(Special::Punctuation('·')),
5599 },
5600 PositionalToken {
5601 source: uws,
5602 offset: 139,
5603 length: 3,
5604 token: Token::Word(Word::Word("高".to_string())),
5605 },
5606 PositionalToken {
5607 source: uws,
5608 offset: 142,
5609 length: 3,
5610 token: Token::Word(Word::Word("夫".to_string())),
5611 },
5612 PositionalToken {
5613 source: uws,
5614 offset: 145,
5615 length: 3,
5616 token: Token::Word(Word::Word("和".to_string())),
5617 },
5618 PositionalToken {
5619 source: uws,
5620 offset: 148,
5621 length: 3,
5622 token: Token::Word(Word::Word("迈".to_string())),
5623 },
5624 PositionalToken {
5625 source: uws,
5626 offset: 151,
5627 length: 3,
5628 token: Token::Word(Word::Word("尔".to_string())),
5629 },
5630 PositionalToken {
5631 source: uws,
5632 offset: 154,
5633 length: 3,
5634 token: Token::Word(Word::Word("斯".to_string())),
5635 },
5636 PositionalToken {
5637 source: uws,
5638 offset: 157,
5639 length: 2,
5640 token: Token::Special(Special::Punctuation('·')),
5641 },
5642 PositionalToken {
5643 source: uws,
5644 offset: 159,
5645 length: 3,
5646 token: Token::Word(Word::Word("米".to_string())),
5647 },
5648 PositionalToken {
5649 source: uws,
5650 offset: 162,
5651 length: 3,
5652 token: Token::Word(Word::Word("勒".to_string())),
5653 },
5654 PositionalToken {
5655 source: uws,
5656 offset: 165,
5657 length: 3,
5658 token: Token::Word(Word::Word("編".to_string())),
5659 },
5660 PositionalToken {
5661 source: uws,
5662 offset: 168,
5663 length: 3,
5664 token: Token::Word(Word::Word("劇".to_string())),
5665 },
5666 PositionalToken {
5667 source: uws,
5668 offset: 171,
5669 length: 3,
5670 token: Token::Special(Special::Punctuation(',')),
5671 },
5672 PositionalToken {
5673 source: uws,
5674 offset: 174,
5675 length: 3,
5676 token: Token::Word(Word::Word("大".to_string())),
5677 },
5678 PositionalToken {
5679 source: uws,
5680 offset: 177,
5681 length: 3,
5682 token: Token::Word(Word::Word("卫".to_string())),
5683 },
5684 PositionalToken {
5685 source: uws,
5686 offset: 180,
5687 length: 2,
5688 token: Token::Special(Special::Punctuation('·')),
5689 },
5690 PositionalToken {
5691 source: uws,
5692 offset: 182,
5693 length: 3,
5694 token: Token::Word(Word::Word("努".to_string())),
5695 },
5696 PositionalToken {
5697 source: uws,
5698 offset: 185,
5699 length: 3,
5700 token: Token::Word(Word::Word("特".to_string())),
5701 },
5702 PositionalToken {
5703 source: uws,
5704 offset: 188,
5705 length: 3,
5706 token: Token::Word(Word::Word("尔".to_string())),
5707 },
5708 PositionalToken {
5709 source: uws,
5710 offset: 191,
5711 length: 3,
5712 token: Token::Word(Word::Word("执".to_string())),
5713 },
5714 PositionalToken {
5715 source: uws,
5716 offset: 194,
5717 length: 3,
5718 token: Token::Word(Word::Word("导".to_string())),
5719 },
5720 PositionalToken {
5721 source: uws,
5722 offset: 197,
5723 length: 3,
5724 token: Token::Special(Special::Punctuation('。')),
5725 },
5726 PositionalToken {
5727 source: uws,
5728 offset: 200,
5729 length: 3,
5730 token: Token::Word(Word::Word("这".to_string())),
5731 },
5732 PositionalToken {
5733 source: uws,
5734 offset: 203,
5735 length: 3,
5736 token: Token::Word(Word::Word("一".to_string())),
5737 },
5738 PositionalToken {
5739 source: uws,
5740 offset: 206,
5741 length: 3,
5742 token: Token::Word(Word::Word("试".to_string())),
5743 },
5744 PositionalToken {
5745 source: uws,
5746 offset: 209,
5747 length: 3,
5748 token: Token::Word(Word::Word("播".to_string())),
5749 },
5750 PositionalToken {
5751 source: uws,
5752 offset: 212,
5753 length: 3,
5754 token: Token::Word(Word::Word("首".to_string())),
5755 },
5756 PositionalToken {
5757 source: uws,
5758 offset: 215,
5759 length: 3,
5760 token: Token::Word(Word::Word("次".to_string())),
5761 },
5762 PositionalToken {
5763 source: uws,
5764 offset: 218,
5765 length: 3,
5766 token: Token::Word(Word::Word("向".to_string())),
5767 },
5768 PositionalToken {
5769 source: uws,
5770 offset: 221,
5771 length: 3,
5772 token: Token::Word(Word::Word("观".to_string())),
5773 },
5774 PositionalToken {
5775 source: uws,
5776 offset: 224,
5777 length: 3,
5778 token: Token::Word(Word::Word("众".to_string())),
5779 },
5780 PositionalToken {
5781 source: uws,
5782 offset: 227,
5783 length: 3,
5784 token: Token::Word(Word::Word("引".to_string())),
5785 },
5786 PositionalToken {
5787 source: uws,
5788 offset: 230,
5789 length: 3,
5790 token: Token::Word(Word::Word("荐".to_string())),
5791 },
5792 PositionalToken {
5793 source: uws,
5794 offset: 233,
5795 length: 3,
5796 token: Token::Word(Word::Word("了".to_string())),
5797 },
5798 PositionalToken {
5799 source: uws,
5800 offset: 236,
5801 length: 3,
5802 token: Token::Word(Word::Word("克".to_string())),
5803 },
5804 PositionalToken {
5805 source: uws,
5806 offset: 239,
5807 length: 3,
5808 token: Token::Word(Word::Word("拉".to_string())),
5809 },
5810 PositionalToken {
5811 source: uws,
5812 offset: 242,
5813 length: 3,
5814 token: Token::Word(Word::Word("克".to_string())),
5815 },
5816 PositionalToken {
5817 source: uws,
5818 offset: 245,
5819 length: 2,
5820 token: Token::Special(Special::Punctuation('·')),
5821 },
5822 PositionalToken {
5823 source: uws,
5824 offset: 247,
5825 length: 3,
5826 token: Token::Word(Word::Word("肯".to_string())),
5827 },
5828 PositionalToken {
5829 source: uws,
5830 offset: 250,
5831 length: 3,
5832 token: Token::Word(Word::Word("特".to_string())),
5833 },
5834 PositionalToken {
5835 source: uws,
5836 offset: 253,
5837 length: 3,
5838 token: Token::Word(Word::Word("一".to_string())),
5839 },
5840 PositionalToken {
5841 source: uws,
5842 offset: 256,
5843 length: 3,
5844 token: Token::Word(Word::Word("角".to_string())),
5845 },
5846 PositionalToken {
5847 source: uws,
5848 offset: 259,
5849 length: 3,
5850 token: Token::Special(Special::Punctuation(',')),
5851 },
5852 PositionalToken {
5853 source: uws,
5854 offset: 262,
5855 length: 3,
5856 token: Token::Word(Word::Word("他".to_string())),
5857 },
5858 PositionalToken {
5859 source: uws,
5860 offset: 265,
5861 length: 3,
5862 token: Token::Word(Word::Word("是".to_string())),
5863 },
5864 PositionalToken {
5865 source: uws,
5866 offset: 268,
5867 length: 3,
5868 token: Token::Word(Word::Word("位".to_string())),
5869 },
5870 PositionalToken {
5871 source: uws,
5872 offset: 271,
5873 length: 3,
5874 token: Token::Word(Word::Word("拥".to_string())),
5875 },
5876 PositionalToken {
5877 source: uws,
5878 offset: 274,
5879 length: 3,
5880 token: Token::Word(Word::Word("有".to_string())),
5881 },
5882 PositionalToken {
5883 source: uws,
5884 offset: 277,
5885 length: 3,
5886 token: Token::Word(Word::Word("超".to_string())),
5887 },
5888 ],
5889 Lang::Jpn => vec![
5890 PositionalToken {
5891 source: uws,
5892 offset: 0,
5893 length: 3,
5894 token: Token::Word(Word::Word("熊".to_string())),
5895 },
5896 PositionalToken {
5897 source: uws,
5898 offset: 3,
5899 length: 3,
5900 token: Token::Word(Word::Word("野".to_string())),
5901 },
5902 PositionalToken {
5903 source: uws,
5904 offset: 6,
5905 length: 3,
5906 token: Token::Word(Word::Word("三".to_string())),
5907 },
5908 PositionalToken {
5909 source: uws,
5910 offset: 9,
5911 length: 3,
5912 token: Token::Word(Word::Word("山".to_string())),
5913 },
5914 PositionalToken {
5915 source: uws,
5916 offset: 12,
5917 length: 3,
5918 token: Token::Word(Word::Word("本".to_string())),
5919 },
5920 PositionalToken {
5921 source: uws,
5922 offset: 15,
5923 length: 3,
5924 token: Token::Word(Word::Word("願".to_string())),
5925 },
5926 PositionalToken {
5927 source: uws,
5928 offset: 18,
5929 length: 3,
5930 token: Token::Word(Word::Word("所".to_string())),
5931 },
5932 PositionalToken {
5933 source: uws,
5934 offset: 21,
5935 length: 3,
5936 token: Token::Word(Word::Word("は".to_string())),
5937 },
5938 PositionalToken {
5939 source: uws,
5940 offset: 24,
5941 length: 3,
5942 token: Token::Special(Special::Punctuation('、')),
5943 },
5944 PositionalToken {
5945 source: uws,
5946 offset: 27,
5947 length: 2,
5948 token: Token::Word(Word::Number(Number::Integer(15))),
5949 },
5950 PositionalToken {
5951 source: uws,
5952 offset: 29,
5953 length: 3,
5954 token: Token::Word(Word::Word("世".to_string())),
5955 },
5956 PositionalToken {
5957 source: uws,
5958 offset: 32,
5959 length: 3,
5960 token: Token::Word(Word::Word("紀".to_string())),
5961 },
5962 PositionalToken {
5963 source: uws,
5964 offset: 35,
5965 length: 3,
5966 token: Token::Word(Word::Word("末".to_string())),
5967 },
5968 PositionalToken {
5969 source: uws,
5970 offset: 38,
5971 length: 3,
5972 token: Token::Word(Word::Word("以".to_string())),
5973 },
5974 PositionalToken {
5975 source: uws,
5976 offset: 41,
5977 length: 3,
5978 token: Token::Word(Word::Word("降".to_string())),
5979 },
5980 PositionalToken {
5981 source: uws,
5982 offset: 44,
5983 length: 3,
5984 token: Token::Word(Word::Word("に".to_string())),
5985 },
5986 PositionalToken {
5987 source: uws,
5988 offset: 47,
5989 length: 3,
5990 token: Token::Word(Word::Word("お".to_string())),
5991 },
5992 PositionalToken {
5993 source: uws,
5994 offset: 50,
5995 length: 3,
5996 token: Token::Word(Word::Word("け".to_string())),
5997 },
5998 PositionalToken {
5999 source: uws,
6000 offset: 53,
6001 length: 3,
6002 token: Token::Word(Word::Word("る".to_string())),
6003 },
6004 PositionalToken {
6005 source: uws,
6006 offset: 56,
6007 length: 3,
6008 token: Token::Word(Word::Word("熊".to_string())),
6009 },
6010 PositionalToken {
6011 source: uws,
6012 offset: 59,
6013 length: 3,
6014 token: Token::Word(Word::Word("野".to_string())),
6015 },
6016 PositionalToken {
6017 source: uws,
6018 offset: 62,
6019 length: 3,
6020 token: Token::Word(Word::Word("三".to_string())),
6021 },
6022 PositionalToken {
6023 source: uws,
6024 offset: 65,
6025 length: 3,
6026 token: Token::Word(Word::Word("山".to_string())),
6027 },
6028 PositionalToken {
6029 source: uws,
6030 offset: 68,
6031 length: 3,
6032 token: Token::Special(Special::Punctuation('(')),
6033 },
6034 PositionalToken {
6035 source: uws,
6036 offset: 71,
6037 length: 3,
6038 token: Token::Word(Word::Word("熊".to_string())),
6039 },
6040 PositionalToken {
6041 source: uws,
6042 offset: 74,
6043 length: 3,
6044 token: Token::Word(Word::Word("野".to_string())),
6045 },
6046 PositionalToken {
6047 source: uws,
6048 offset: 77,
6049 length: 3,
6050 token: Token::Word(Word::Word("本".to_string())),
6051 },
6052 PositionalToken {
6053 source: uws,
6054 offset: 80,
6055 length: 3,
6056 token: Token::Word(Word::Word("宮".to_string())),
6057 },
6058 PositionalToken {
6059 source: uws,
6060 offset: 83,
6061 length: 3,
6062 token: Token::Special(Special::Punctuation('、')),
6063 },
6064 PositionalToken {
6065 source: uws,
6066 offset: 86,
6067 length: 3,
6068 token: Token::Word(Word::Word("熊".to_string())),
6069 },
6070 PositionalToken {
6071 source: uws,
6072 offset: 89,
6073 length: 3,
6074 token: Token::Word(Word::Word("野".to_string())),
6075 },
6076 PositionalToken {
6077 source: uws,
6078 offset: 92,
6079 length: 3,
6080 token: Token::Word(Word::Word("新".to_string())),
6081 },
6082 PositionalToken {
6083 source: uws,
6084 offset: 95,
6085 length: 3,
6086 token: Token::Word(Word::Word("宮".to_string())),
6087 },
6088 PositionalToken {
6089 source: uws,
6090 offset: 98,
6091 length: 3,
6092 token: Token::Special(Special::Punctuation('、')),
6093 },
6094 PositionalToken {
6095 source: uws,
6096 offset: 101,
6097 length: 3,
6098 token: Token::Word(Word::Word("熊".to_string())),
6099 },
6100 PositionalToken {
6101 source: uws,
6102 offset: 104,
6103 length: 3,
6104 token: Token::Word(Word::Word("野".to_string())),
6105 },
6106 PositionalToken {
6107 source: uws,
6108 offset: 107,
6109 length: 3,
6110 token: Token::Word(Word::Word("那".to_string())),
6111 },
6112 PositionalToken {
6113 source: uws,
6114 offset: 110,
6115 length: 3,
6116 token: Token::Word(Word::Word("智".to_string())),
6117 },
6118 PositionalToken {
6119 source: uws,
6120 offset: 113,
6121 length: 3,
6122 token: Token::Special(Special::Punctuation(')')),
6123 },
6124 PositionalToken {
6125 source: uws,
6126 offset: 116,
6127 length: 3,
6128 token: Token::Word(Word::Word("の".to_string())),
6129 },
6130 PositionalToken {
6131 source: uws,
6132 offset: 119,
6133 length: 3,
6134 token: Token::Word(Word::Word("造".to_string())),
6135 },
6136 PositionalToken {
6137 source: uws,
6138 offset: 122,
6139 length: 3,
6140 token: Token::Word(Word::Word("営".to_string())),
6141 },
6142 PositionalToken {
6143 source: uws,
6144 offset: 125,
6145 length: 3,
6146 token: Token::Special(Special::Punctuation('・')),
6147 },
6148 PositionalToken {
6149 source: uws,
6150 offset: 128,
6151 length: 3,
6152 token: Token::Word(Word::Word("修".to_string())),
6153 },
6154 PositionalToken {
6155 source: uws,
6156 offset: 131,
6157 length: 3,
6158 token: Token::Word(Word::Word("造".to_string())),
6159 },
6160 PositionalToken {
6161 source: uws,
6162 offset: 134,
6163 length: 3,
6164 token: Token::Word(Word::Word("の".to_string())),
6165 },
6166 PositionalToken {
6167 source: uws,
6168 offset: 137,
6169 length: 3,
6170 token: Token::Word(Word::Word("た".to_string())),
6171 },
6172 PositionalToken {
6173 source: uws,
6174 offset: 140,
6175 length: 3,
6176 token: Token::Word(Word::Word("め".to_string())),
6177 },
6178 PositionalToken {
6179 source: uws,
6180 offset: 143,
6181 length: 3,
6182 token: Token::Word(Word::Word("の".to_string())),
6183 },
6184 PositionalToken {
6185 source: uws,
6186 offset: 146,
6187 length: 3,
6188 token: Token::Word(Word::Word("勧".to_string())),
6189 },
6190 PositionalToken {
6191 source: uws,
6192 offset: 149,
6193 length: 3,
6194 token: Token::Word(Word::Word("進".to_string())),
6195 },
6196 PositionalToken {
6197 source: uws,
6198 offset: 152,
6199 length: 3,
6200 token: Token::Word(Word::Word("を".to_string())),
6201 },
6202 PositionalToken {
6203 source: uws,
6204 offset: 155,
6205 length: 3,
6206 token: Token::Word(Word::Word("担".to_string())),
6207 },
6208 PositionalToken {
6209 source: uws,
6210 offset: 158,
6211 length: 3,
6212 token: Token::Word(Word::Word("っ".to_string())),
6213 },
6214 PositionalToken {
6215 source: uws,
6216 offset: 161,
6217 length: 3,
6218 token: Token::Word(Word::Word("た".to_string())),
6219 },
6220 PositionalToken {
6221 source: uws,
6222 offset: 164,
6223 length: 3,
6224 token: Token::Word(Word::Word("組".to_string())),
6225 },
6226 PositionalToken {
6227 source: uws,
6228 offset: 167,
6229 length: 3,
6230 token: Token::Word(Word::Word("織".to_string())),
6231 },
6232 PositionalToken {
6233 source: uws,
6234 offset: 170,
6235 length: 3,
6236 token: Token::Word(Word::Word("の".to_string())),
6237 },
6238 PositionalToken {
6239 source: uws,
6240 offset: 173,
6241 length: 3,
6242 token: Token::Word(Word::Word("総".to_string())),
6243 },
6244 PositionalToken {
6245 source: uws,
6246 offset: 176,
6247 length: 3,
6248 token: Token::Word(Word::Word("称".to_string())),
6249 },
6250 PositionalToken {
6251 source: uws,
6252 offset: 179,
6253 length: 3,
6254 token: Token::Special(Special::Punctuation('。')),
6255 },
6256 PositionalToken {
6257 source: uws,
6258 offset: 182,
6259 length: 1,
6260 token: Token::Special(Special::Separator(Separator::Space)),
6261 },
6262 PositionalToken {
6263 source: uws,
6264 offset: 183,
6265 length: 3,
6266 token: Token::Word(Word::Word("熊".to_string())),
6267 },
6268 PositionalToken {
6269 source: uws,
6270 offset: 186,
6271 length: 3,
6272 token: Token::Word(Word::Word("野".to_string())),
6273 },
6274 PositionalToken {
6275 source: uws,
6276 offset: 189,
6277 length: 3,
6278 token: Token::Word(Word::Word("三".to_string())),
6279 },
6280 PositionalToken {
6281 source: uws,
6282 offset: 192,
6283 length: 3,
6284 token: Token::Word(Word::Word("山".to_string())),
6285 },
6286 PositionalToken {
6287 source: uws,
6288 offset: 195,
6289 length: 3,
6290 token: Token::Word(Word::Word("を".to_string())),
6291 },
6292 PositionalToken {
6293 source: uws,
6294 offset: 198,
6295 length: 3,
6296 token: Token::Word(Word::Word("含".to_string())),
6297 },
6298 PositionalToken {
6299 source: uws,
6300 offset: 201,
6301 length: 3,
6302 token: Token::Word(Word::Word("め".to_string())),
6303 },
6304 PositionalToken {
6305 source: uws,
6306 offset: 204,
6307 length: 3,
6308 token: Token::Word(Word::Word("て".to_string())),
6309 },
6310 PositionalToken {
6311 source: uws,
6312 offset: 207,
6313 length: 3,
6314 token: Token::Special(Special::Punctuation('、')),
6315 },
6316 PositionalToken {
6317 source: uws,
6318 offset: 210,
6319 length: 3,
6320 token: Token::Word(Word::Word("日".to_string())),
6321 },
6322 PositionalToken {
6323 source: uws,
6324 offset: 213,
6325 length: 3,
6326 token: Token::Word(Word::Word("本".to_string())),
6327 },
6328 PositionalToken {
6329 source: uws,
6330 offset: 216,
6331 length: 3,
6332 token: Token::Word(Word::Word("に".to_string())),
6333 },
6334 PositionalToken {
6335 source: uws,
6336 offset: 219,
6337 length: 3,
6338 token: Token::Word(Word::Word("お".to_string())),
6339 },
6340 PositionalToken {
6341 source: uws,
6342 offset: 222,
6343 length: 3,
6344 token: Token::Word(Word::Word("け".to_string())),
6345 },
6346 PositionalToken {
6347 source: uws,
6348 offset: 225,
6349 length: 3,
6350 token: Token::Word(Word::Word("る".to_string())),
6351 },
6352 PositionalToken {
6353 source: uws,
6354 offset: 228,
6355 length: 3,
6356 token: Token::Word(Word::Word("古".to_string())),
6357 },
6358 PositionalToken {
6359 source: uws,
6360 offset: 231,
6361 length: 3,
6362 token: Token::Word(Word::Word("代".to_string())),
6363 },
6364 PositionalToken {
6365 source: uws,
6366 offset: 234,
6367 length: 3,
6368 token: Token::Word(Word::Word("か".to_string())),
6369 },
6370 PositionalToken {
6371 source: uws,
6372 offset: 237,
6373 length: 3,
6374 token: Token::Word(Word::Word("ら".to_string())),
6375 },
6376 PositionalToken {
6377 source: uws,
6378 offset: 240,
6379 length: 3,
6380 token: Token::Word(Word::Word("中".to_string())),
6381 },
6382 PositionalToken {
6383 source: uws,
6384 offset: 243,
6385 length: 3,
6386 token: Token::Word(Word::Word("世".to_string())),
6387 },
6388 PositionalToken {
6389 source: uws,
6390 offset: 246,
6391 length: 3,
6392 token: Token::Word(Word::Word("前".to_string())),
6393 },
6394 PositionalToken {
6395 source: uws,
6396 offset: 249,
6397 length: 3,
6398 token: Token::Word(Word::Word("半".to_string())),
6399 },
6400 PositionalToken {
6401 source: uws,
6402 offset: 252,
6403 length: 3,
6404 token: Token::Word(Word::Word("に".to_string())),
6405 },
6406 PositionalToken {
6407 source: uws,
6408 offset: 255,
6409 length: 3,
6410 token: Token::Word(Word::Word("か".to_string())),
6411 },
6412 PositionalToken {
6413 source: uws,
6414 offset: 258,
6415 length: 3,
6416 token: Token::Word(Word::Word("け".to_string())),
6417 },
6418 PositionalToken {
6419 source: uws,
6420 offset: 261,
6421 length: 3,
6422 token: Token::Word(Word::Word("て".to_string())),
6423 },
6424 PositionalToken {
6425 source: uws,
6426 offset: 264,
6427 length: 3,
6428 token: Token::Word(Word::Word("の".to_string())),
6429 },
6430 PositionalToken {
6431 source: uws,
6432 offset: 267,
6433 length: 3,
6434 token: Token::Word(Word::Word("寺".to_string())),
6435 },
6436 PositionalToken {
6437 source: uws,
6438 offset: 270,
6439 length: 3,
6440 token: Token::Word(Word::Word("社".to_string())),
6441 },
6442 PositionalToken {
6443 source: uws,
6444 offset: 273,
6445 length: 3,
6446 token: Token::Word(Word::Word("の".to_string())),
6447 },
6448 PositionalToken {
6449 source: uws,
6450 offset: 276,
6451 length: 3,
6452 token: Token::Word(Word::Word("造".to_string())),
6453 },
6454 PositionalToken {
6455 source: uws,
6456 offset: 279,
6457 length: 3,
6458 token: Token::Word(Word::Word("営".to_string())),
6459 },
6460 PositionalToken {
6461 source: uws,
6462 offset: 282,
6463 length: 3,
6464 token: Token::Word(Word::Word("は".to_string())),
6465 },
6466 PositionalToken {
6467 source: uws,
6468 offset: 285,
6469 length: 3,
6470 token: Token::Special(Special::Punctuation('、')),
6471 },
6472 PositionalToken {
6473 source: uws,
6474 offset: 288,
6475 length: 3,
6476 token: Token::Word(Word::Word("寺".to_string())),
6477 },
6478 PositionalToken {
6479 source: uws,
6480 offset: 291,
6481 length: 3,
6482 token: Token::Word(Word::Word("社".to_string())),
6483 },
6484 ],
6485 Lang::Kor => vec![
6486 PositionalToken {
6487 source: uws,
6488 offset: 0,
6489 length: 21,
6490 token: Token::Word(Word::Word("플레이스테이션".to_string())),
6491 },
6492 PositionalToken {
6493 source: uws,
6494 offset: 21,
6495 length: 1,
6496 token: Token::Special(Special::Separator(Separator::Space)),
6497 },
6498 PositionalToken {
6499 source: uws,
6500 offset: 22,
6501 length: 3,
6502 token: Token::Word(Word::Word("은".to_string())),
6503 },
6504 PositionalToken {
6505 source: uws,
6506 offset: 25,
6507 length: 1,
6508 token: Token::Special(Special::Separator(Separator::Space)),
6509 },
6510 PositionalToken {
6511 source: uws,
6512 offset: 26,
6513 length: 6,
6514 token: Token::Word(Word::Word("소니".to_string())),
6515 },
6516 PositionalToken {
6517 source: uws,
6518 offset: 32,
6519 length: 1,
6520 token: Token::Special(Special::Separator(Separator::Space)),
6521 },
6522 PositionalToken {
6523 source: uws,
6524 offset: 33,
6525 length: 9,
6526 token: Token::Word(Word::Word("컴퓨터".to_string())),
6527 },
6528 PositionalToken {
6529 source: uws,
6530 offset: 42,
6531 length: 1,
6532 token: Token::Special(Special::Separator(Separator::Space)),
6533 },
6534 PositionalToken {
6535 source: uws,
6536 offset: 43,
6537 length: 21,
6538 token: Token::Word(Word::Word("엔터테인먼트가".to_string())),
6539 },
6540 PositionalToken {
6541 source: uws,
6542 offset: 64,
6543 length: 1,
6544 token: Token::Special(Special::Separator(Separator::Space)),
6545 },
6546 PositionalToken {
6547 source: uws,
6548 offset: 65,
6549 length: 9,
6550 token: Token::Word(Word::Word("개발한".to_string())),
6551 },
6552 PositionalToken {
6553 source: uws,
6554 offset: 74,
6555 length: 1,
6556 token: Token::Special(Special::Separator(Separator::Space)),
6557 },
6558 PositionalToken {
6559 source: uws,
6560 offset: 75,
6561 length: 3,
6562 token: Token::Word(Word::Word("세".to_string())),
6563 },
6564 PositionalToken {
6565 source: uws,
6566 offset: 78,
6567 length: 1,
6568 token: Token::Special(Special::Separator(Separator::Space)),
6569 },
6570 PositionalToken {
6571 source: uws,
6572 offset: 79,
6573 length: 6,
6574 token: Token::Word(Word::Word("번째".to_string())),
6575 },
6576 PositionalToken {
6577 source: uws,
6578 offset: 85,
6579 length: 1,
6580 token: Token::Special(Special::Separator(Separator::Space)),
6581 },
6582 PositionalToken {
6583 source: uws,
6584 offset: 86,
6585 length: 9,
6586 token: Token::Word(Word::Word("가정용".to_string())),
6587 },
6588 PositionalToken {
6589 source: uws,
6590 offset: 95,
6591 length: 1,
6592 token: Token::Special(Special::Separator(Separator::Space)),
6593 },
6594 PositionalToken {
6595 source: uws,
6596 offset: 96,
6597 length: 15,
6598 token: Token::Word(Word::Word("게임기이다".to_string())),
6599 },
6600 PositionalToken {
6601 source: uws,
6602 offset: 111,
6603 length: 1,
6604 token: Token::Special(Special::Punctuation('.')),
6605 },
6606 PositionalToken {
6607 source: uws,
6608 offset: 112,
6609 length: 1,
6610 token: Token::Special(Special::Separator(Separator::Space)),
6611 },
6612 PositionalToken {
6613 source: uws,
6614 offset: 113,
6615 length: 24,
6616 token: Token::Word(Word::Word("마이크로소프트의".to_string())),
6617 },
6618 PositionalToken {
6619 source: uws,
6620 offset: 137,
6621 length: 1,
6622 token: Token::Special(Special::Separator(Separator::Space)),
6623 },
6624 PositionalToken {
6625 source: uws,
6626 offset: 138,
6627 length: 12,
6628 token: Token::Word(Word::Word("엑스박스".to_string())),
6629 },
6630 PositionalToken {
6631 source: uws,
6632 offset: 150,
6633 length: 1,
6634 token: Token::Special(Special::Separator(Separator::Space)),
6635 },
6636 PositionalToken {
6637 source: uws,
6638 offset: 151,
6639 length: 3,
6640 token: Token::Word(Word::Number(Number::Integer(360))),
6641 },
6642 PositionalToken {
6643 source: uws,
6644 offset: 154,
6645 length: 1,
6646 token: Token::Special(Special::Punctuation(',')),
6647 },
6648 PositionalToken {
6649 source: uws,
6650 offset: 155,
6651 length: 1,
6652 token: Token::Special(Special::Separator(Separator::Space)),
6653 },
6654 PositionalToken {
6655 source: uws,
6656 offset: 156,
6657 length: 12,
6658 token: Token::Word(Word::Word("닌텐도의".to_string())),
6659 },
6660 PositionalToken {
6661 source: uws,
6662 offset: 168,
6663 length: 1,
6664 token: Token::Special(Special::Separator(Separator::Space)),
6665 },
6666 PositionalToken {
6667 source: uws,
6668 offset: 169,
6669 length: 6,
6670 token: Token::Word(Word::Word("Wii와".to_string())),
6671 },
6672 PositionalToken {
6673 source: uws,
6674 offset: 175,
6675 length: 1,
6676 token: Token::Special(Special::Separator(Separator::Space)),
6677 },
6678 PositionalToken {
6679 source: uws,
6680 offset: 176,
6681 length: 12,
6682 token: Token::Word(Word::Word("경쟁하고".to_string())),
6683 },
6684 PositionalToken {
6685 source: uws,
6686 offset: 188,
6687 length: 1,
6688 token: Token::Special(Special::Separator(Separator::Space)),
6689 },
6690 PositionalToken {
6691 source: uws,
6692 offset: 189,
6693 length: 6,
6694 token: Token::Word(Word::Word("있다".to_string())),
6695 },
6696 PositionalToken {
6697 source: uws,
6698 offset: 195,
6699 length: 1,
6700 token: Token::Special(Special::Punctuation('.')),
6701 },
6702 PositionalToken {
6703 source: uws,
6704 offset: 196,
6705 length: 1,
6706 token: Token::Special(Special::Separator(Separator::Space)),
6707 },
6708 PositionalToken {
6709 source: uws,
6710 offset: 197,
6711 length: 6,
6712 token: Token::Word(Word::Word("이전".to_string())),
6713 },
6714 PositionalToken {
6715 source: uws,
6716 offset: 203,
6717 length: 1,
6718 token: Token::Special(Special::Separator(Separator::Space)),
6719 },
6720 PositionalToken {
6721 source: uws,
6722 offset: 204,
6723 length: 12,
6724 token: Token::Word(Word::Word("제품에서".to_string())),
6725 },
6726 PositionalToken {
6727 source: uws,
6728 offset: 216,
6729 length: 1,
6730 token: Token::Special(Special::Separator(Separator::Space)),
6731 },
6732 PositionalToken {
6733 source: uws,
6734 offset: 217,
6735 length: 9,
6736 token: Token::Word(Word::Word("온라인".to_string())),
6737 },
6738 PositionalToken {
6739 source: uws,
6740 offset: 226,
6741 length: 1,
6742 token: Token::Special(Special::Separator(Separator::Space)),
6743 },
6744 PositionalToken {
6745 source: uws,
6746 offset: 227,
6747 length: 9,
6748 token: Token::Word(Word::Word("플레이".to_string())),
6749 },
6750 PositionalToken {
6751 source: uws,
6752 offset: 236,
6753 length: 1,
6754 token: Token::Special(Special::Separator(Separator::Space)),
6755 },
6756 PositionalToken {
6757 source: uws,
6758 offset: 237,
6759 length: 3,
6760 token: Token::Word(Word::Word("기".to_string())),
6761 },
6762 ],
6763 Lang::Ara => vec![
6764 PositionalToken {
6765 source: uws,
6766 offset: 0,
6767 length: 14,
6768 token: Token::Word(Word::Word("لشکرکشی".to_string())),
6769 },
6770 PositionalToken {
6771 source: uws,
6772 offset: 14,
6773 length: 3,
6774 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6775 },
6776 PositionalToken {
6777 source: uws,
6778 offset: 17,
6779 length: 6,
6780 token: Token::Word(Word::Word("های".to_string())),
6781 },
6782 PositionalToken {
6783 source: uws,
6784 offset: 23,
6785 length: 1,
6786 token: Token::Special(Special::Separator(Separator::Space)),
6787 },
6788 PositionalToken {
6789 source: uws,
6790 offset: 24,
6791 length: 6,
6792 token: Token::Word(Word::Word("روس".to_string())),
6793 },
6794 PositionalToken {
6795 source: uws,
6796 offset: 30,
6797 length: 3,
6798 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6799 },
6800 PositionalToken {
6801 source: uws,
6802 offset: 33,
6803 length: 6,
6804 token: Token::Word(Word::Word("های".to_string())),
6805 },
6806 PositionalToken {
6807 source: uws,
6808 offset: 39,
6809 length: 1,
6810 token: Token::Special(Special::Separator(Separator::Space)),
6811 },
6812 PositionalToken {
6813 source: uws,
6814 offset: 40,
6815 length: 12,
6816 token: Token::Word(Word::Word("وارنگی".to_string())),
6817 },
6818 PositionalToken {
6819 source: uws,
6820 offset: 52,
6821 length: 1,
6822 token: Token::Special(Special::Separator(Separator::Space)),
6823 },
6824 PositionalToken {
6825 source: uws,
6826 offset: 53,
6827 length: 4,
6828 token: Token::Word(Word::Word("به".to_string())),
6829 },
6830 PositionalToken {
6831 source: uws,
6832 offset: 57,
6833 length: 1,
6834 token: Token::Special(Special::Separator(Separator::Space)),
6835 },
6836 PositionalToken {
6837 source: uws,
6838 offset: 58,
6839 length: 10,
6840 token: Token::Word(Word::Word("دریای".to_string())),
6841 },
6842 PositionalToken {
6843 source: uws,
6844 offset: 68,
6845 length: 1,
6846 token: Token::Special(Special::Separator(Separator::Space)),
6847 },
6848 PositionalToken {
6849 source: uws,
6850 offset: 69,
6851 length: 6,
6852 token: Token::Word(Word::Word("خزر".to_string())),
6853 },
6854 PositionalToken {
6855 source: uws,
6856 offset: 75,
6857 length: 1,
6858 token: Token::Special(Special::Separator(Separator::Space)),
6859 },
6860 PositionalToken {
6861 source: uws,
6862 offset: 76,
6863 length: 12,
6864 token: Token::Word(Word::Word("مجموعه".to_string())),
6865 },
6866 PositionalToken {
6867 source: uws,
6868 offset: 88,
6869 length: 3,
6870 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6871 },
6872 PositionalToken {
6873 source: uws,
6874 offset: 91,
6875 length: 4,
6876 token: Token::Word(Word::Word("ای".to_string())),
6877 },
6878 PositionalToken {
6879 source: uws,
6880 offset: 95,
6881 length: 1,
6882 token: Token::Special(Special::Separator(Separator::Space)),
6883 },
6884 PositionalToken {
6885 source: uws,
6886 offset: 96,
6887 length: 4,
6888 token: Token::Word(Word::Word("از".to_string())),
6889 },
6890 PositionalToken {
6891 source: uws,
6892 offset: 100,
6893 length: 1,
6894 token: Token::Special(Special::Separator(Separator::Space)),
6895 },
6896 PositionalToken {
6897 source: uws,
6898 offset: 101,
6899 length: 10,
6900 token: Token::Word(Word::Word("حملات".to_string())),
6901 },
6902 PositionalToken {
6903 source: uws,
6904 offset: 111,
6905 length: 1,
6906 token: Token::Special(Special::Separator(Separator::Space)),
6907 },
6908 PositionalToken {
6909 source: uws,
6910 offset: 112,
6911 length: 10,
6912 token: Token::Word(Word::Word("نظامی".to_string())),
6913 },
6914 PositionalToken {
6915 source: uws,
6916 offset: 122,
6917 length: 1,
6918 token: Token::Special(Special::Separator(Separator::Space)),
6919 },
6920 PositionalToken {
6921 source: uws,
6922 offset: 123,
6923 length: 4,
6924 token: Token::Word(Word::Word("در".to_string())),
6925 },
6926 PositionalToken {
6927 source: uws,
6928 offset: 127,
6929 length: 1,
6930 token: Token::Special(Special::Separator(Separator::Space)),
6931 },
6932 PositionalToken {
6933 source: uws,
6934 offset: 128,
6935 length: 6,
6936 token: Token::Word(Word::Word("بین".to_string())),
6937 },
6938 PositionalToken {
6939 source: uws,
6940 offset: 134,
6941 length: 1,
6942 token: Token::Special(Special::Separator(Separator::Space)),
6943 },
6944 PositionalToken {
6945 source: uws,
6946 offset: 135,
6947 length: 6,
6948 token: Token::Word(Word::Word("سال".to_string())),
6949 },
6950 PositionalToken {
6951 source: uws,
6952 offset: 141,
6953 length: 3,
6954 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6955 },
6956 PositionalToken {
6957 source: uws,
6958 offset: 144,
6959 length: 6,
6960 token: Token::Word(Word::Word("های".to_string())),
6961 },
6962 PositionalToken {
6963 source: uws,
6964 offset: 150,
6965 length: 1,
6966 token: Token::Special(Special::Separator(Separator::Space)),
6967 },
6968 PositionalToken {
6969 source: uws,
6970 offset: 151,
6971 length: 6,
6972 token: Token::Word(Word::StrangeWord("۸۶۴".to_string())),
6973 },
6974 PositionalToken {
6975 source: uws,
6976 offset: 157,
6977 length: 1,
6978 token: Token::Special(Special::Separator(Separator::Space)),
6979 },
6980 PositionalToken {
6981 source: uws,
6982 offset: 158,
6983 length: 4,
6984 token: Token::Word(Word::Word("تا".to_string())),
6985 },
6986 PositionalToken {
6987 source: uws,
6988 offset: 162,
6989 length: 1,
6990 token: Token::Special(Special::Separator(Separator::Space)),
6991 },
6992 PositionalToken {
6993 source: uws,
6994 offset: 163,
6995 length: 8,
6996 token: Token::Word(Word::StrangeWord("۱۰۴۱".to_string())),
6997 },
6998 PositionalToken {
6999 source: uws,
7000 offset: 171,
7001 length: 1,
7002 token: Token::Special(Special::Separator(Separator::Space)),
7003 },
7004 PositionalToken {
7005 source: uws,
7006 offset: 172,
7007 length: 12,
7008 token: Token::Word(Word::Word("میلادی".to_string())),
7009 },
7010 PositionalToken {
7011 source: uws,
7012 offset: 184,
7013 length: 1,
7014 token: Token::Special(Special::Separator(Separator::Space)),
7015 },
7016 PositionalToken {
7017 source: uws,
7018 offset: 185,
7019 length: 2,
7020 token: Token::Word(Word::Word("ب".to_string())),
7021 },
7022 ],
7023 Lang::Ell => vec![
7024 PositionalToken {
7025 source: uws,
7026 offset: 0,
7027 length: 4,
7028 token: Token::Word(Word::Word("Το".to_string())),
7029 },
7030 PositionalToken {
7031 source: uws,
7032 offset: 4,
7033 length: 1,
7034 token: Token::Special(Special::Separator(Separator::Space)),
7035 },
7036 PositionalToken {
7037 source: uws,
7038 offset: 5,
7039 length: 18,
7040 token: Token::Word(Word::Word("Πρόγραμμα".to_string())),
7041 },
7042 PositionalToken {
7043 source: uws,
7044 offset: 23,
7045 length: 1,
7046 token: Token::Special(Special::Separator(Separator::Space)),
7047 },
7048 PositionalToken {
7049 source: uws,
7050 offset: 24,
7051 length: 22,
7052 token: Token::Word(Word::Word("υλοποιείται".to_string())),
7053 },
7054 PositionalToken {
7055 source: uws,
7056 offset: 46,
7057 length: 1,
7058 token: Token::Special(Special::Separator(Separator::Space)),
7059 },
7060 PositionalToken {
7061 source: uws,
7062 offset: 47,
7063 length: 4,
7064 token: Token::Word(Word::Word("εξ".to_string())),
7065 },
7066 PositionalToken {
7067 source: uws,
7068 offset: 51,
7069 length: 1,
7070 token: Token::Special(Special::Separator(Separator::Space)),
7071 },
7072 PositionalToken {
7073 source: uws,
7074 offset: 52,
7075 length: 18,
7076 token: Token::Word(Word::Word("ολοκλήρου".to_string())),
7077 },
7078 PositionalToken {
7079 source: uws,
7080 offset: 70,
7081 length: 1,
7082 token: Token::Special(Special::Separator(Separator::Space)),
7083 },
7084 PositionalToken {
7085 source: uws,
7086 offset: 71,
7087 length: 6,
7088 token: Token::Word(Word::Word("από".to_string())),
7089 },
7090 PositionalToken {
7091 source: uws,
7092 offset: 77,
7093 length: 1,
7094 token: Token::Special(Special::Separator(Separator::Space)),
7095 },
7096 PositionalToken {
7097 source: uws,
7098 offset: 78,
7099 length: 16,
7100 token: Token::Word(Word::Word("απόσταση".to_string())),
7101 },
7102 PositionalToken {
7103 source: uws,
7104 offset: 94,
7105 length: 1,
7106 token: Token::Special(Special::Separator(Separator::Space)),
7107 },
7108 PositionalToken {
7109 source: uws,
7110 offset: 95,
7111 length: 6,
7112 token: Token::Word(Word::Word("και".to_string())),
7113 },
7114 PositionalToken {
7115 source: uws,
7116 offset: 101,
7117 length: 1,
7118 token: Token::Special(Special::Separator(Separator::Space)),
7119 },
7120 PositionalToken {
7121 source: uws,
7122 offset: 102,
7123 length: 12,
7124 token: Token::Word(Word::Word("μπορεί".to_string())),
7125 },
7126 PositionalToken {
7127 source: uws,
7128 offset: 114,
7129 length: 1,
7130 token: Token::Special(Special::Separator(Separator::Space)),
7131 },
7132 PositionalToken {
7133 source: uws,
7134 offset: 115,
7135 length: 4,
7136 token: Token::Word(Word::Word("να".to_string())),
7137 },
7138 PositionalToken {
7139 source: uws,
7140 offset: 119,
7141 length: 1,
7142 token: Token::Special(Special::Separator(Separator::Space)),
7143 },
7144 PositionalToken {
7145 source: uws,
7146 offset: 120,
7147 length: 20,
7148 token: Token::Word(Word::Word("συμμετέχει".to_string())),
7149 },
7150 PositionalToken {
7151 source: uws,
7152 offset: 140,
7153 length: 1,
7154 token: Token::Special(Special::Separator(Separator::Space)),
7155 },
7156 PositionalToken {
7157 source: uws,
7158 offset: 141,
7159 length: 8,
7160 token: Token::Word(Word::Word("κάθε".to_string())),
7161 },
7162 PositionalToken {
7163 source: uws,
7164 offset: 149,
7165 length: 1,
7166 token: Token::Special(Special::Separator(Separator::Space)),
7167 },
7168 PositionalToken {
7169 source: uws,
7170 offset: 150,
7171 length: 24,
7172 token: Token::Word(Word::Word("εμπλεκόμενος".to_string())),
7173 },
7174 PositionalToken {
7175 source: uws,
7176 offset: 174,
7177 length: 1,
7178 token: Token::Special(Special::Separator(Separator::Space)),
7179 },
7180 PositionalToken {
7181 source: uws,
7182 offset: 175,
7183 length: 6,
7184 token: Token::Word(Word::Word("στη".to_string())),
7185 },
7186 PositionalToken {
7187 source: uws,
7188 offset: 181,
7189 length: 1,
7190 token: Token::Special(Special::Separator(Separator::Space)),
7191 },
7192 PositionalToken {
7193 source: uws,
7194 offset: 182,
7195 length: 2,
7196 token: Token::Word(Word::Word("ή".to_string())),
7197 },
7198 PositionalToken {
7199 source: uws,
7200 offset: 184,
7201 length: 1,
7202 token: Token::Special(Special::Punctuation('/')),
7203 },
7204 ],
7205 };
7206 (
7207 uws.chars()
7208 .take(100)
7209 .fold(String::new(), |acc, c| acc + &format!("{}", c)),
7210 tokens,
7211 )
7212 }
7213}