1use std::sync::Arc;
2use text_parsing::{Breaker, IntoSource, Local, Localize, Snip, Source, SourceEvent};
3
4mod emoji;
5pub use emoji::EMOJIMAP;
6
7mod breakers;
8pub use breakers::{SentenceBreaker, UnicodeSentenceBreaker};
9
10mod wordbreaker;
11
12mod options;
13pub use options::{IntoTokenizer, TokenizerOptions, TokenizerParams};
14
15mod tokens;
16pub use tokens::Tokens;
17
18mod text_tokens;
19use text_tokens::InnerBound;
20pub use text_tokens::TextTokens;
21
/// Errors produced by this crate.
#[derive(Debug)]
pub enum Error {
    /// An error bubbled up from the underlying `text_parsing` source.
    TextParser(text_parsing::Error),
}
26
/// Absolute tolerance used by `Number`'s ordering to treat two `f64` values
/// as equal (see `impl Ord for Number`).
const EPS: f64 = 1e-10;
28
/// A numeric token value: either an integer or a float.
///
/// NOTE(review): `PartialEq`/`PartialOrd` are derived, so they compare the
/// variant first (`Integer(_)` and `Float(_)` are never equal, and order by
/// declaration order), while the hand-written `Ord` below compares the values
/// numerically with an epsilon. The orderings disagree (e.g. `Integer(2)` vs
/// `Float(2.0)`), which violates the documented consistency contract between
/// `Ord` and `PartialOrd`/`PartialEq` — confirm which behavior is intended.
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
pub enum Number {
    Integer(i64),
    Float(f64),
}
34impl Number {
35 pub fn as_f64(&self) -> f64 {
36 match self {
37 Number::Integer(i) => *i as f64,
38 Number::Float(f) => *f,
39 }
40 }
41}
42impl Ord for Number {
43 fn cmp(&self, other: &Number) -> std::cmp::Ordering {
44 let s = self.as_f64();
45 let o = other.as_f64();
46 let d = s - o;
47 match d.abs() < EPS {
48 true => std::cmp::Ordering::Equal,
49 false => {
50 if d > 0.0 {
51 return std::cmp::Ordering::Greater;
52 }
53 if d < 0.0 {
54 return std::cmp::Ordering::Less;
55 }
56 std::cmp::Ordering::Equal
57 }
58 }
59 }
60}
// NOTE(review): `Eq` asserts total equality, but the derived `PartialEq`
// compares raw `f64`s, so `Float(f64::NAN) != Float(f64::NAN)`. This claim is
// only sound if NaN never reaches `Number` — confirm upstream.
impl Eq for Number {}
62
/// Whitespace-like separator kinds recognized by the tokenizer.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
pub enum Separator {
    Space,
    Tab,
    Newline,
    /// Any other separator character.
    Char(char),
}
70
/// Non-printing formatting characters.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
pub enum Formatter {
    /// A specific formatting character.
    Char(char),
    /// A joiner — presumably e.g. a zero-width joiner; confirm in `wordbreaker`.
    Joiner,
}
76
/// Non-word tokens: punctuation, symbols and separators.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Ord, PartialOrd)]
pub enum Special {
    Punctuation(char),
    Symbol(char),
    Separator(Separator),
}
83
/// Word-like tokens ("strings" build: variants own their text).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Word {
    /// A plain word.
    Word(String),
    /// A word containing atypical characters (the tests show e.g. "L’Oreal"
    /// and words with embedded soft hyphens classified here).
    StrangeWord(String),
    /// A word combining digits and other characters (see `Numerical`).
    Numerical(Numerical),
    /// A pure number.
    Number(Number),
    /// An emoji, as a static string from `emoji::EMOJIMAP`.
    Emoji(&'static str),
}
93
/// Sub-kinds of digit-containing words ("strings" build).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub enum Numerical {
    /// A dot-separated sequence (version-like strings).
    DotSeparated(String),
    /// A number immediately followed by letters (the tests show "4pda").
    Measures(String),
    /// Mixed letters and digits (the tests show "hashtag2").
    Alphanumeric(String),
}
104
/// Social-media structures ("strings" build).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Struct {
    /// A `#tag`; the payload excludes the leading `#` (see the tests).
    Hashtag(String),
    /// A mention; the payload presumably excludes the leading `@` — confirm.
    Mention(String),
}
112
/// Unicode-level tokens ("strings" build).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Unicode {
    /// A run of characters not covered by the other token kinds.
    String(String),
    /// A formatting character.
    Formatter(Formatter),
}
119
/// Payload-free counterpart of `Word` when the "strings" feature is disabled:
/// only the classification is kept, not the text.
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Word {
    Word,
    StrangeWord,
    Numerical(Numerical),
    Number(Number),
    Emoji(&'static str),
}
129
/// Payload-free counterpart of `Numerical` without the "strings" feature.
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Numerical {
    DotSeparated,
    Measures,
    Alphanumeric,
}
140
/// Payload-free counterpart of `Struct` without the "strings" feature.
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Struct {
    Hashtag,
    Mention,
}
148
/// Payload-free counterpart of `Unicode` without the "strings" feature.
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Unicode {
    String,
    Formatter(Formatter),
}
155
/// A lexical token produced by the tokenizer ("strings" build).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),
}
164
/// A lexical token produced by the tokenizer (payload-free build).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),
}
173
/// Borrowed analogue of `Text`: keeps the caller's `&str` instead of an owned
/// buffer, alongside the locality/breaker metadata computed by `inner_new`.
///
/// NOTE(review): `inner_new` maps U+0060 (backtick) to U+0027 (apostrophe)
/// when it materializes a buffer; here the raw input is kept, so `buffer` may
/// differ from the normalized text at those (byte-length-preserving)
/// positions — confirm downstream code accounts for this.
#[derive(Debug)]
pub struct TextStr<'s> {
    // The caller-provided input string (not normalized).
    buffer: &'s str,
    // Per-character mapping between buffer and original localities.
    localities: Arc<Vec<TextLocality>>,
    // Word/sentence/paragraph/section bounds found during parsing.
    breakers: Arc<Vec<InnerBound>>,
}
193impl<'s> TextStr<'s> {
194 pub fn new<'a>(s: &'a str) -> Result<TextStr<'a>, Error> {
195 let text = inner_new(s.into_source(), false)?;
196 Ok(TextStr {
197 buffer: s,
198 localities: text.localities,
199 breakers: text.breakers,
200 })
201 }
202}
203
/// Core constructor shared by `Text` and `TextStr`: drains `source`,
/// normalizing each event to a single `char`, and records
///   * one `TextLocality` per emitted character (buffer vs. original), and
///   * an `InnerBound` for each word/sentence/paragraph/section breaker.
/// The normalized characters are pushed into the returned buffer only when
/// `with_buffer` is true; callers that keep their own text (`TextStr`,
/// `TryFrom<String>`) pass false to avoid the copy.
fn inner_new<S: Source>(mut source: S, with_buffer: bool) -> Result<Text, Error> {
    let mut buffer = String::new();
    let mut localities = Vec::new();
    let mut breakers = Vec::new();
    // Byte length of the (possibly unmaterialized) buffer; tracked separately
    // so offsets stay correct even when `with_buffer` is false.
    let mut buffer_len = 0;

    while let Some(local_se) = source.next_char().map_err(Error::TextParser)? {
        let (local, se) = local_se.into_inner();
        let c = match se {
            SourceEvent::Char(c) => match c {
                // Normalize backtick (U+0060) to apostrophe (U+0027); both are
                // one byte in UTF-8, so offsets are unaffected.
                '\u{0060}' => '\u{0027}',
                _ => c,
            },
            SourceEvent::Breaker(b) => {
                // Map each breaker to a placeholder character, and decide
                // whether it also becomes a recorded `InnerBound`.
                let (c, opt_b) = match b {
                    Breaker::None => continue,
                    Breaker::Space => (' ', None),
                    Breaker::Line => ('\n', None),
                    // Word break: zero-width space placeholder.
                    Breaker::Word => ('\u{200B}', Some(b)),
                    Breaker::Sentence | Breaker::Paragraph | Breaker::Section => ('\n', Some(b)),
                };
                if let Some(b) = opt_b {
                    let br = InnerBound {
                        bytes: Snip {
                            offset: buffer_len,
                            length: c.len_utf8(),
                        },
                        chars: Snip {
                            offset: localities.len(),
                            length: 1,
                        },
                        breaker: b,
                        original: Some(local),
                    };
                    breakers.push(br);
                }
                c
            }
        };

        // Locality of this character within the normalized buffer.
        // NOTE(review): the byte snip is passed first and the char snip
        // second, which is the opposite order of the `localize` calls in the
        // test helpers below (chars first) — verify the parameter order of
        // `Localize::localize`.
        let buf_local = ().localize(
            Snip {
                offset: buffer_len,
                length: c.len_utf8(),
            },
            Snip {
                offset: localities.len(),
                length: 1,
            },
        );
        if with_buffer {
            buffer.push(c);
        }
        buffer_len += c.len_utf8();
        localities.push(TextLocality {
            buffer: buf_local,
            original: local,
        });
    }
    Ok(Text {
        buffer: Arc::new(buffer),
        localities: Arc::new(localities),
        breakers: Arc::new(breakers),
    })
}
270
/// Per-character pair of locations: where the character sits in the
/// normalized buffer and where it came from in the original input.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub struct TextLocality {
    /// Location in the normalized text buffer.
    pub buffer: Local<()>,
    /// Location in the original source.
    pub original: Local<()>,
}
276
/// An owned, normalized text plus the metadata needed to map tokens back to
/// the original input. All fields are `Arc`-shared so `shared_text` is cheap.
#[derive(Debug)]
pub struct Text {
    // Normalized text (see `inner_new` for the normalization applied).
    buffer: Arc<String>,
    // Per-character buffer/original locality pairs.
    localities: Arc<Vec<TextLocality>>,
    // Word/sentence/paragraph/section bounds found during parsing.
    breakers: Arc<Vec<InnerBound>>,
}
impl Text {
    /// Parses `source`, materializing the normalized text buffer.
    pub fn new<S: Source>(source: S) -> Result<Text, Error> {
        inner_new(source, true)
    }
    /// Slice of the normalized buffer covered by `token`.
    ///
    /// # Panics
    /// Panics if the token's byte snip is out of range or not on a char
    /// boundary; this is expected to hold by construction for tokens produced
    /// from this `Text`.
    pub fn token_text<'s>(&'s self, token: &TextToken) -> &'s str {
        let Snip {
            offset: begin,
            length: len,
        } = token.locality.bytes();
        let end = begin + len;
        &self.buffer[begin..end]
    }
    /// The normalized text.
    pub fn text(&self) -> &str {
        self.buffer.as_ref()
    }
    /// Original-input locality of the `idx`-th normalized character, if any.
    pub fn original_locality(&self, idx: usize) -> Option<Local<()>> {
        self.localities.get(idx).map(|tl| tl.original)
    }
    /// All per-character buffer/original locality pairs.
    pub fn localities(&self) -> &Vec<TextLocality> {
        self.localities.as_ref()
    }
    /// Cheap copy sharing the underlying data (`Arc` clones only).
    pub fn shared_text(&self) -> Text {
        Text {
            buffer: self.buffer.clone(),
            localities: self.localities.clone(),
            breakers: self.breakers.clone(),
        }
    }
}
312
313impl TryFrom<String> for Text {
314 type Error = Error;
315
316 fn try_from(s: String) -> Result<Text, Error> {
317 let mut text = inner_new((&s).into_source(), false)?;
318 text.buffer = Arc::new(s);
319 Ok(text)
320 }
321}
322
impl TryFrom<&str> for Text {
    type Error = Error;

    /// Parses a borrowed string into an owned, normalized `Text`.
    fn try_from(s: &str) -> Result<Text, Error> {
        Text::new(s.into_source())
    }
}
330
/// Text-structure boundaries surfaced as pseudo-tokens (see `Token2::Bound`).
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Bound {
    Sentence,
    Paragraph,
    Section,
}
337
/// A token with its location in the normalized buffer and, when available, in
/// the original input ("strings" build).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub struct TextToken {
    // Location in the normalized text buffer.
    locality: Local<()>,
    // Location in the original input; `None` when the token has no
    // counterpart there.
    original: Option<Local<()>>,
    pub token: Token2,
}
345
/// A token with its location in the normalized buffer and, when available, in
/// the original input (payload-free build; `Copy` because `Token2` is).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub struct TextToken {
    // Location in the normalized text buffer.
    locality: Local<()>,
    // Location in the original input; `None` when the token has no
    // counterpart there.
    original: Option<Local<()>>,
    pub token: Token2,
}
353
#[cfg(test)]
impl TextToken {
    /// Test helper: converts into the original-input locality paired with the
    /// plain `Token`; yields `None` for bound pseudo-tokens and for tokens
    /// without an original locality.
    ///
    /// Rewritten with `?` instead of a manual `match` on the `Option`.
    fn into_original_token_1(self) -> Option<Local<Token>> {
        let original = self.original?;
        self.token.into_token().map(|t| original.local(t))
    }
}
363
364impl TextToken {
365 pub fn local(&self) -> Local<()> {
366 self.locality
367 }
368 pub fn original(&self) -> Option<Local<()>> {
369 self.original
370 }
371 pub fn into_position(mut self) -> TextToken {
372 self.locality = self.locality.into_position();
373 self.original = self.original.map(|or| or.into_position());
374 self
375 }
376 pub fn try_as_token(&self) -> Result<Token, Bound> {
377 self.token.try_as_token()
378 }
379 pub fn as_original_token(&self) -> Option<Local<&Token2>> {
380 self.original.map(|original| original.local(&self.token))
381 }
382 pub fn into_original_token(self) -> Option<Local<Token2>> {
383 self.original.map(|original| original.local(self.token))
384 }
385 pub fn original_str<'s>(&self, original: &'s str) -> Result<&'s str, OriginalError> {
386 match self.original {
387 Some(local) => {
388 let Snip {
389 offset: begin,
390 length: len,
391 } = local.bytes();
392 let end = begin + len;
393 match original.get(begin..end) {
394 Some(s) => Ok(s),
395 None => Err(OriginalError::InvalidSnip),
396 }
397 }
398 None => Err(OriginalError::NoOriginal),
399 }
400 }
401
402 pub fn test_token(lt: Local<Token2>) -> TextToken {
403 let (local, token) = lt.into_inner();
404 TextToken {
405 locality: local,
406 original: Some(local.local(())),
407 token,
408 }
409 }
410 pub fn test_new(token: Token2, local: Local<()>, original: Option<Local<()>>) -> TextToken {
411 TextToken {
412 locality: local,
413 original,
414 token,
415 }
416 }
417}
418
/// Failure modes of `TextToken::original_str`.
#[derive(Debug)]
pub enum OriginalError {
    /// The token has no locality in the original input.
    NoOriginal,
    /// The recorded byte range does not fit the provided string.
    InvalidSnip,
}
450
/// A `Token` extended with structural `Bound` pseudo-tokens ("strings" build).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token2 {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),

    /// Sentence/paragraph/section boundary marker.
    Bound(Bound),
}
/// A `Token` extended with structural `Bound` pseudo-tokens (payload-free
/// build; `Copy` because all variants are).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token2 {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),

    /// Sentence/paragraph/section boundary marker.
    Bound(Bound),
}
impl From<Token> for Token2 {
    /// Lifts a plain token; the `Bound` variant is never produced here.
    fn from(t: Token) -> Token2 {
        match t {
            Token::Word(w) => Token2::Word(w),
            Token::Struct(s) => Token2::Struct(s),
            Token::Special(s) => Token2::Special(s),
            Token::Unicode(u) => Token2::Unicode(u),
        }
    }
}
impl Token2 {
    /// Non-consuming variant of `try_into_token`.
    // Without the "strings" feature `Token2` is `Copy`, so a dereference
    // suffices.
    #[cfg(not(feature = "strings"))]
    fn try_as_token(&self) -> Result<Token, Bound> {
        (*self).try_into_token()
    }

    /// Non-consuming variant of `try_into_token`.
    // With the "strings" feature the payloads own `String`s, so a clone is
    // required.
    #[cfg(feature = "strings")]
    fn try_as_token(&self) -> Result<Token, Bound> {
        self.clone().try_into_token()
    }

    /// Splits into either the plain `Token` (`Ok`) or the structural `Bound`
    /// (`Err`).
    fn try_into_token(self) -> Result<Token, Bound> {
        match self {
            Token2::Word(w) => Ok(Token::Word(w)),
            Token2::Struct(s) => Ok(Token::Struct(s)),
            Token2::Special(s) => Ok(Token::Special(s)),
            Token2::Unicode(u) => Ok(Token::Unicode(u)),
            Token2::Bound(b) => Err(b),
        }
    }
}
#[cfg(test)]
impl Token2 {
    /// Test helper: like `try_into_token`, but discarding the bound payload.
    ///
    /// Delegates to `try_into_token` instead of duplicating the variant
    /// mapping, so the two cannot drift apart.
    fn into_token(self) -> Option<Token> {
        self.try_into_token().ok()
    }
}
521
#[cfg(test)]
mod test_v0_5 {
    use super::*;
    use text_parsing::{entities, tagger, IntoPipeParser, IntoSource, ParserExt, SourceExt};

    // NOTE(review): debug scaffolding, not a real test — there is no `#[test]`
    // attribute, so this function never runs, and it ends in an unconditional
    // `panic!()`. Promote it to a proper test with assertions, or delete it.
    fn basic() {
        // NOTE(review): runs of whitespace inside this literal may have been
        // mangled by tooling; verify against version control before enabling.
        let uws = "<p>Oxana Putan shared the quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc.</p><p> qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово</p>";
        // Pipe the raw string through the `text_parsing` tag and entity
        // parsers before building the normalized `Text`.
        let text = Text::new({
            uws.into_source()
                .pipe(tagger::Builder::new().create().into_breaker())
                .pipe(entities::Builder::new().create().into_piped())
                .into_separator()
        })
        .unwrap();
        let lib_res = text
            .into_tokenizer({
                TokenizerParams::default()
                    .add_option(TokenizerOptions::SplitDot)
                    .add_option(TokenizerOptions::SplitUnderscore)
                    .add_option(TokenizerOptions::SplitColon)
                    .with_default_sentences()
            })
            .collect::<Vec<_>>();

        // Dump each token with its char/byte localities and original slice.
        for tok in lib_res {
            println!(
                "C{:?}, B{:?}, {:?} -> {:?}",
                tok.original.map(|loc| loc.chars()),
                tok.original.map(|loc| loc.bytes()),
                tok.token,
                tok.original_str(uws)
            );
        }

        panic!()
    }
}
574
575#[cfg(test)]
576#[cfg(feature = "strings")]
577mod test {
578 use super::*;
579 use text_parsing::{
580 entities, tagger, IntoPipeParser, IntoSource, Localize, ParserExt, SourceExt,
581 };
582
    // An expected token spelled with explicit byte *and* char coordinates
    // (unlike `PositionalToken`, which derives chars from bytes).
    #[derive(Debug, Clone)]
    struct CharToken {
        byte_offset: usize,
        byte_length: usize,
        char_offset: usize,
        char_length: usize,
        token: Token,
    }
651 impl Into<Local<Token>> for CharToken {
652 fn into(self) -> Local<Token> {
653 self.token.localize(
654 Snip {
655 offset: self.char_offset,
656 length: self.char_length,
657 },
658 Snip {
659 offset: self.byte_offset,
660 length: self.byte_length,
661 },
662 )
663 }
664 }
665
    // An expected token positioned by byte offset/length into `source`; char
    // coordinates are derived on conversion (see the `From` impl below).
    #[derive(Debug, Clone)]
    struct PositionalToken {
        source: &'static str,
        offset: usize,
        length: usize,
        token: Token,
    }
673 impl Into<Local<Token>> for PositionalToken {
674 fn into(self) -> Local<Token> {
675 self.token.localize(
676 Snip {
677 offset: self.source[..self.offset].chars().count(),
678 length: self.source[self.offset..self.offset + self.length]
679 .chars()
680 .count(),
681 },
682 Snip {
683 offset: self.offset,
684 length: self.length,
685 },
686 )
687 }
688 }
689
690 fn check_results(result: &Vec<PositionalToken>, lib_res: &Vec<Local<Token>>, _uws: &str) {
691 assert_eq!(result.len(), lib_res.len());
692 for i in 0..result.len() {
693 let res: Local<Token> = result[i].clone().into();
694 assert_eq!(res, lib_res[i]);
695 }
696 }
697
698 fn check_cresults(result: &Vec<CharToken>, lib_res: &Vec<Local<Token>>, _uws: &str) {
699 assert_eq!(result.len(), lib_res.len());
700 for i in 0..result.len() {
701 let res: Local<Token> = result[i].clone().into();
702 assert_eq!(res, lib_res[i]);
703 }
704 }
705
706 fn check<T: Clone + std::fmt::Debug + Into<Local<Token>>>(
707 res: &Vec<T>,
708 lib: &Vec<Local<Token>>,
709 _uws: &str,
710 ) {
711 let mut lib = lib.iter();
712 let mut res = res.iter().map(|r| {
713 let res: Local<Token> = r.clone().into();
714 res
715 });
716 let mut diff = Vec::new();
717 loop {
718 match (lib.next(), res.next()) {
719 (Some(lw), Some(rw)) => {
720 if *lw != rw {
721 diff.push(format!("LIB: {:?}", lw));
722 diff.push(format!("TEST: {:?}", rw));
723 diff.push("".to_string())
724 }
725 }
726 (Some(lw), None) => {
727 diff.push(format!("LIB: {:?}", lw));
728 diff.push("TEST: ----".to_string());
729 diff.push("".to_string())
730 }
731 (None, Some(rw)) => {
732 diff.push("LIB: ----".to_string());
733 diff.push(format!("TEST: {:?}", rw));
734 diff.push("".to_string())
735 }
736 (None, None) => break,
737 }
738 }
739 if diff.len() > 0 {
740 for ln in &diff {
741 println!("{}", ln);
742 }
743 panic!("Diff count: {}", diff.len() / 3);
744 }
745 }
746
747 #[test]
748 fn spaces() {
749 let uws = " spaces too many apces ";
750 let result = vec![
751 PositionalToken {
752 source: uws,
753 offset: 0,
754 length: 4,
755 token: Token::Special(Special::Separator(Separator::Space)),
756 },
757 PositionalToken {
758 source: uws,
759 offset: 4,
760 length: 6,
761 token: Token::Word(Word::Word("spaces".to_string())),
762 },
763 PositionalToken {
764 source: uws,
765 offset: 10,
766 length: 4,
767 token: Token::Special(Special::Separator(Separator::Space)),
768 },
769 PositionalToken {
770 source: uws,
771 offset: 14,
772 length: 3,
773 token: Token::Word(Word::Word("too".to_string())),
774 },
775 PositionalToken {
776 source: uws,
777 offset: 17,
778 length: 3,
779 token: Token::Special(Special::Separator(Separator::Space)),
780 },
781 PositionalToken {
782 source: uws,
783 offset: 20,
784 length: 4,
785 token: Token::Word(Word::Word("many".to_string())),
786 },
787 PositionalToken {
788 source: uws,
789 offset: 24,
790 length: 3,
791 token: Token::Special(Special::Separator(Separator::Space)),
792 },
793 PositionalToken {
794 source: uws,
795 offset: 27,
796 length: 5,
797 token: Token::Word(Word::Word("apces".to_string())),
798 },
799 PositionalToken {
800 source: uws,
801 offset: 32,
802 length: 3,
803 token: Token::Special(Special::Separator(Separator::Space)),
804 },
805 ];
806 let lib_res = uws
807 .into_tokenizer(TokenizerParams::v1())
808 .collect::<Vec<_>>();
809 check_results(&result, &lib_res, uws);
810 }
812
813 #[test]
814 fn numbers() {
815 let uws = "(() -2\n() -2";
816 let result = vec![
817 PositionalToken {
818 source: uws,
819 offset: 0,
820 length: 1,
821 token: Token::Special(Special::Punctuation('(')),
822 },
823 PositionalToken {
824 source: uws,
825 offset: 1,
826 length: 1,
827 token: Token::Special(Special::Punctuation('(')),
828 },
829 PositionalToken {
830 source: uws,
831 offset: 2,
832 length: 1,
833 token: Token::Special(Special::Punctuation(')')),
834 },
835 PositionalToken {
836 source: uws,
837 offset: 3,
838 length: 1,
839 token: Token::Special(Special::Separator(Separator::Space)),
840 },
841 PositionalToken {
842 source: uws,
843 offset: 4,
844 length: 2,
845 token: Token::Word(Word::Number(Number::Integer(-2))),
846 },
847 PositionalToken {
848 source: uws,
849 offset: 6,
850 length: 1,
851 token: Token::Special(Special::Separator(Separator::Newline)),
852 },
853 PositionalToken {
854 source: uws,
855 offset: 7,
856 length: 1,
857 token: Token::Special(Special::Punctuation('(')),
858 },
859 PositionalToken {
860 source: uws,
861 offset: 8,
862 length: 1,
863 token: Token::Special(Special::Punctuation(')')),
864 },
865 PositionalToken {
866 source: uws,
867 offset: 9,
868 length: 2,
869 token: Token::Special(Special::Separator(Separator::Space)),
870 },
871 PositionalToken {
872 source: uws,
873 offset: 11,
874 length: 2,
875 token: Token::Word(Word::Number(Number::Integer(-2))),
876 },
877 ];
878 let lib_res = uws
879 .into_tokenizer({
880 TokenizerParams::default()
881 .add_option(TokenizerOptions::SplitDot)
882 .add_option(TokenizerOptions::SplitUnderscore)
883 .add_option(TokenizerOptions::SplitColon)
884 .add_option(TokenizerOptions::MergeWhites)
885 })
886 .collect::<Vec<_>>();
887 check_results(&result, &lib_res, uws);
888 }
889
890 #[test]
891 fn word_with_inner_hyphens() {
892 let uws = "Опросы показывают";
893 let result = vec![
894 PositionalToken {
895 source: uws,
896 offset: 0,
897 length: 14,
898 token: Token::Word(Word::StrangeWord("Опросы".to_string())),
899 },
900 PositionalToken {
901 source: uws,
902 offset: 14,
903 length: 1,
904 token: Token::Special(Special::Separator(Separator::Space)),
905 },
906 PositionalToken {
907 source: uws,
908 offset: 15,
909 length: 28,
910 token: Token::Word(Word::StrangeWord("показывают".to_string())),
911 },
912 ];
913 let lib_res = uws
914 .into_tokenizer(TokenizerParams::v1())
915 .collect::<Vec<_>>();
916 check_results(&result, &lib_res, uws);
917 }
918
919 #[test]
920 fn mixed_but_word() {
921 let uws = "L’Oreal";
922 let result = vec![PositionalToken {
923 source: uws,
924 offset: 0,
925 length: 9,
926 token: Token::Word(Word::StrangeWord("L’Oreal".to_string())),
927 }];
928 let lib_res = uws
929 .into_tokenizer(TokenizerParams::v1())
930 .collect::<Vec<_>>();
931 check_results(&result, &lib_res, uws);
932 }
933
934 #[test]
935 fn hashtags() {
936 let uws = "#hashtag#hashtag2";
937 let result = vec![
938 PositionalToken {
939 source: uws,
940 offset: 0,
941 length: 1,
942 token: Token::Special(Special::Punctuation('#')),
943 },
944 PositionalToken {
945 source: uws,
946 offset: 1,
947 length: 7,
948 token: Token::Word(Word::Word("hashtag".to_string())),
949 },
950 PositionalToken {
951 source: uws,
952 offset: 8,
953 length: 1,
954 token: Token::Special(Special::Punctuation('#')),
955 },
956 PositionalToken {
957 source: uws,
958 offset: 9,
959 length: 8,
960 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
961 "hashtag2".to_string(),
962 ))),
963 },
964 ];
965 let lib_res = uws
966 .into_tokenizer(TokenizerParams::v1())
967 .collect::<Vec<_>>();
968 check_results(&result, &lib_res, uws);
969 }
970
971 #[test]
972 fn apostrophe() {
973 let uws = "l'oreal; l\u{0060}oreal";
974 let result = vec![
975 PositionalToken {
976 source: uws,
977 offset: 0,
978 length: 7,
979 token: Token::Word(Word::Word("l'oreal".to_string())),
980 },
981 PositionalToken {
982 source: uws,
983 offset: 7,
984 length: 1,
985 token: Token::Special(Special::Punctuation(';')),
986 },
987 PositionalToken {
988 source: uws,
989 offset: 8,
990 length: 1,
991 token: Token::Special(Special::Separator(Separator::Space)),
992 },
993 PositionalToken {
994 source: uws,
995 offset: 9,
996 length: 7,
997 token: Token::Word(Word::Word("l'oreal".to_string())),
998 },
999 ];
1000 let text = Text::new(uws.into_source()).unwrap();
1001 let lib_res = text
1002 .into_tokenizer(TokenizerParams::v1())
1003 .filter_map(|tt| tt.into_original_token_1())
1004 .collect::<Vec<_>>();
1005 check_results(&result, &lib_res, uws);
1006 }
1007
1008 #[test]
1009 fn char_tokens() {
1010 let uws = "[Oxana Putan|1712640565] shared the quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово";
1011 let result = vec![
1012 CharToken {
1013 byte_offset: 0,
1014 byte_length: 1,
1015 char_offset: 0,
1016 char_length: 1,
1017 token: Token::Special(Special::Punctuation('[')),
1018 },
1019 CharToken {
1020 byte_offset: 1,
1021 byte_length: 5,
1022 char_offset: 1,
1023 char_length: 5,
1024 token: Token::Word(Word::Word("Oxana".to_string())),
1025 },
1026 CharToken {
1027 byte_offset: 6,
1028 byte_length: 1,
1029 char_offset: 6,
1030 char_length: 1,
1031 token: Token::Special(Special::Separator(Separator::Space)),
1032 },
1033 CharToken {
1034 byte_offset: 7,
1035 byte_length: 5,
1036 char_offset: 7,
1037 char_length: 5,
1038 token: Token::Word(Word::Word("Putan".to_string())),
1039 },
1040 CharToken {
1041 byte_offset: 12,
1042 byte_length: 1,
1043 char_offset: 12,
1044 char_length: 1,
1045 token: Token::Special(Special::Punctuation('|')),
1046 },
1047 CharToken {
1048 byte_offset: 13,
1049 byte_length: 10,
1050 char_offset: 13,
1051 char_length: 10,
1052 token: Token::Word(Word::Number(Number::Integer(1712640565))),
1053 },
1054 CharToken {
1055 byte_offset: 23,
1056 byte_length: 1,
1057 char_offset: 23,
1058 char_length: 1,
1059 token: Token::Special(Special::Punctuation(']')),
1060 },
1061 CharToken {
1069 byte_offset: 24,
1070 byte_length: 1,
1071 char_offset: 24,
1072 char_length: 1,
1073 token: Token::Special(Special::Separator(Separator::Space)),
1074 },
1075 CharToken {
1076 byte_offset: 25,
1077 byte_length: 6,
1078 char_offset: 25,
1079 char_length: 6,
1080 token: Token::Word(Word::Word("shared".to_string())),
1081 },
1082 CharToken {
1083 byte_offset: 31,
1084 byte_length: 1,
1085 char_offset: 31,
1086 char_length: 1,
1087 token: Token::Special(Special::Separator(Separator::Space)),
1088 },
1089 CharToken {
1090 byte_offset: 32,
1091 byte_length: 3,
1092 char_offset: 32,
1093 char_length: 3,
1094 token: Token::Word(Word::Word("the".to_string())),
1095 },
1096 CharToken {
1097 byte_offset: 35,
1098 byte_length: 1,
1099 char_offset: 35,
1100 char_length: 1,
1101 token: Token::Special(Special::Separator(Separator::Space)),
1102 },
1103 CharToken {
1104 byte_offset: 36,
1105 byte_length: 5,
1106 char_offset: 36,
1107 char_length: 5,
1108 token: Token::Word(Word::Word("quick".to_string())),
1109 },
1110 CharToken {
1111 byte_offset: 41,
1112 byte_length: 1,
1113 char_offset: 41,
1114 char_length: 1,
1115 token: Token::Special(Special::Separator(Separator::Space)),
1116 },
1117 CharToken {
1118 byte_offset: 42,
1119 byte_length: 1,
1120 char_offset: 42,
1121 char_length: 1,
1122 token: Token::Special(Special::Punctuation('(')),
1123 },
1124 CharToken {
1125 byte_offset: 43,
1126 byte_length: 1,
1127 char_offset: 43,
1128 char_length: 1,
1129 token: Token::Special(Special::Punctuation('"')),
1130 },
1131 CharToken {
1132 byte_offset: 44,
1133 byte_length: 5,
1134 char_offset: 44,
1135 char_length: 5,
1136 token: Token::Word(Word::Word("brown".to_string())),
1137 },
1138 CharToken {
1139 byte_offset: 49,
1140 byte_length: 1,
1141 char_offset: 49,
1142 char_length: 1,
1143 token: Token::Special(Special::Punctuation('"')),
1144 },
1145 CharToken {
1146 byte_offset: 50,
1147 byte_length: 1,
1148 char_offset: 50,
1149 char_length: 1,
1150 token: Token::Special(Special::Punctuation(')')),
1151 },
1152 CharToken {
1153 byte_offset: 51,
1154 byte_length: 1,
1155 char_offset: 51,
1156 char_length: 1,
1157 token: Token::Special(Special::Separator(Separator::Space)),
1158 },
1159 CharToken {
1160 byte_offset: 52,
1161 byte_length: 3,
1162 char_offset: 52,
1163 char_length: 3,
1164 token: Token::Word(Word::Word("fox".to_string())),
1165 },
1166 CharToken {
1167 byte_offset: 55,
1168 byte_length: 1,
1169 char_offset: 55,
1170 char_length: 1,
1171 token: Token::Special(Special::Separator(Separator::Space)),
1172 },
1173 CharToken {
1174 byte_offset: 56,
1175 byte_length: 5,
1176 char_offset: 56,
1177 char_length: 5,
1178 token: Token::Word(Word::Word("can\'t".to_string())),
1179 },
1180 CharToken {
1181 byte_offset: 61,
1182 byte_length: 1,
1183 char_offset: 61,
1184 char_length: 1,
1185 token: Token::Special(Special::Separator(Separator::Space)),
1186 },
1187 CharToken {
1188 byte_offset: 62,
1189 byte_length: 4,
1190 char_offset: 62,
1191 char_length: 4,
1192 token: Token::Word(Word::Word("jump".to_string())),
1193 },
1194 CharToken {
1195 byte_offset: 66,
1196 byte_length: 1,
1197 char_offset: 66,
1198 char_length: 1,
1199 token: Token::Special(Special::Separator(Separator::Space)),
1200 },
1201 CharToken {
1202 byte_offset: 67,
1203 byte_length: 4,
1204 char_offset: 67,
1205 char_length: 4,
1206 token: Token::Word(Word::Number(Number::Float(32.3))),
1207 },
1208 CharToken {
1209 byte_offset: 71,
1210 byte_length: 1,
1211 char_offset: 71,
1212 char_length: 1,
1213 token: Token::Special(Special::Separator(Separator::Space)),
1214 },
1215 CharToken {
1216 byte_offset: 72,
1217 byte_length: 4,
1218 char_offset: 72,
1219 char_length: 4,
1220 token: Token::Word(Word::Word("feet".to_string())),
1221 },
1222 CharToken {
1223 byte_offset: 76,
1224 byte_length: 1,
1225 char_offset: 76,
1226 char_length: 1,
1227 token: Token::Special(Special::Punctuation(',')),
1228 },
1229 CharToken {
1230 byte_offset: 77,
1231 byte_length: 1,
1232 char_offset: 77,
1233 char_length: 1,
1234 token: Token::Special(Special::Separator(Separator::Space)),
1235 },
1236 CharToken {
1237 byte_offset: 78,
1238 byte_length: 5,
1239 char_offset: 78,
1240 char_length: 5,
1241 token: Token::Word(Word::Word("right".to_string())),
1242 },
1243 CharToken {
1244 byte_offset: 83,
1245 byte_length: 1,
1246 char_offset: 83,
1247 char_length: 1,
1248 token: Token::Special(Special::Punctuation('?')),
1249 },
1250 CharToken {
1251 byte_offset: 84,
1252 byte_length: 1,
1253 char_offset: 84,
1254 char_length: 1,
1255 token: Token::Special(Special::Separator(Separator::Space)),
1256 },
1257 CharToken {
1258 byte_offset: 85,
1259 byte_length: 4,
1260 char_offset: 85,
1261 char_length: 4,
1262 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
1263 },
1264 CharToken {
1265 byte_offset: 89,
1266 byte_length: 1,
1267 char_offset: 89,
1268 char_length: 1,
1269 token: Token::Special(Special::Separator(Separator::Space)),
1270 },
1271 CharToken {
1272 byte_offset: 90,
1273 byte_length: 3,
1274 char_offset: 90,
1275 char_length: 3,
1276 token: Token::Word(Word::Word("etc".to_string())),
1277 },
1278 CharToken {
1279 byte_offset: 93,
1280 byte_length: 1,
1281 char_offset: 93,
1282 char_length: 1,
1283 token: Token::Special(Special::Punctuation('.')),
1284 },
1285 CharToken {
1286 byte_offset: 94,
1287 byte_length: 1,
1288 char_offset: 94,
1289 char_length: 1,
1290 token: Token::Special(Special::Separator(Separator::Space)),
1291 },
1292 CharToken {
1293 byte_offset: 95,
1294 byte_length: 3,
1295 char_offset: 95,
1296 char_length: 3,
1297 token: Token::Word(Word::Word("qeq".to_string())),
1298 },
1299 CharToken {
1300 byte_offset: 98,
1301 byte_length: 1,
1302 char_offset: 98,
1303 char_length: 1,
1304 token: Token::Special(Special::Separator(Separator::Space)),
1305 },
1306 CharToken {
1307 byte_offset: 99,
1308 byte_length: 5,
1309 char_offset: 99,
1310 char_length: 5,
1311 token: Token::Word(Word::Word("U.S.A".to_string())),
1312 },
1313 CharToken {
1314 byte_offset: 104,
1315 byte_length: 2,
1316 char_offset: 104,
1317 char_length: 2,
1318 token: Token::Special(Special::Separator(Separator::Space)),
1319 },
1320 CharToken {
1321 byte_offset: 106,
1322 byte_length: 3,
1323 char_offset: 106,
1324 char_length: 3,
1325 token: Token::Word(Word::Word("asd".to_string())),
1326 },
1327 CharToken {
1328 byte_offset: 109,
1329 byte_length: 3,
1330 char_offset: 109,
1331 char_length: 3,
1332 token: Token::Special(Special::Separator(Separator::Newline)),
1333 },
1334 CharToken {
1335 byte_offset: 112,
1336 byte_length: 3,
1337 char_offset: 112,
1338 char_length: 3,
1339 token: Token::Word(Word::Word("Brr".to_string())),
1340 },
1341 CharToken {
1342 byte_offset: 115,
1343 byte_length: 1,
1344 char_offset: 115,
1345 char_length: 1,
1346 token: Token::Special(Special::Punctuation(',')),
1347 },
1348 CharToken {
1349 byte_offset: 116,
1350 byte_length: 1,
1351 char_offset: 116,
1352 char_length: 1,
1353 token: Token::Special(Special::Separator(Separator::Space)),
1354 },
1355 CharToken {
1356 byte_offset: 117,
1357 byte_length: 4,
1358 char_offset: 117,
1359 char_length: 4,
1360 token: Token::Word(Word::Word("it\'s".to_string())),
1361 },
1362 CharToken {
1363 byte_offset: 121,
1364 byte_length: 1,
1365 char_offset: 121,
1366 char_length: 1,
1367 token: Token::Special(Special::Separator(Separator::Space)),
1368 },
1369 CharToken {
1370 byte_offset: 122,
1371 byte_length: 4,
1372 char_offset: 122,
1373 char_length: 4,
1374 token: Token::Word(Word::Number(Number::Float(29.3))),
1375 },
1376 CharToken {
1377 byte_offset: 126,
1378 byte_length: 2,
1379 char_offset: 126,
1380 char_length: 1,
1381 token: Token::Special(Special::Symbol('°')),
1382 },
1383 CharToken {
1384 byte_offset: 128,
1385 byte_length: 1,
1386 char_offset: 127,
1387 char_length: 1,
1388 token: Token::Word(Word::Word("F".to_string())),
1389 },
1390 CharToken {
1391 byte_offset: 129,
1392 byte_length: 1,
1393 char_offset: 128,
1394 char_length: 1,
1395 token: Token::Special(Special::Punctuation('!')),
1396 },
1397 CharToken {
1398 byte_offset: 130,
1399 byte_length: 1,
1400 char_offset: 129,
1401 char_length: 1,
1402 token: Token::Special(Special::Separator(Separator::Newline)),
1403 },
1404 CharToken {
1405 byte_offset: 131,
1406 byte_length: 1,
1407 char_offset: 130,
1408 char_length: 1,
1409 token: Token::Special(Special::Separator(Separator::Space)),
1410 },
1411 CharToken {
1412 byte_offset: 132,
1413 byte_length: 14,
1414 char_offset: 131,
1415 char_length: 7,
1416 token: Token::Word(Word::Word("Русское".to_string())),
1417 },
1418 CharToken {
1419 byte_offset: 146,
1420 byte_length: 1,
1421 char_offset: 138,
1422 char_length: 1,
1423 token: Token::Special(Special::Separator(Separator::Space)),
1424 },
1425 CharToken {
1426 byte_offset: 147,
1427 byte_length: 22,
1428 char_offset: 139,
1429 char_length: 11,
1430 token: Token::Word(Word::Word("предложение".to_string())),
1431 },
1432 CharToken {
1433 byte_offset: 169,
1434 byte_length: 1,
1435 char_offset: 150,
1436 char_length: 1,
1437 token: Token::Special(Special::Separator(Separator::Space)),
1438 },
1439 CharToken {
1440 byte_offset: 170,
1441 byte_length: 5,
1442 char_offset: 151,
1443 char_length: 5,
1444 token: Token::Struct(Struct::Hashtag("36.6".to_string())),
1445 },
1446 CharToken {
1447 byte_offset: 175,
1448 byte_length: 1,
1449 char_offset: 156,
1450 char_length: 1,
1451 token: Token::Special(Special::Separator(Separator::Space)),
1452 },
1453 CharToken {
1454 byte_offset: 176,
1455 byte_length: 6,
1456 char_offset: 157,
1457 char_length: 3,
1458 token: Token::Word(Word::Word("для".to_string())),
1459 },
1460 CharToken {
1461 byte_offset: 182,
1462 byte_length: 1,
1463 char_offset: 160,
1464 char_length: 1,
1465 token: Token::Special(Special::Separator(Separator::Space)),
1466 },
1467 CharToken {
1468 byte_offset: 183,
1469 byte_length: 24,
1470 char_offset: 161,
1471 char_length: 12,
1472 token: Token::Word(Word::Word("тестирования".to_string())),
1473 },
1474 CharToken {
1475 byte_offset: 207,
1476 byte_length: 1,
1477 char_offset: 173,
1478 char_length: 1,
1479 token: Token::Special(Special::Separator(Separator::Space)),
1480 },
1481 CharToken {
1482 byte_offset: 208,
1483 byte_length: 14,
1484 char_offset: 174,
1485 char_length: 7,
1486 token: Token::Word(Word::Word("деления".to_string())),
1487 },
1488 CharToken {
1489 byte_offset: 222,
1490 byte_length: 1,
1491 char_offset: 181,
1492 char_length: 1,
1493 token: Token::Special(Special::Separator(Separator::Space)),
1494 },
1495 CharToken {
1496 byte_offset: 223,
1497 byte_length: 4,
1498 char_offset: 182,
1499 char_length: 2,
1500 token: Token::Word(Word::Word("по".to_string())),
1501 },
1502 CharToken {
1503 byte_offset: 227,
1504 byte_length: 1,
1505 char_offset: 184,
1506 char_length: 1,
1507 token: Token::Special(Special::Separator(Separator::Space)),
1508 },
1509 CharToken {
1510 byte_offset: 228,
1511 byte_length: 12,
1512 char_offset: 185,
1513 char_length: 6,
1514 token: Token::Word(Word::Word("юникод".to_string())),
1515 },
1516 CharToken {
1517 byte_offset: 240,
1518 byte_length: 1,
1519 char_offset: 191,
1520 char_length: 1,
1521 token: Token::Special(Special::Punctuation('-')),
1522 },
1523 CharToken {
1524 byte_offset: 241,
1525 byte_length: 12,
1526 char_offset: 192,
1527 char_length: 6,
1528 token: Token::Word(Word::Word("словам".to_string())),
1529 },
1530 CharToken {
1531 byte_offset: 253,
1532 byte_length: 3,
1533 char_offset: 198,
1534 char_length: 3,
1535 token: Token::Special(Special::Punctuation('.')),
1536 },
1537 CharToken {
1538 byte_offset: 256,
1539 byte_length: 1,
1540 char_offset: 201,
1541 char_length: 1,
1542 token: Token::Special(Special::Separator(Separator::Newline)),
1543 },
1544 CharToken {
1545 byte_offset: 257,
1546 byte_length: 8,
1547 char_offset: 202,
1548 char_length: 2,
1549 token: Token::Word(Word::Emoji("russia")),
1550 },
1551 CharToken {
1552 byte_offset: 265,
1553 byte_length: 1,
1554 char_offset: 204,
1555 char_length: 1,
1556 token: Token::Special(Special::Separator(Separator::Space)),
1557 },
1558 CharToken {
1559 byte_offset: 266,
1560 byte_length: 8,
1561 char_offset: 205,
1562 char_length: 2,
1563 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
1564 },
1565 CharToken {
1566 byte_offset: 274,
1567 byte_length: 1,
1568 char_offset: 207,
1569 char_length: 1,
1570 token: Token::Special(Special::Separator(Separator::Newline)),
1571 },
1572 CharToken {
1573 byte_offset: 275,
1574 byte_length: 8,
1575 char_offset: 208,
1576 char_length: 2,
1577 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
1578 },
1579 CharToken {
1580 byte_offset: 283,
1581 byte_length: 8,
1582 char_offset: 210,
1583 char_length: 2,
1584 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
1585 },
1586 CharToken {
1587 byte_offset: 291,
1588 byte_length: 8,
1589 char_offset: 212,
1590 char_length: 2,
1591 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
1592 },
1593 CharToken {
1594 byte_offset: 299,
1595 byte_length: 1,
1596 char_offset: 214,
1597 char_length: 1,
1598 token: Token::Special(Special::Separator(Separator::Newline)),
1599 },
1600 CharToken {
1601 byte_offset: 300,
1602 byte_length: 1,
1603 char_offset: 215,
1604 char_length: 1,
1605 token: Token::Special(Special::Punctuation('+')),
1606 },
1607 CharToken {
1608 byte_offset: 301,
1609 byte_length: 4,
1610 char_offset: 216,
1611 char_length: 4,
1612 token: Token::Word(Word::Word("Done".to_string())),
1613 },
1614 CharToken {
1615 byte_offset: 305,
1616 byte_length: 1,
1617 char_offset: 220,
1618 char_length: 1,
1619 token: Token::Special(Special::Punctuation('!')),
1620 },
1621 CharToken {
1622 byte_offset: 306,
1623 byte_length: 1,
1624 char_offset: 221,
1625 char_length: 1,
1626 token: Token::Special(Special::Separator(Separator::Space)),
1627 },
1628 CharToken {
1629 byte_offset: 307,
1630 byte_length: 12,
1631 char_offset: 222,
1632 char_length: 6,
1633 token: Token::Word(Word::Word("Готово".to_string())),
1634 },
1635 ];
1636
1637 let lib_res = uws
1638 .into_tokenizer(TokenizerParams::complex())
1639 .collect::<Vec<_>>();
1640
1641 check_cresults(&result, &lib_res, uws);
1643 }
1644
1645 #[test]
1646 fn general_default() {
1647 let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
1648 let result = vec![
1649 PositionalToken {
1650 source: uws,
1651 offset: 0,
1652 length: 3,
1653 token: Token::Word(Word::Word("The".to_string())),
1654 },
1655 PositionalToken {
1656 source: uws,
1657 offset: 3,
1658 length: 1,
1659 token: Token::Special(Special::Separator(Separator::Space)),
1660 },
1661 PositionalToken {
1662 source: uws,
1663 offset: 4,
1664 length: 5,
1665 token: Token::Word(Word::Word("quick".to_string())),
1666 },
1667 PositionalToken {
1668 source: uws,
1669 offset: 9,
1670 length: 1,
1671 token: Token::Special(Special::Separator(Separator::Space)),
1672 },
1673 PositionalToken {
1674 source: uws,
1675 offset: 10,
1676 length: 1,
1677 token: Token::Special(Special::Punctuation('(')),
1678 },
1679 PositionalToken {
1680 source: uws,
1681 offset: 11,
1682 length: 1,
1683 token: Token::Special(Special::Punctuation('"')),
1684 },
1685 PositionalToken {
1686 source: uws,
1687 offset: 12,
1688 length: 5,
1689 token: Token::Word(Word::Word("brown".to_string())),
1690 },
1691 PositionalToken {
1692 source: uws,
1693 offset: 17,
1694 length: 1,
1695 token: Token::Special(Special::Punctuation('"')),
1696 },
1697 PositionalToken {
1698 source: uws,
1699 offset: 18,
1700 length: 1,
1701 token: Token::Special(Special::Punctuation(')')),
1702 },
1703 PositionalToken {
1704 source: uws,
1705 offset: 19,
1706 length: 1,
1707 token: Token::Special(Special::Separator(Separator::Space)),
1708 },
1709 PositionalToken {
1710 source: uws,
1711 offset: 20,
1712 length: 3,
1713 token: Token::Word(Word::Word("fox".to_string())),
1714 },
1715 PositionalToken {
1716 source: uws,
1717 offset: 23,
1718 length: 1,
1719 token: Token::Special(Special::Separator(Separator::Space)),
1720 },
1721 PositionalToken {
1722 source: uws,
1723 offset: 24,
1724 length: 5,
1725 token: Token::Word(Word::Word("can\'t".to_string())),
1726 },
1727 PositionalToken {
1728 source: uws,
1729 offset: 29,
1730 length: 1,
1731 token: Token::Special(Special::Separator(Separator::Space)),
1732 },
1733 PositionalToken {
1734 source: uws,
1735 offset: 30,
1736 length: 4,
1737 token: Token::Word(Word::Word("jump".to_string())),
1738 },
1739 PositionalToken {
1740 source: uws,
1741 offset: 34,
1742 length: 1,
1743 token: Token::Special(Special::Separator(Separator::Space)),
1744 },
1745 PositionalToken {
1746 source: uws,
1747 offset: 35,
1748 length: 4,
1749 token: Token::Word(Word::Number(Number::Float(32.3))),
1750 },
1751 PositionalToken {
1752 source: uws,
1753 offset: 39,
1754 length: 1,
1755 token: Token::Special(Special::Separator(Separator::Space)),
1756 },
1757 PositionalToken {
1758 source: uws,
1759 offset: 40,
1760 length: 4,
1761 token: Token::Word(Word::Word("feet".to_string())),
1762 },
1763 PositionalToken {
1764 source: uws,
1765 offset: 44,
1766 length: 1,
1767 token: Token::Special(Special::Punctuation(',')),
1768 },
1769 PositionalToken {
1770 source: uws,
1771 offset: 45,
1772 length: 1,
1773 token: Token::Special(Special::Separator(Separator::Space)),
1774 },
1775 PositionalToken {
1776 source: uws,
1777 offset: 46,
1778 length: 5,
1779 token: Token::Word(Word::Word("right".to_string())),
1780 },
1781 PositionalToken {
1782 source: uws,
1783 offset: 51,
1784 length: 1,
1785 token: Token::Special(Special::Punctuation('?')),
1786 },
1787 PositionalToken {
1788 source: uws,
1789 offset: 52,
1790 length: 1,
1791 token: Token::Special(Special::Separator(Separator::Space)),
1792 },
1793 PositionalToken {
1794 source: uws,
1795 offset: 53,
1796 length: 4,
1797 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
1798 }, PositionalToken {
1800 source: uws,
1801 offset: 57,
1802 length: 1,
1803 token: Token::Special(Special::Separator(Separator::Space)),
1804 },
1805 PositionalToken {
1806 source: uws,
1807 offset: 58,
1808 length: 3,
1809 token: Token::Word(Word::Word("etc".to_string())),
1810 },
1811 PositionalToken {
1812 source: uws,
1813 offset: 61,
1814 length: 1,
1815 token: Token::Special(Special::Punctuation('.')),
1816 },
1817 PositionalToken {
1818 source: uws,
1819 offset: 62,
1820 length: 1,
1821 token: Token::Special(Special::Separator(Separator::Space)),
1822 },
1823 PositionalToken {
1824 source: uws,
1825 offset: 63,
1826 length: 3,
1827 token: Token::Word(Word::Word("qeq".to_string())),
1828 },
1829 PositionalToken {
1830 source: uws,
1831 offset: 66,
1832 length: 1,
1833 token: Token::Special(Special::Separator(Separator::Space)),
1834 },
1835 PositionalToken {
1836 source: uws,
1837 offset: 67,
1838 length: 1,
1839 token: Token::Word(Word::Word("U".to_string())),
1840 },
1841 PositionalToken {
1842 source: uws,
1843 offset: 68,
1844 length: 1,
1845 token: Token::Special(Special::Punctuation('.')),
1846 },
1847 PositionalToken {
1848 source: uws,
1849 offset: 69,
1850 length: 1,
1851 token: Token::Word(Word::Word("S".to_string())),
1852 },
1853 PositionalToken {
1854 source: uws,
1855 offset: 70,
1856 length: 1,
1857 token: Token::Special(Special::Punctuation('.')),
1858 },
1859 PositionalToken {
1860 source: uws,
1861 offset: 71,
1862 length: 1,
1863 token: Token::Word(Word::Word("A".to_string())),
1864 },
1865 PositionalToken {
1866 source: uws,
1867 offset: 72,
1868 length: 2,
1869 token: Token::Special(Special::Separator(Separator::Space)),
1870 },
1871 PositionalToken {
1872 source: uws,
1873 offset: 74,
1874 length: 3,
1875 token: Token::Word(Word::Word("asd".to_string())),
1876 },
1877 PositionalToken {
1878 source: uws,
1879 offset: 77,
1880 length: 3,
1881 token: Token::Special(Special::Separator(Separator::Newline)),
1882 },
1883 PositionalToken {
1884 source: uws,
1885 offset: 80,
1886 length: 3,
1887 token: Token::Word(Word::Word("Brr".to_string())),
1888 },
1889 PositionalToken {
1890 source: uws,
1891 offset: 83,
1892 length: 1,
1893 token: Token::Special(Special::Punctuation(',')),
1894 },
1895 PositionalToken {
1896 source: uws,
1897 offset: 84,
1898 length: 1,
1899 token: Token::Special(Special::Separator(Separator::Space)),
1900 },
1901 PositionalToken {
1902 source: uws,
1903 offset: 85,
1904 length: 4,
1905 token: Token::Word(Word::Word("it\'s".to_string())),
1906 },
1907 PositionalToken {
1908 source: uws,
1909 offset: 89,
1910 length: 1,
1911 token: Token::Special(Special::Separator(Separator::Space)),
1912 },
1913 PositionalToken {
1914 source: uws,
1915 offset: 90,
1916 length: 4,
1917 token: Token::Word(Word::Number(Number::Float(29.3))),
1918 },
1919 PositionalToken {
1920 source: uws,
1921 offset: 94,
1922 length: 2,
1923 token: Token::Special(Special::Symbol('°')),
1924 },
1925 PositionalToken {
1926 source: uws,
1927 offset: 96,
1928 length: 1,
1929 token: Token::Word(Word::Word("F".to_string())),
1930 },
1931 PositionalToken {
1932 source: uws,
1933 offset: 97,
1934 length: 1,
1935 token: Token::Special(Special::Punctuation('!')),
1936 },
1937 PositionalToken {
1938 source: uws,
1939 offset: 98,
1940 length: 1,
1941 token: Token::Special(Special::Separator(Separator::Newline)),
1942 },
1943 PositionalToken {
1944 source: uws,
1945 offset: 99,
1946 length: 1,
1947 token: Token::Special(Special::Separator(Separator::Space)),
1948 },
1949 PositionalToken {
1950 source: uws,
1951 offset: 100,
1952 length: 14,
1953 token: Token::Word(Word::Word("Русское".to_string())),
1954 },
1955 PositionalToken {
1956 source: uws,
1957 offset: 114,
1958 length: 1,
1959 token: Token::Special(Special::Separator(Separator::Space)),
1960 },
1961 PositionalToken {
1962 source: uws,
1963 offset: 115,
1964 length: 22,
1965 token: Token::Word(Word::Word("предложение".to_string())),
1966 },
1967 PositionalToken {
1968 source: uws,
1969 offset: 137,
1970 length: 1,
1971 token: Token::Special(Special::Separator(Separator::Space)),
1972 },
1973 PositionalToken {
1974 source: uws,
1975 offset: 138,
1976 length: 1,
1977 token: Token::Special(Special::Punctuation('#')),
1978 },
1979 PositionalToken {
1980 source: uws,
1981 offset: 139,
1982 length: 4,
1983 token: Token::Word(Word::Number(Number::Float(36.6))),
1984 },
1985 PositionalToken {
1986 source: uws,
1987 offset: 143,
1988 length: 1,
1989 token: Token::Special(Special::Separator(Separator::Space)),
1990 },
1991 PositionalToken {
1992 source: uws,
1993 offset: 144,
1994 length: 6,
1995 token: Token::Word(Word::Word("для".to_string())),
1996 },
1997 PositionalToken {
1998 source: uws,
1999 offset: 150,
2000 length: 1,
2001 token: Token::Special(Special::Separator(Separator::Space)),
2002 },
2003 PositionalToken {
2004 source: uws,
2005 offset: 151,
2006 length: 24,
2007 token: Token::Word(Word::Word("тестирования".to_string())),
2008 },
2009 PositionalToken {
2010 source: uws,
2011 offset: 175,
2012 length: 1,
2013 token: Token::Special(Special::Separator(Separator::Space)),
2014 },
2015 PositionalToken {
2016 source: uws,
2017 offset: 176,
2018 length: 14,
2019 token: Token::Word(Word::Word("деления".to_string())),
2020 },
2021 PositionalToken {
2022 source: uws,
2023 offset: 190,
2024 length: 1,
2025 token: Token::Special(Special::Separator(Separator::Space)),
2026 },
2027 PositionalToken {
2028 source: uws,
2029 offset: 191,
2030 length: 4,
2031 token: Token::Word(Word::Word("по".to_string())),
2032 },
2033 PositionalToken {
2034 source: uws,
2035 offset: 195,
2036 length: 1,
2037 token: Token::Special(Special::Separator(Separator::Space)),
2038 },
2039 PositionalToken {
2040 source: uws,
2041 offset: 196,
2042 length: 12,
2043 token: Token::Word(Word::Word("юникод".to_string())),
2044 },
2045 PositionalToken {
2046 source: uws,
2047 offset: 208,
2048 length: 1,
2049 token: Token::Special(Special::Punctuation('-')),
2050 },
2051 PositionalToken {
2052 source: uws,
2053 offset: 209,
2054 length: 12,
2055 token: Token::Word(Word::Word("словам".to_string())),
2056 },
2057 PositionalToken {
2058 source: uws,
2059 offset: 221,
2060 length: 3,
2061 token: Token::Special(Special::Punctuation('.')),
2062 },
2063 PositionalToken {
2064 source: uws,
2065 offset: 224,
2066 length: 1,
2067 token: Token::Special(Special::Separator(Separator::Newline)),
2068 },
2069 ];
2070 let lib_res = uws
2071 .into_tokenizer(TokenizerParams::v1())
2072 .collect::<Vec<_>>();
2073 check_results(&result, &lib_res, uws);
2074 }
2075
2076 #[test]
2077 fn general_no_split() {
2078 let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
2079 let result = vec![
2080 PositionalToken {
2081 source: uws,
2082 offset: 0,
2083 length: 3,
2084 token: Token::Word(Word::Word("The".to_string())),
2085 },
2086 PositionalToken {
2087 source: uws,
2088 offset: 3,
2089 length: 1,
2090 token: Token::Special(Special::Separator(Separator::Space)),
2091 },
2092 PositionalToken {
2093 source: uws,
2094 offset: 4,
2095 length: 5,
2096 token: Token::Word(Word::Word("quick".to_string())),
2097 },
2098 PositionalToken {
2099 source: uws,
2100 offset: 9,
2101 length: 1,
2102 token: Token::Special(Special::Separator(Separator::Space)),
2103 },
2104 PositionalToken {
2105 source: uws,
2106 offset: 10,
2107 length: 1,
2108 token: Token::Special(Special::Punctuation('(')),
2109 },
2110 PositionalToken {
2111 source: uws,
2112 offset: 11,
2113 length: 1,
2114 token: Token::Special(Special::Punctuation('"')),
2115 },
2116 PositionalToken {
2117 source: uws,
2118 offset: 12,
2119 length: 5,
2120 token: Token::Word(Word::Word("brown".to_string())),
2121 },
2122 PositionalToken {
2123 source: uws,
2124 offset: 17,
2125 length: 1,
2126 token: Token::Special(Special::Punctuation('"')),
2127 },
2128 PositionalToken {
2129 source: uws,
2130 offset: 18,
2131 length: 1,
2132 token: Token::Special(Special::Punctuation(')')),
2133 },
2134 PositionalToken {
2135 source: uws,
2136 offset: 19,
2137 length: 1,
2138 token: Token::Special(Special::Separator(Separator::Space)),
2139 },
2140 PositionalToken {
2141 source: uws,
2142 offset: 20,
2143 length: 3,
2144 token: Token::Word(Word::Word("fox".to_string())),
2145 },
2146 PositionalToken {
2147 source: uws,
2148 offset: 23,
2149 length: 1,
2150 token: Token::Special(Special::Separator(Separator::Space)),
2151 },
2152 PositionalToken {
2153 source: uws,
2154 offset: 24,
2155 length: 5,
2156 token: Token::Word(Word::Word("can\'t".to_string())),
2157 },
2158 PositionalToken {
2159 source: uws,
2160 offset: 29,
2161 length: 1,
2162 token: Token::Special(Special::Separator(Separator::Space)),
2163 },
2164 PositionalToken {
2165 source: uws,
2166 offset: 30,
2167 length: 4,
2168 token: Token::Word(Word::Word("jump".to_string())),
2169 },
2170 PositionalToken {
2171 source: uws,
2172 offset: 34,
2173 length: 1,
2174 token: Token::Special(Special::Separator(Separator::Space)),
2175 },
2176 PositionalToken {
2177 source: uws,
2178 offset: 35,
2179 length: 4,
2180 token: Token::Word(Word::Number(Number::Float(32.3))),
2181 },
2182 PositionalToken {
2183 source: uws,
2184 offset: 39,
2185 length: 1,
2186 token: Token::Special(Special::Separator(Separator::Space)),
2187 },
2188 PositionalToken {
2189 source: uws,
2190 offset: 40,
2191 length: 4,
2192 token: Token::Word(Word::Word("feet".to_string())),
2193 },
2194 PositionalToken {
2195 source: uws,
2196 offset: 44,
2197 length: 1,
2198 token: Token::Special(Special::Punctuation(',')),
2199 },
2200 PositionalToken {
2201 source: uws,
2202 offset: 45,
2203 length: 1,
2204 token: Token::Special(Special::Separator(Separator::Space)),
2205 },
2206 PositionalToken {
2207 source: uws,
2208 offset: 46,
2209 length: 5,
2210 token: Token::Word(Word::Word("right".to_string())),
2211 },
2212 PositionalToken {
2213 source: uws,
2214 offset: 51,
2215 length: 1,
2216 token: Token::Special(Special::Punctuation('?')),
2217 },
2218 PositionalToken {
2219 source: uws,
2220 offset: 52,
2221 length: 1,
2222 token: Token::Special(Special::Separator(Separator::Space)),
2223 },
2224 PositionalToken {
2225 source: uws,
2226 offset: 53,
2227 length: 4,
2228 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
2229 }, PositionalToken {
2231 source: uws,
2232 offset: 57,
2233 length: 1,
2234 token: Token::Special(Special::Separator(Separator::Space)),
2235 },
2236 PositionalToken {
2237 source: uws,
2238 offset: 58,
2239 length: 3,
2240 token: Token::Word(Word::Word("etc".to_string())),
2241 },
2242 PositionalToken {
2243 source: uws,
2244 offset: 61,
2245 length: 1,
2246 token: Token::Special(Special::Punctuation('.')),
2247 },
2248 PositionalToken {
2249 source: uws,
2250 offset: 62,
2251 length: 1,
2252 token: Token::Special(Special::Separator(Separator::Space)),
2253 },
2254 PositionalToken {
2255 source: uws,
2256 offset: 63,
2257 length: 3,
2258 token: Token::Word(Word::Word("qeq".to_string())),
2259 },
2260 PositionalToken {
2261 source: uws,
2262 offset: 66,
2263 length: 1,
2264 token: Token::Special(Special::Separator(Separator::Space)),
2265 },
2266 PositionalToken {
2267 source: uws,
2268 offset: 67,
2269 length: 5,
2270 token: Token::Word(Word::Word("U.S.A".to_string())),
2271 },
2272 PositionalToken {
2273 source: uws,
2274 offset: 72,
2275 length: 1,
2276 token: Token::Special(Special::Separator(Separator::Space)),
2277 },
2278 PositionalToken {
2279 source: uws,
2280 offset: 73,
2281 length: 1,
2282 token: Token::Special(Special::Separator(Separator::Space)),
2283 },
2284 PositionalToken {
2285 source: uws,
2286 offset: 74,
2287 length: 3,
2288 token: Token::Word(Word::Word("asd".to_string())),
2289 },
2290 PositionalToken {
2291 source: uws,
2292 offset: 77,
2293 length: 1,
2294 token: Token::Special(Special::Separator(Separator::Newline)),
2295 },
2296 PositionalToken {
2297 source: uws,
2298 offset: 78,
2299 length: 1,
2300 token: Token::Special(Special::Separator(Separator::Newline)),
2301 },
2302 PositionalToken {
2303 source: uws,
2304 offset: 79,
2305 length: 1,
2306 token: Token::Special(Special::Separator(Separator::Newline)),
2307 },
2308 PositionalToken {
2309 source: uws,
2310 offset: 80,
2311 length: 3,
2312 token: Token::Word(Word::Word("Brr".to_string())),
2313 },
2314 PositionalToken {
2315 source: uws,
2316 offset: 83,
2317 length: 1,
2318 token: Token::Special(Special::Punctuation(',')),
2319 },
2320 PositionalToken {
2321 source: uws,
2322 offset: 84,
2323 length: 1,
2324 token: Token::Special(Special::Separator(Separator::Space)),
2325 },
2326 PositionalToken {
2327 source: uws,
2328 offset: 85,
2329 length: 4,
2330 token: Token::Word(Word::Word("it\'s".to_string())),
2331 },
2332 PositionalToken {
2333 source: uws,
2334 offset: 89,
2335 length: 1,
2336 token: Token::Special(Special::Separator(Separator::Space)),
2337 },
2338 PositionalToken {
2339 source: uws,
2340 offset: 90,
2341 length: 4,
2342 token: Token::Word(Word::Number(Number::Float(29.3))),
2343 },
2344 PositionalToken {
2345 source: uws,
2346 offset: 94,
2347 length: 2,
2348 token: Token::Special(Special::Symbol('°')),
2349 },
2350 PositionalToken {
2351 source: uws,
2352 offset: 96,
2353 length: 1,
2354 token: Token::Word(Word::Word("F".to_string())),
2355 },
2356 PositionalToken {
2357 source: uws,
2358 offset: 97,
2359 length: 1,
2360 token: Token::Special(Special::Punctuation('!')),
2361 },
2362 PositionalToken {
2363 source: uws,
2364 offset: 98,
2365 length: 1,
2366 token: Token::Special(Special::Separator(Separator::Newline)),
2367 },
2368 PositionalToken {
2369 source: uws,
2370 offset: 99,
2371 length: 1,
2372 token: Token::Special(Special::Separator(Separator::Space)),
2373 },
2374 PositionalToken {
2375 source: uws,
2376 offset: 100,
2377 length: 14,
2378 token: Token::Word(Word::Word("Русское".to_string())),
2379 },
2380 PositionalToken {
2381 source: uws,
2382 offset: 114,
2383 length: 1,
2384 token: Token::Special(Special::Separator(Separator::Space)),
2385 },
2386 PositionalToken {
2387 source: uws,
2388 offset: 115,
2389 length: 22,
2390 token: Token::Word(Word::Word("предложение".to_string())),
2391 },
2392 PositionalToken {
2393 source: uws,
2394 offset: 137,
2395 length: 1,
2396 token: Token::Special(Special::Separator(Separator::Space)),
2397 },
2398 PositionalToken {
2399 source: uws,
2400 offset: 138,
2401 length: 1,
2402 token: Token::Special(Special::Punctuation('#')),
2403 },
2404 PositionalToken {
2405 source: uws,
2406 offset: 139,
2407 length: 4,
2408 token: Token::Word(Word::Number(Number::Float(36.6))),
2409 },
2410 PositionalToken {
2411 source: uws,
2412 offset: 143,
2413 length: 1,
2414 token: Token::Special(Special::Separator(Separator::Space)),
2415 },
2416 PositionalToken {
2417 source: uws,
2418 offset: 144,
2419 length: 6,
2420 token: Token::Word(Word::Word("для".to_string())),
2421 },
2422 PositionalToken {
2423 source: uws,
2424 offset: 150,
2425 length: 1,
2426 token: Token::Special(Special::Separator(Separator::Space)),
2427 },
2428 PositionalToken {
2429 source: uws,
2430 offset: 151,
2431 length: 24,
2432 token: Token::Word(Word::Word("тестирования".to_string())),
2433 },
2434 PositionalToken {
2435 source: uws,
2436 offset: 175,
2437 length: 1,
2438 token: Token::Special(Special::Separator(Separator::Space)),
2439 },
2440 PositionalToken {
2441 source: uws,
2442 offset: 176,
2443 length: 14,
2444 token: Token::Word(Word::Word("деления".to_string())),
2445 },
2446 PositionalToken {
2447 source: uws,
2448 offset: 190,
2449 length: 1,
2450 token: Token::Special(Special::Separator(Separator::Space)),
2451 },
2452 PositionalToken {
2453 source: uws,
2454 offset: 191,
2455 length: 4,
2456 token: Token::Word(Word::Word("по".to_string())),
2457 },
2458 PositionalToken {
2459 source: uws,
2460 offset: 195,
2461 length: 1,
2462 token: Token::Special(Special::Separator(Separator::Space)),
2463 },
2464 PositionalToken {
2465 source: uws,
2466 offset: 196,
2467 length: 12,
2468 token: Token::Word(Word::Word("юникод".to_string())),
2469 },
2470 PositionalToken {
2471 source: uws,
2472 offset: 208,
2473 length: 1,
2474 token: Token::Special(Special::Punctuation('-')),
2475 },
2476 PositionalToken {
2477 source: uws,
2478 offset: 209,
2479 length: 12,
2480 token: Token::Word(Word::Word("словам".to_string())),
2481 },
2482 PositionalToken {
2483 source: uws,
2484 offset: 221,
2485 length: 1,
2486 token: Token::Special(Special::Punctuation('.')),
2487 },
2488 PositionalToken {
2489 source: uws,
2490 offset: 222,
2491 length: 1,
2492 token: Token::Special(Special::Punctuation('.')),
2493 },
2494 PositionalToken {
2495 source: uws,
2496 offset: 223,
2497 length: 1,
2498 token: Token::Special(Special::Punctuation('.')),
2499 },
2500 PositionalToken {
2501 source: uws,
2502 offset: 224,
2503 length: 1,
2504 token: Token::Special(Special::Separator(Separator::Newline)),
2505 },
2506 ];
2507 let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
2508 check_results(&result, &lib_res, uws);
2509 }
2510
2511 #[test]
2512 fn general_complex() {
2513 let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
2514 let result = vec![
2515 PositionalToken {
2516 source: uws,
2517 offset: 0,
2518 length: 3,
2519 token: Token::Word(Word::Word("The".to_string())),
2520 },
2521 PositionalToken {
2522 source: uws,
2523 offset: 3,
2524 length: 1,
2525 token: Token::Special(Special::Separator(Separator::Space)),
2526 },
2527 PositionalToken {
2528 source: uws,
2529 offset: 4,
2530 length: 5,
2531 token: Token::Word(Word::Word("quick".to_string())),
2532 },
2533 PositionalToken {
2534 source: uws,
2535 offset: 9,
2536 length: 1,
2537 token: Token::Special(Special::Separator(Separator::Space)),
2538 },
2539 PositionalToken {
2540 source: uws,
2541 offset: 10,
2542 length: 1,
2543 token: Token::Special(Special::Punctuation('(')),
2544 },
2545 PositionalToken {
2546 source: uws,
2547 offset: 11,
2548 length: 1,
2549 token: Token::Special(Special::Punctuation('"')),
2550 },
2551 PositionalToken {
2552 source: uws,
2553 offset: 12,
2554 length: 5,
2555 token: Token::Word(Word::Word("brown".to_string())),
2556 },
2557 PositionalToken {
2558 source: uws,
2559 offset: 17,
2560 length: 1,
2561 token: Token::Special(Special::Punctuation('"')),
2562 },
2563 PositionalToken {
2564 source: uws,
2565 offset: 18,
2566 length: 1,
2567 token: Token::Special(Special::Punctuation(')')),
2568 },
2569 PositionalToken {
2570 source: uws,
2571 offset: 19,
2572 length: 1,
2573 token: Token::Special(Special::Separator(Separator::Space)),
2574 },
2575 PositionalToken {
2576 source: uws,
2577 offset: 20,
2578 length: 3,
2579 token: Token::Word(Word::Word("fox".to_string())),
2580 },
2581 PositionalToken {
2582 source: uws,
2583 offset: 23,
2584 length: 1,
2585 token: Token::Special(Special::Separator(Separator::Space)),
2586 },
2587 PositionalToken {
2588 source: uws,
2589 offset: 24,
2590 length: 5,
2591 token: Token::Word(Word::Word("can\'t".to_string())),
2592 },
2593 PositionalToken {
2594 source: uws,
2595 offset: 29,
2596 length: 1,
2597 token: Token::Special(Special::Separator(Separator::Space)),
2598 },
2599 PositionalToken {
2600 source: uws,
2601 offset: 30,
2602 length: 4,
2603 token: Token::Word(Word::Word("jump".to_string())),
2604 },
2605 PositionalToken {
2606 source: uws,
2607 offset: 34,
2608 length: 1,
2609 token: Token::Special(Special::Separator(Separator::Space)),
2610 },
2611 PositionalToken {
2612 source: uws,
2613 offset: 35,
2614 length: 4,
2615 token: Token::Word(Word::Number(Number::Float(32.3))),
2616 },
2617 PositionalToken {
2618 source: uws,
2619 offset: 39,
2620 length: 1,
2621 token: Token::Special(Special::Separator(Separator::Space)),
2622 },
2623 PositionalToken {
2624 source: uws,
2625 offset: 40,
2626 length: 4,
2627 token: Token::Word(Word::Word("feet".to_string())),
2628 },
2629 PositionalToken {
2630 source: uws,
2631 offset: 44,
2632 length: 1,
2633 token: Token::Special(Special::Punctuation(',')),
2634 },
2635 PositionalToken {
2636 source: uws,
2637 offset: 45,
2638 length: 1,
2639 token: Token::Special(Special::Separator(Separator::Space)),
2640 },
2641 PositionalToken {
2642 source: uws,
2643 offset: 46,
2644 length: 5,
2645 token: Token::Word(Word::Word("right".to_string())),
2646 },
2647 PositionalToken {
2648 source: uws,
2649 offset: 51,
2650 length: 1,
2651 token: Token::Special(Special::Punctuation('?')),
2652 },
2653 PositionalToken {
2654 source: uws,
2655 offset: 52,
2656 length: 1,
2657 token: Token::Special(Special::Separator(Separator::Space)),
2658 },
2659 PositionalToken {
2660 source: uws,
2661 offset: 53,
2662 length: 4,
2663 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
2664 }, PositionalToken {
2666 source: uws,
2667 offset: 57,
2668 length: 1,
2669 token: Token::Special(Special::Separator(Separator::Space)),
2670 },
2671 PositionalToken {
2672 source: uws,
2673 offset: 58,
2674 length: 3,
2675 token: Token::Word(Word::Word("etc".to_string())),
2676 },
2677 PositionalToken {
2678 source: uws,
2679 offset: 61,
2680 length: 1,
2681 token: Token::Special(Special::Punctuation('.')),
2682 },
2683 PositionalToken {
2684 source: uws,
2685 offset: 62,
2686 length: 1,
2687 token: Token::Special(Special::Separator(Separator::Space)),
2688 },
2689 PositionalToken {
2690 source: uws,
2691 offset: 63,
2692 length: 3,
2693 token: Token::Word(Word::Word("qeq".to_string())),
2694 },
2695 PositionalToken {
2696 source: uws,
2697 offset: 66,
2698 length: 1,
2699 token: Token::Special(Special::Separator(Separator::Space)),
2700 },
2701 PositionalToken {
2702 source: uws,
2703 offset: 67,
2704 length: 5,
2705 token: Token::Word(Word::Word("U.S.A".to_string())),
2706 },
2707 PositionalToken {
2708 source: uws,
2709 offset: 72,
2710 length: 2,
2711 token: Token::Special(Special::Separator(Separator::Space)),
2712 },
2713 PositionalToken {
2714 source: uws,
2715 offset: 74,
2716 length: 3,
2717 token: Token::Word(Word::Word("asd".to_string())),
2718 },
2719 PositionalToken {
2720 source: uws,
2721 offset: 77,
2722 length: 3,
2723 token: Token::Special(Special::Separator(Separator::Newline)),
2724 },
2725 PositionalToken {
2726 source: uws,
2727 offset: 80,
2728 length: 3,
2729 token: Token::Word(Word::Word("Brr".to_string())),
2730 },
2731 PositionalToken {
2732 source: uws,
2733 offset: 83,
2734 length: 1,
2735 token: Token::Special(Special::Punctuation(',')),
2736 },
2737 PositionalToken {
2738 source: uws,
2739 offset: 84,
2740 length: 1,
2741 token: Token::Special(Special::Separator(Separator::Space)),
2742 },
2743 PositionalToken {
2744 source: uws,
2745 offset: 85,
2746 length: 4,
2747 token: Token::Word(Word::Word("it\'s".to_string())),
2748 },
2749 PositionalToken {
2750 source: uws,
2751 offset: 89,
2752 length: 1,
2753 token: Token::Special(Special::Separator(Separator::Space)),
2754 },
2755 PositionalToken {
2756 source: uws,
2757 offset: 90,
2758 length: 4,
2759 token: Token::Word(Word::Number(Number::Float(29.3))),
2760 },
2761 PositionalToken {
2762 source: uws,
2763 offset: 94,
2764 length: 2,
2765 token: Token::Special(Special::Symbol('°')),
2766 },
2767 PositionalToken {
2768 source: uws,
2769 offset: 96,
2770 length: 1,
2771 token: Token::Word(Word::Word("F".to_string())),
2772 },
2773 PositionalToken {
2774 source: uws,
2775 offset: 97,
2776 length: 1,
2777 token: Token::Special(Special::Punctuation('!')),
2778 },
2779 PositionalToken {
2780 source: uws,
2781 offset: 98,
2782 length: 1,
2783 token: Token::Special(Special::Separator(Separator::Newline)),
2784 },
2785 PositionalToken {
2786 source: uws,
2787 offset: 99,
2788 length: 1,
2789 token: Token::Special(Special::Separator(Separator::Space)),
2790 },
2791 PositionalToken {
2792 source: uws,
2793 offset: 100,
2794 length: 14,
2795 token: Token::Word(Word::Word("Русское".to_string())),
2796 },
2797 PositionalToken {
2798 source: uws,
2799 offset: 114,
2800 length: 1,
2801 token: Token::Special(Special::Separator(Separator::Space)),
2802 },
2803 PositionalToken {
2804 source: uws,
2805 offset: 115,
2806 length: 22,
2807 token: Token::Word(Word::Word("предложение".to_string())),
2808 },
2809 PositionalToken {
2810 source: uws,
2811 offset: 137,
2812 length: 1,
2813 token: Token::Special(Special::Separator(Separator::Space)),
2814 },
2815 PositionalToken {
2816 source: uws,
2817 offset: 138,
2818 length: 5,
2819 token: Token::Struct(Struct::Hashtag("36.6".to_string())),
2820 },
2821 PositionalToken {
2822 source: uws,
2823 offset: 143,
2824 length: 1,
2825 token: Token::Special(Special::Separator(Separator::Space)),
2826 },
2827 PositionalToken {
2828 source: uws,
2829 offset: 144,
2830 length: 6,
2831 token: Token::Word(Word::Word("для".to_string())),
2832 },
2833 PositionalToken {
2834 source: uws,
2835 offset: 150,
2836 length: 1,
2837 token: Token::Special(Special::Separator(Separator::Space)),
2838 },
2839 PositionalToken {
2840 source: uws,
2841 offset: 151,
2842 length: 24,
2843 token: Token::Word(Word::Word("тестирования".to_string())),
2844 },
2845 PositionalToken {
2846 source: uws,
2847 offset: 175,
2848 length: 1,
2849 token: Token::Special(Special::Separator(Separator::Space)),
2850 },
2851 PositionalToken {
2852 source: uws,
2853 offset: 176,
2854 length: 14,
2855 token: Token::Word(Word::Word("деления".to_string())),
2856 },
2857 PositionalToken {
2858 source: uws,
2859 offset: 190,
2860 length: 1,
2861 token: Token::Special(Special::Separator(Separator::Space)),
2862 },
2863 PositionalToken {
2864 source: uws,
2865 offset: 191,
2866 length: 4,
2867 token: Token::Word(Word::Word("по".to_string())),
2868 },
2869 PositionalToken {
2870 source: uws,
2871 offset: 195,
2872 length: 1,
2873 token: Token::Special(Special::Separator(Separator::Space)),
2874 },
2875 PositionalToken {
2876 source: uws,
2877 offset: 196,
2878 length: 12,
2879 token: Token::Word(Word::Word("юникод".to_string())),
2880 },
2881 PositionalToken {
2882 source: uws,
2883 offset: 208,
2884 length: 1,
2885 token: Token::Special(Special::Punctuation('-')),
2886 },
2887 PositionalToken {
2888 source: uws,
2889 offset: 209,
2890 length: 12,
2891 token: Token::Word(Word::Word("словам".to_string())),
2892 },
2893 PositionalToken {
2894 source: uws,
2895 offset: 221,
2896 length: 3,
2897 token: Token::Special(Special::Punctuation('.')),
2898 },
2899 PositionalToken {
2900 source: uws,
2901 offset: 224,
2902 length: 1,
2903 token: Token::Special(Special::Separator(Separator::Newline)),
2904 },
2905 ];
2906 let lib_res = uws
2907 .into_tokenizer(TokenizerParams::complex())
2908 .collect::<Vec<_>>();
2909 check_results(&result, &lib_res, uws);
2910 }
2911
2912 #[test]
2913 fn plus_minus() {
2914 let uws = "+23 -4.5 -34 +25.7 - 2 + 5.6";
2915 let result = vec![
2916 PositionalToken {
2917 source: uws,
2918 offset: 0,
2919 length: 3,
2920 token: Token::Word(Word::Number(Number::Integer(23))),
2921 },
2922 PositionalToken {
2923 source: uws,
2924 offset: 3,
2925 length: 1,
2926 token: Token::Special(Special::Separator(Separator::Space)),
2927 },
2928 PositionalToken {
2929 source: uws,
2930 offset: 4,
2931 length: 4,
2932 token: Token::Word(Word::Number(Number::Float(-4.5))),
2933 },
2934 PositionalToken {
2935 source: uws,
2936 offset: 8,
2937 length: 1,
2938 token: Token::Special(Special::Separator(Separator::Space)),
2939 },
2940 PositionalToken {
2941 source: uws,
2942 offset: 9,
2943 length: 3,
2944 token: Token::Word(Word::Number(Number::Integer(-34))),
2945 },
2946 PositionalToken {
2947 source: uws,
2948 offset: 12,
2949 length: 1,
2950 token: Token::Special(Special::Separator(Separator::Space)),
2951 },
2952 PositionalToken {
2953 source: uws,
2954 offset: 13,
2955 length: 5,
2956 token: Token::Word(Word::Number(Number::Float(25.7))),
2957 },
2958 PositionalToken {
2959 source: uws,
2960 offset: 18,
2961 length: 1,
2962 token: Token::Special(Special::Separator(Separator::Space)),
2963 },
2964 PositionalToken {
2965 source: uws,
2966 offset: 19,
2967 length: 1,
2968 token: Token::Special(Special::Punctuation('-')),
2969 },
2970 PositionalToken {
2971 source: uws,
2972 offset: 20,
2973 length: 1,
2974 token: Token::Special(Special::Separator(Separator::Space)),
2975 },
2976 PositionalToken {
2977 source: uws,
2978 offset: 21,
2979 length: 1,
2980 token: Token::Word(Word::Number(Number::Integer(2))),
2981 },
2982 PositionalToken {
2983 source: uws,
2984 offset: 22,
2985 length: 1,
2986 token: Token::Special(Special::Separator(Separator::Space)),
2987 },
2988 PositionalToken {
2989 source: uws,
2990 offset: 23,
2991 length: 1,
2992 token: Token::Special(Special::Punctuation('+')),
2993 },
2994 PositionalToken {
2995 source: uws,
2996 offset: 24,
2997 length: 1,
2998 token: Token::Special(Special::Separator(Separator::Space)),
2999 },
3000 PositionalToken {
3001 source: uws,
3002 offset: 25,
3003 length: 3,
3004 token: Token::Word(Word::Number(Number::Float(5.6))),
3005 },
3006 ];
3007 let lib_res = uws
3008 .into_tokenizer(TokenizerParams::v1())
3009 .collect::<Vec<_>>();
3010 check(&result, &lib_res, uws);
3011 }
3013
3014 #[test]
3015 #[ignore]
3016 fn woman_bouncing_ball() {
3017 let uws = "\u{26f9}\u{200d}\u{2640}";
3018 let result = vec![PositionalToken {
3019 source: uws,
3020 offset: 0,
3021 length: 9,
3022 token: Token::Word(Word::Emoji("woman_bouncing_ball")),
3023 }];
3024 let lib_res = uws
3025 .into_tokenizer(TokenizerParams::v1())
3026 .collect::<Vec<_>>();
3027 check_results(&result, &lib_res, uws);
3028 }
3030
3031 #[test]
3032 fn emoji_and_rusabbr_default() {
3033 let uws = "🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n👱\nС.С.С.Р.\n👨👩👦👦\n🧠\n";
3034 let result = vec![
3035 PositionalToken {
3036 source: uws,
3037 offset: 0,
3038 length: 8,
3039 token: Token::Word(Word::Emoji("russia")),
3040 },
3041 PositionalToken {
3042 source: uws,
3043 offset: 8,
3044 length: 1,
3045 token: Token::Special(Special::Separator(Separator::Space)),
3046 },
3047 PositionalToken {
3048 source: uws,
3049 offset: 9,
3050 length: 8,
3051 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
3052 },
3053 PositionalToken {
3054 source: uws,
3055 offset: 17,
3056 length: 1,
3057 token: Token::Special(Special::Separator(Separator::Newline)),
3058 },
3059 PositionalToken {
3060 source: uws,
3061 offset: 18,
3062 length: 8,
3063 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
3064 },
3065 PositionalToken {
3066 source: uws,
3067 offset: 26,
3068 length: 8,
3069 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
3070 },
3071 PositionalToken {
3072 source: uws,
3073 offset: 34,
3074 length: 8,
3075 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
3076 },
3077 PositionalToken {
3078 source: uws,
3079 offset: 42,
3080 length: 1,
3081 token: Token::Special(Special::Separator(Separator::Newline)),
3082 },
3083 PositionalToken {
3084 source: uws,
3085 offset: 43,
3086 length: 4,
3087 token: Token::Word(Word::Emoji("blond_haired_person")),
3088 },
3089 PositionalToken {
3090 source: uws,
3091 offset: 47,
3092 length: 1,
3093 token: Token::Special(Special::Separator(Separator::Newline)),
3094 },
3095 PositionalToken {
3096 source: uws,
3097 offset: 48,
3098 length: 2,
3099 token: Token::Word(Word::Word("С".to_string())),
3100 },
3101 PositionalToken {
3102 source: uws,
3103 offset: 50,
3104 length: 1,
3105 token: Token::Special(Special::Punctuation('.')),
3106 },
3107 PositionalToken {
3108 source: uws,
3109 offset: 51,
3110 length: 2,
3111 token: Token::Word(Word::Word("С".to_string())),
3112 },
3113 PositionalToken {
3114 source: uws,
3115 offset: 53,
3116 length: 1,
3117 token: Token::Special(Special::Punctuation('.')),
3118 },
3119 PositionalToken {
3120 source: uws,
3121 offset: 54,
3122 length: 2,
3123 token: Token::Word(Word::Word("С".to_string())),
3124 },
3125 PositionalToken {
3126 source: uws,
3127 offset: 56,
3128 length: 1,
3129 token: Token::Special(Special::Punctuation('.')),
3130 },
3131 PositionalToken {
3132 source: uws,
3133 offset: 57,
3134 length: 2,
3135 token: Token::Word(Word::Word("Р".to_string())),
3136 },
3137 PositionalToken {
3138 source: uws,
3139 offset: 59,
3140 length: 1,
3141 token: Token::Special(Special::Punctuation('.')),
3142 },
3143 PositionalToken {
3144 source: uws,
3145 offset: 60,
3146 length: 1,
3147 token: Token::Special(Special::Separator(Separator::Newline)),
3148 },
3149 PositionalToken {
3150 source: uws,
3151 offset: 61,
3152 length: 25,
3153 token: Token::Word(Word::Emoji("family_man_woman_boy_boy")),
3154 },
3155 PositionalToken {
3156 source: uws,
3157 offset: 86,
3158 length: 1,
3159 token: Token::Special(Special::Separator(Separator::Newline)),
3160 },
3161 PositionalToken {
3162 source: uws,
3163 offset: 87,
3164 length: 4,
3165 token: Token::Word(Word::Emoji("brain")),
3166 },
3167 PositionalToken {
3168 source: uws,
3169 offset: 91,
3170 length: 1,
3171 token: Token::Special(Special::Separator(Separator::Newline)),
3172 },
3173 ];
3174
3175 let lib_res = uws
3176 .into_tokenizer(TokenizerParams::v1())
3177 .collect::<Vec<_>>();
3178 check_results(&result, &lib_res, uws);
3179 }
3181
3182 #[test]
3183 fn emoji_and_rusabbr_no_split() {
3184 let uws = "🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n👱\nС.С.С.Р.\n👨👩👦👦\n🧠\n";
3185 let result = vec![
3186 PositionalToken {
3187 source: uws,
3188 offset: 0,
3189 length: 8,
3190 token: Token::Word(Word::Emoji("russia")),
3191 },
3192 PositionalToken {
3193 source: uws,
3194 offset: 8,
3195 length: 1,
3196 token: Token::Special(Special::Separator(Separator::Space)),
3197 },
3198 PositionalToken {
3199 source: uws,
3200 offset: 9,
3201 length: 8,
3202 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
3203 },
3204 PositionalToken {
3205 source: uws,
3206 offset: 17,
3207 length: 1,
3208 token: Token::Special(Special::Separator(Separator::Newline)),
3209 },
3210 PositionalToken {
3211 source: uws,
3212 offset: 18,
3213 length: 8,
3214 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
3215 },
3216 PositionalToken {
3217 source: uws,
3218 offset: 26,
3219 length: 8,
3220 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
3221 },
3222 PositionalToken {
3223 source: uws,
3224 offset: 34,
3225 length: 8,
3226 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
3227 },
3228 PositionalToken {
3229 source: uws,
3230 offset: 42,
3231 length: 1,
3232 token: Token::Special(Special::Separator(Separator::Newline)),
3233 },
3234 PositionalToken {
3235 source: uws,
3236 offset: 43,
3237 length: 4,
3238 token: Token::Word(Word::Emoji("blond_haired_person")),
3239 },
3240 PositionalToken {
3241 source: uws,
3242 offset: 47,
3243 length: 1,
3244 token: Token::Special(Special::Separator(Separator::Newline)),
3245 },
3246 PositionalToken {
3247 source: uws,
3248 offset: 48,
3249 length: 11,
3250 token: Token::Word(Word::Word("С.С.С.Р".to_string())),
3251 },
3252 PositionalToken {
3253 source: uws,
3254 offset: 59,
3255 length: 1,
3256 token: Token::Special(Special::Punctuation('.')),
3257 },
3258 PositionalToken {
3259 source: uws,
3260 offset: 60,
3261 length: 1,
3262 token: Token::Special(Special::Separator(Separator::Newline)),
3263 },
3264 PositionalToken {
3265 source: uws,
3266 offset: 61,
3267 length: 25,
3268 token: Token::Word(Word::Emoji("family_man_woman_boy_boy")),
3269 },
3270 PositionalToken {
3271 source: uws,
3272 offset: 86,
3273 length: 1,
3274 token: Token::Special(Special::Separator(Separator::Newline)),
3275 },
3276 PositionalToken {
3277 source: uws,
3278 offset: 87,
3279 length: 4,
3280 token: Token::Word(Word::Emoji("brain")),
3281 },
3282 PositionalToken {
3283 source: uws,
3284 offset: 91,
3285 length: 1,
3286 token: Token::Special(Special::Separator(Separator::Newline)),
3287 },
3288 ];
3289
3290 let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
3291 check_results(&result, &lib_res, uws);
3292 }
3294
3295 #[test]
3519 fn html() {
3520 let uws = "<div class=\"article article_view \" id=\"article_view_-113039156_9551\" data-article-url=\"/@chaibuket-o-chem-ne-zabyt-25-noyabrya\" data-audio-context=\"article:-113039156_9551\"><h1 class=\"article_decoration_first article_decoration_last\" >День Мамы </h1><p class=\"article_decoration_first article_decoration_last\" >День, когда поздравляют мам, бабушек, сестер и жён — это всемирный праздник, называемый «День Мамы». В настоящее время его отмечают почти в каждой стране, просто везде разные даты и способы празднования. </p><h3 class=\"article_decoration_first article_decoration_last\" ><span class='article_anchor_title'>\n <span class='article_anchor_button' id='pochemu-my-ego-prazdnuem'></span>\n <span class='article_anchor_fsymbol'>П</span>\n</span>ПОЧЕМУ МЫ ЕГО ПРАЗДНУЕМ</h3><p class=\"article_decoration_first article_decoration_last article_decoration_before\" >В 1987 году комитет госдумы по делам женщин, семьи и молодежи выступил с предложением учредить «День мамы», а сам приказ был подписан уже 30 января 1988 года Борисом Ельциным. Было решено, что ежегодно в России празднество дня мамы будет выпадать на последнее воскресенье ноября. 
</p><figure data-type=\"101\" data-mode=\"\" class=\"article_decoration_first article_decoration_last\" >\n <div class=\"article_figure_content\" style=\"width: 1125px\">\n <div class=\"article_figure_sizer_content\"><div class=\"article_object_sizer_wrap\" data-sizes=\"[{"s":["https://pp.userapi.com/c849128/v849128704/c0ffd/pcNJaBH3NDo.jpg",75,50],"m":["https://pp.userapi.com/c849128/v849128704/c0ffe/ozCLs2kHtRY.jpg",130,87],"x":["https://pp.userapi.com/c849128/v849128704/c0fff/E4KtTNDydzE.jpg",604,403],"y":["https://pp.userapi.com/c849128/v849128704/c1000/1nLxpYKavzU.jpg",807,538],"z":["https://pp.userapi.com/c849128/v849128704/c1001/IgEODe90yEk.jpg",1125,750],"o":["https://pp.userapi.com/c849128/v849128704/c1002/01faNwVZ2_E.jpg",130,87],"p":["https://pp.userapi.com/c849128/v849128704/c1003/baDFzbdRP2s.jpg",200,133],"q":["https://pp.userapi.com/c849128/v849128704/c1004/CY4khI6KJKA.jpg",320,213],"r":["https://pp.userapi.com/c849128/v849128704/c1005/NOvAJ6-VltY.jpg",510,340]}]\">\n <img class=\"article_object_sizer_inner article_object_photo__image_blur\" src=\"https://pp.userapi.com/c849128/v849128704/c0ffd/pcNJaBH3NDo.jpg\" data-baseurl=\"\"/>\n \n</div></div>\n <div class=\"article_figure_sizer\" style=\"padding-bottom: 66.666666666667%\"></div>";
3521 let result = vec![
3522 PositionalToken {
3523 source: uws,
3524 offset: 236,
3525 length: 8,
3526 token: Token::Word(Word::Word("День".to_string())),
3527 },
3528 PositionalToken {
3529 source: uws,
3530 offset: 244,
3531 length: 1,
3532 token: Token::Special(Special::Separator(Separator::Space)),
3533 },
3534 PositionalToken {
3535 source: uws,
3536 offset: 245,
3537 length: 8,
3538 token: Token::Word(Word::Word("Мамы".to_string())),
3539 },
3540 PositionalToken {
3541 source: uws,
3542 offset: 253,
3543 length: 1,
3544 token: Token::Special(Special::Separator(Separator::Space)),
3545 },
3546 PositionalToken {
3547 source: uws,
3548 offset: 321,
3549 length: 8,
3550 token: Token::Word(Word::Word("День".to_string())),
3551 },
3552 PositionalToken {
3553 source: uws,
3554 offset: 329,
3555 length: 1,
3556 token: Token::Special(Special::Punctuation(',')),
3557 },
3558 PositionalToken {
3559 source: uws,
3560 offset: 330,
3561 length: 1,
3562 token: Token::Special(Special::Separator(Separator::Space)),
3563 },
3564 PositionalToken {
3565 source: uws,
3566 offset: 331,
3567 length: 10,
3568 token: Token::Word(Word::Word("когда".to_string())),
3569 },
3570 PositionalToken {
3571 source: uws,
3572 offset: 341,
3573 length: 1,
3574 token: Token::Special(Special::Separator(Separator::Space)),
3575 },
3576 PositionalToken {
3577 source: uws,
3578 offset: 342,
3579 length: 22,
3580 token: Token::Word(Word::Word("поздравляют".to_string())),
3581 },
3582 PositionalToken {
3583 source: uws,
3584 offset: 364,
3585 length: 1,
3586 token: Token::Special(Special::Separator(Separator::Space)),
3587 },
3588 PositionalToken {
3589 source: uws,
3590 offset: 365,
3591 length: 6,
3592 token: Token::Word(Word::Word("мам".to_string())),
3593 },
3594 PositionalToken {
3595 source: uws,
3596 offset: 371,
3597 length: 1,
3598 token: Token::Special(Special::Punctuation(',')),
3599 },
3600 PositionalToken {
3601 source: uws,
3602 offset: 372,
3603 length: 1,
3604 token: Token::Special(Special::Separator(Separator::Space)),
3605 },
3606 PositionalToken {
3607 source: uws,
3608 offset: 373,
3609 length: 14,
3610 token: Token::Word(Word::Word("бабушек".to_string())),
3611 },
3612 PositionalToken {
3613 source: uws,
3614 offset: 387,
3615 length: 1,
3616 token: Token::Special(Special::Punctuation(',')),
3617 },
3618 PositionalToken {
3619 source: uws,
3620 offset: 388,
3621 length: 1,
3622 token: Token::Special(Special::Separator(Separator::Space)),
3623 },
3624 PositionalToken {
3625 source: uws,
3626 offset: 389,
3627 length: 12,
3628 token: Token::Word(Word::Word("сестер".to_string())),
3629 },
3630 PositionalToken {
3631 source: uws,
3632 offset: 401,
3633 length: 1,
3634 token: Token::Special(Special::Separator(Separator::Space)),
3635 },
3636 PositionalToken {
3637 source: uws,
3638 offset: 402,
3639 length: 2,
3640 token: Token::Word(Word::Word("и".to_string())),
3641 },
3642 PositionalToken {
3643 source: uws,
3644 offset: 404,
3645 length: 1,
3646 token: Token::Special(Special::Separator(Separator::Space)),
3647 },
3648 PositionalToken {
3649 source: uws,
3650 offset: 405,
3651 length: 6,
3652 token: Token::Word(Word::Word("жён".to_string())),
3653 },
3654 PositionalToken {
3655 source: uws,
3656 offset: 411,
3657 length: 1,
3658 token: Token::Special(Special::Separator(Separator::Space)),
3659 },
3660 PositionalToken {
3661 source: uws,
3662 offset: 412,
3663 length: 3,
3664 token: Token::Special(Special::Punctuation('—')),
3665 },
3666 PositionalToken {
3667 source: uws,
3668 offset: 415,
3669 length: 1,
3670 token: Token::Special(Special::Separator(Separator::Space)),
3671 },
3672 PositionalToken {
3673 source: uws,
3674 offset: 416,
3675 length: 6,
3676 token: Token::Word(Word::Word("это".to_string())),
3677 },
3678 PositionalToken {
3679 source: uws,
3680 offset: 422,
3681 length: 1,
3682 token: Token::Special(Special::Separator(Separator::Space)),
3683 },
3684 PositionalToken {
3685 source: uws,
3686 offset: 423,
3687 length: 18,
3688 token: Token::Word(Word::Word("всемирный".to_string())),
3689 },
3690 PositionalToken {
3691 source: uws,
3692 offset: 441,
3693 length: 1,
3694 token: Token::Special(Special::Separator(Separator::Space)),
3695 },
3696 PositionalToken {
3697 source: uws,
3698 offset: 442,
3699 length: 16,
3700 token: Token::Word(Word::Word("праздник".to_string())),
3701 },
3702 PositionalToken {
3703 source: uws,
3704 offset: 458,
3705 length: 1,
3706 token: Token::Special(Special::Punctuation(',')),
3707 },
3708 PositionalToken {
3709 source: uws,
3710 offset: 459,
3711 length: 1,
3712 token: Token::Special(Special::Separator(Separator::Space)),
3713 },
3714 PositionalToken {
3715 source: uws,
3716 offset: 460,
3717 length: 20,
3718 token: Token::Word(Word::Word("называемый".to_string())),
3719 },
3720 PositionalToken {
3721 source: uws,
3722 offset: 480,
3723 length: 1,
3724 token: Token::Special(Special::Separator(Separator::Space)),
3725 },
3726 PositionalToken {
3727 source: uws,
3728 offset: 481,
3729 length: 2,
3730 token: Token::Special(Special::Punctuation('«')),
3731 },
3732 PositionalToken {
3733 source: uws,
3734 offset: 483,
3735 length: 8,
3736 token: Token::Word(Word::Word("День".to_string())),
3737 },
3738 PositionalToken {
3739 source: uws,
3740 offset: 491,
3741 length: 1,
3742 token: Token::Special(Special::Separator(Separator::Space)),
3743 },
3744 PositionalToken {
3745 source: uws,
3746 offset: 492,
3747 length: 8,
3748 token: Token::Word(Word::Word("Мамы".to_string())),
3749 },
3750 PositionalToken {
3751 source: uws,
3752 offset: 500,
3753 length: 2,
3754 token: Token::Special(Special::Punctuation('»')),
3755 },
3756 PositionalToken {
3757 source: uws,
3758 offset: 502,
3759 length: 1,
3760 token: Token::Special(Special::Punctuation('.')),
3761 },
3762 PositionalToken {
3763 source: uws,
3764 offset: 503,
3765 length: 1,
3766 token: Token::Special(Special::Separator(Separator::Space)),
3767 },
3768 PositionalToken {
3769 source: uws,
3770 offset: 504,
3771 length: 2,
3772 token: Token::Word(Word::Word("В".to_string())),
3773 },
3774 PositionalToken {
3775 source: uws,
3776 offset: 506,
3777 length: 1,
3778 token: Token::Special(Special::Separator(Separator::Space)),
3779 },
3780 PositionalToken {
3781 source: uws,
3782 offset: 507,
3783 length: 18,
3784 token: Token::Word(Word::Word("настоящее".to_string())),
3785 },
3786 PositionalToken {
3787 source: uws,
3788 offset: 525,
3789 length: 1,
3790 token: Token::Special(Special::Separator(Separator::Space)),
3791 },
3792 PositionalToken {
3793 source: uws,
3794 offset: 526,
3795 length: 10,
3796 token: Token::Word(Word::Word("время".to_string())),
3797 },
3798 PositionalToken {
3799 source: uws,
3800 offset: 536,
3801 length: 1,
3802 token: Token::Special(Special::Separator(Separator::Space)),
3803 },
3804 PositionalToken {
3805 source: uws,
3806 offset: 537,
3807 length: 6,
3808 token: Token::Word(Word::Word("его".to_string())),
3809 },
3810 PositionalToken {
3811 source: uws,
3812 offset: 543,
3813 length: 1,
3814 token: Token::Special(Special::Separator(Separator::Space)),
3815 },
3816 PositionalToken {
3817 source: uws,
3818 offset: 544,
3819 length: 16,
3820 token: Token::Word(Word::Word("отмечают".to_string())),
3821 },
3822 PositionalToken {
3823 source: uws,
3824 offset: 560,
3825 length: 1,
3826 token: Token::Special(Special::Separator(Separator::Space)),
3827 },
3828 PositionalToken {
3829 source: uws,
3830 offset: 561,
3831 length: 10,
3832 token: Token::Word(Word::Word("почти".to_string())),
3833 },
3834 PositionalToken {
3835 source: uws,
3836 offset: 571,
3837 length: 1,
3838 token: Token::Special(Special::Separator(Separator::Space)),
3839 },
3840 PositionalToken {
3841 source: uws,
3842 offset: 572,
3843 length: 2,
3844 token: Token::Word(Word::Word("в".to_string())),
3845 },
3846 PositionalToken {
3847 source: uws,
3848 offset: 574,
3849 length: 1,
3850 token: Token::Special(Special::Separator(Separator::Space)),
3851 },
3852 PositionalToken {
3853 source: uws,
3854 offset: 575,
3855 length: 12,
3856 token: Token::Word(Word::Word("каждой".to_string())),
3857 },
3858 PositionalToken {
3859 source: uws,
3860 offset: 587,
3861 length: 1,
3862 token: Token::Special(Special::Separator(Separator::Space)),
3863 },
3864 PositionalToken {
3865 source: uws,
3866 offset: 588,
3867 length: 12,
3868 token: Token::Word(Word::Word("стране".to_string())),
3869 },
3870 PositionalToken {
3871 source: uws,
3872 offset: 600,
3873 length: 1,
3874 token: Token::Special(Special::Punctuation(',')),
3875 },
3876 PositionalToken {
3877 source: uws,
3878 offset: 601,
3879 length: 1,
3880 token: Token::Special(Special::Separator(Separator::Space)),
3881 },
3882 PositionalToken {
3883 source: uws,
3884 offset: 602,
3885 length: 12,
3886 token: Token::Word(Word::Word("просто".to_string())),
3887 },
3888 PositionalToken {
3889 source: uws,
3890 offset: 614,
3891 length: 1,
3892 token: Token::Special(Special::Separator(Separator::Space)),
3893 },
3894 PositionalToken {
3895 source: uws,
3896 offset: 615,
3897 length: 10,
3898 token: Token::Word(Word::Word("везде".to_string())),
3899 },
3900 PositionalToken {
3901 source: uws,
3902 offset: 625,
3903 length: 1,
3904 token: Token::Special(Special::Separator(Separator::Space)),
3905 },
3906 PositionalToken {
3907 source: uws,
3908 offset: 626,
3909 length: 12,
3910 token: Token::Word(Word::Word("разные".to_string())),
3911 },
3912 PositionalToken {
3913 source: uws,
3914 offset: 638,
3915 length: 1,
3916 token: Token::Special(Special::Separator(Separator::Space)),
3917 },
3918 PositionalToken {
3919 source: uws,
3920 offset: 639,
3921 length: 8,
3922 token: Token::Word(Word::Word("даты".to_string())),
3923 },
3924 PositionalToken {
3925 source: uws,
3926 offset: 647,
3927 length: 1,
3928 token: Token::Special(Special::Separator(Separator::Space)),
3929 },
3930 PositionalToken {
3931 source: uws,
3932 offset: 648,
3933 length: 2,
3934 token: Token::Word(Word::Word("и".to_string())),
3935 },
3936 PositionalToken {
3937 source: uws,
3938 offset: 650,
3939 length: 1,
3940 token: Token::Special(Special::Separator(Separator::Space)),
3941 },
3942 PositionalToken {
3943 source: uws,
3944 offset: 651,
3945 length: 14,
3946 token: Token::Word(Word::Word("способы".to_string())),
3947 },
3948 PositionalToken {
3949 source: uws,
3950 offset: 665,
3951 length: 1,
3952 token: Token::Special(Special::Separator(Separator::Space)),
3953 },
3954 PositionalToken {
3955 source: uws,
3956 offset: 666,
3957 length: 24,
3958 token: Token::Word(Word::Word("празднования".to_string())),
3959 },
3960 PositionalToken {
3961 source: uws,
3962 offset: 690,
3963 length: 1,
3964 token: Token::Special(Special::Punctuation('.')),
3965 },
3966 PositionalToken {
3967 source: uws,
3968 offset: 691,
3969 length: 1,
3970 token: Token::Special(Special::Separator(Separator::Space)),
3971 },
3972 PositionalToken {
3973 source: uws,
3974 offset: 794,
3975 length: 1,
3976 token: Token::Special(Special::Separator(Separator::Newline)),
3977 },
3978 PositionalToken {
3979 source: uws,
3980 offset: 795,
3981 length: 2,
3982 token: Token::Special(Special::Separator(Separator::Space)),
3983 },
3984 PositionalToken {
3985 source: uws,
3986 offset: 870,
3987 length: 1,
3988 token: Token::Special(Special::Separator(Separator::Newline)),
3989 },
3990 PositionalToken {
3991 source: uws,
3992 offset: 871,
3993 length: 2,
3994 token: Token::Special(Special::Separator(Separator::Space)),
3995 },
3996 PositionalToken {
3997 source: uws,
3998 offset: 910,
3999 length: 2,
4000 token: Token::Word(Word::Word("П".to_string())),
4001 },
4002 PositionalToken {
4003 source: uws,
4004 offset: 919,
4005 length: 1,
4006 token: Token::Special(Special::Separator(Separator::Newline)),
4007 },
4008 PositionalToken {
4009 source: uws,
4010 offset: 927,
4011 length: 12,
4012 token: Token::Word(Word::Word("ПОЧЕМУ".to_string())),
4013 },
4014 PositionalToken {
4015 source: uws,
4016 offset: 939,
4017 length: 1,
4018 token: Token::Special(Special::Separator(Separator::Space)),
4019 },
4020 PositionalToken {
4021 source: uws,
4022 offset: 940,
4023 length: 4,
4024 token: Token::Word(Word::Word("МЫ".to_string())),
4025 },
4026 PositionalToken {
4027 source: uws,
4028 offset: 944,
4029 length: 1,
4030 token: Token::Special(Special::Separator(Separator::Space)),
4031 },
4032 PositionalToken {
4033 source: uws,
4034 offset: 945,
4035 length: 6,
4036 token: Token::Word(Word::Word("ЕГО".to_string())),
4037 },
4038 PositionalToken {
4039 source: uws,
4040 offset: 951,
4041 length: 1,
4042 token: Token::Special(Special::Separator(Separator::Space)),
4043 },
4044 PositionalToken {
4045 source: uws,
4046 offset: 952,
4047 length: 18,
4048 token: Token::Word(Word::Word("ПРАЗДНУЕМ".to_string())),
4049 },
4050 PositionalToken {
4051 source: uws,
4052 offset: 1063,
4053 length: 2,
4054 token: Token::Word(Word::Word("В".to_string())),
4055 },
4056 PositionalToken {
4057 source: uws,
4058 offset: 1065,
4059 length: 1,
4060 token: Token::Special(Special::Separator(Separator::Space)),
4061 },
4062 PositionalToken {
4063 source: uws,
4064 offset: 1066,
4065 length: 4,
4066 token: Token::Word(Word::Number(Number::Integer(1987))),
4067 },
4068 PositionalToken {
4069 source: uws,
4070 offset: 1070,
4071 length: 1,
4072 token: Token::Special(Special::Separator(Separator::Space)),
4073 },
4074 PositionalToken {
4075 source: uws,
4076 offset: 1071,
4077 length: 8,
4078 token: Token::Word(Word::Word("году".to_string())),
4079 },
4080 PositionalToken {
4081 source: uws,
4082 offset: 1079,
4083 length: 1,
4084 token: Token::Special(Special::Separator(Separator::Space)),
4085 },
4086 PositionalToken {
4087 source: uws,
4088 offset: 1080,
4089 length: 14,
4090 token: Token::Word(Word::Word("комитет".to_string())),
4091 },
4092 PositionalToken {
4093 source: uws,
4094 offset: 1094,
4095 length: 1,
4096 token: Token::Special(Special::Separator(Separator::Space)),
4097 },
4098 PositionalToken {
4099 source: uws,
4100 offset: 1095,
4101 length: 14,
4102 token: Token::Word(Word::Word("госдумы".to_string())),
4103 },
4104 PositionalToken {
4105 source: uws,
4106 offset: 1109,
4107 length: 1,
4108 token: Token::Special(Special::Separator(Separator::Space)),
4109 },
4110 PositionalToken {
4111 source: uws,
4112 offset: 1110,
4113 length: 4,
4114 token: Token::Word(Word::Word("по".to_string())),
4115 },
4116 PositionalToken {
4117 source: uws,
4118 offset: 1114,
4119 length: 1,
4120 token: Token::Special(Special::Separator(Separator::Space)),
4121 },
4122 PositionalToken {
4123 source: uws,
4124 offset: 1115,
4125 length: 10,
4126 token: Token::Word(Word::Word("делам".to_string())),
4127 },
4128 PositionalToken {
4129 source: uws,
4130 offset: 1125,
4131 length: 1,
4132 token: Token::Special(Special::Separator(Separator::Space)),
4133 },
4134 PositionalToken {
4135 source: uws,
4136 offset: 1126,
4137 length: 12,
4138 token: Token::Word(Word::Word("женщин".to_string())),
4139 },
4140 PositionalToken {
4141 source: uws,
4142 offset: 1138,
4143 length: 1,
4144 token: Token::Special(Special::Punctuation(',')),
4145 },
4146 PositionalToken {
4147 source: uws,
4148 offset: 1139,
4149 length: 1,
4150 token: Token::Special(Special::Separator(Separator::Space)),
4151 },
4152 PositionalToken {
4153 source: uws,
4154 offset: 1140,
4155 length: 10,
4156 token: Token::Word(Word::Word("семьи".to_string())),
4157 },
4158 PositionalToken {
4159 source: uws,
4160 offset: 1150,
4161 length: 1,
4162 token: Token::Special(Special::Separator(Separator::Space)),
4163 },
4164 PositionalToken {
4165 source: uws,
4166 offset: 1151,
4167 length: 2,
4168 token: Token::Word(Word::Word("и".to_string())),
4169 },
4170 PositionalToken {
4171 source: uws,
4172 offset: 1153,
4173 length: 1,
4174 token: Token::Special(Special::Separator(Separator::Space)),
4175 },
4176 PositionalToken {
4177 source: uws,
4178 offset: 1154,
4179 length: 16,
4180 token: Token::Word(Word::Word("молодежи".to_string())),
4181 },
4182 PositionalToken {
4183 source: uws,
4184 offset: 1170,
4185 length: 1,
4186 token: Token::Special(Special::Separator(Separator::Space)),
4187 },
4188 PositionalToken {
4189 source: uws,
4190 offset: 1171,
4191 length: 16,
4192 token: Token::Word(Word::Word("выступил".to_string())),
4193 },
4194 PositionalToken {
4195 source: uws,
4196 offset: 1187,
4197 length: 1,
4198 token: Token::Special(Special::Separator(Separator::Space)),
4199 },
4200 PositionalToken {
4201 source: uws,
4202 offset: 1188,
4203 length: 2,
4204 token: Token::Word(Word::Word("с".to_string())),
4205 },
4206 PositionalToken {
4207 source: uws,
4208 offset: 1190,
4209 length: 1,
4210 token: Token::Special(Special::Separator(Separator::Space)),
4211 },
4212 PositionalToken {
4213 source: uws,
4214 offset: 1191,
4215 length: 24,
4216 token: Token::Word(Word::Word("предложением".to_string())),
4217 },
4218 PositionalToken {
4219 source: uws,
4220 offset: 1215,
4221 length: 1,
4222 token: Token::Special(Special::Separator(Separator::Space)),
4223 },
4224 PositionalToken {
4225 source: uws,
4226 offset: 1216,
4227 length: 16,
4228 token: Token::Word(Word::Word("учредить".to_string())),
4229 },
4230 PositionalToken {
4231 source: uws,
4232 offset: 1232,
4233 length: 1,
4234 token: Token::Special(Special::Separator(Separator::Space)),
4235 },
4236 PositionalToken {
4237 source: uws,
4238 offset: 1233,
4239 length: 2,
4240 token: Token::Special(Special::Punctuation('«')),
4241 },
4242 PositionalToken {
4243 source: uws,
4244 offset: 1235,
4245 length: 8,
4246 token: Token::Word(Word::Word("День".to_string())),
4247 },
4248 PositionalToken {
4249 source: uws,
4250 offset: 1243,
4251 length: 1,
4252 token: Token::Special(Special::Separator(Separator::Space)),
4253 },
4254 PositionalToken {
4255 source: uws,
4256 offset: 1244,
4257 length: 8,
4258 token: Token::Word(Word::Word("мамы".to_string())),
4259 },
4260 PositionalToken {
4261 source: uws,
4262 offset: 1252,
4263 length: 2,
4264 token: Token::Special(Special::Punctuation('»')),
4265 },
4266 PositionalToken {
4267 source: uws,
4268 offset: 1254,
4269 length: 1,
4270 token: Token::Special(Special::Punctuation(',')),
4271 },
4272 PositionalToken {
4273 source: uws,
4274 offset: 1255,
4275 length: 1,
4276 token: Token::Special(Special::Separator(Separator::Space)),
4277 },
4278 PositionalToken {
4279 source: uws,
4280 offset: 1256,
4281 length: 2,
4282 token: Token::Word(Word::Word("а".to_string())),
4283 },
4284 PositionalToken {
4285 source: uws,
4286 offset: 1258,
4287 length: 1,
4288 token: Token::Special(Special::Separator(Separator::Space)),
4289 },
4290 PositionalToken {
4291 source: uws,
4292 offset: 1259,
4293 length: 6,
4294 token: Token::Word(Word::Word("сам".to_string())),
4295 },
4296 PositionalToken {
4297 source: uws,
4298 offset: 1265,
4299 length: 1,
4300 token: Token::Special(Special::Separator(Separator::Space)),
4301 },
4302 PositionalToken {
4303 source: uws,
4304 offset: 1266,
4305 length: 12,
4306 token: Token::Word(Word::Word("приказ".to_string())),
4307 },
4308 PositionalToken {
4309 source: uws,
4310 offset: 1278,
4311 length: 1,
4312 token: Token::Special(Special::Separator(Separator::Space)),
4313 },
4314 PositionalToken {
4315 source: uws,
4316 offset: 1279,
4317 length: 6,
4318 token: Token::Word(Word::Word("был".to_string())),
4319 },
4320 PositionalToken {
4321 source: uws,
4322 offset: 1285,
4323 length: 1,
4324 token: Token::Special(Special::Separator(Separator::Space)),
4325 },
4326 PositionalToken {
4327 source: uws,
4328 offset: 1286,
4329 length: 16,
4330 token: Token::Word(Word::Word("подписан".to_string())),
4331 },
4332 PositionalToken {
4333 source: uws,
4334 offset: 1302,
4335 length: 1,
4336 token: Token::Special(Special::Separator(Separator::Space)),
4337 },
4338 PositionalToken {
4339 source: uws,
4340 offset: 1303,
4341 length: 6,
4342 token: Token::Word(Word::Word("уже".to_string())),
4343 },
4344 PositionalToken {
4345 source: uws,
4346 offset: 1309,
4347 length: 1,
4348 token: Token::Special(Special::Separator(Separator::Space)),
4349 },
4350 PositionalToken {
4351 source: uws,
4352 offset: 1310,
4353 length: 2,
4354 token: Token::Word(Word::Number(Number::Integer(30))),
4355 },
4356 PositionalToken {
4357 source: uws,
4358 offset: 1312,
4359 length: 1,
4360 token: Token::Special(Special::Separator(Separator::Space)),
4361 },
4362 PositionalToken {
4363 source: uws,
4364 offset: 1313,
4365 length: 12,
4366 token: Token::Word(Word::Word("января".to_string())),
4367 },
4368 PositionalToken {
4369 source: uws,
4370 offset: 1325,
4371 length: 1,
4372 token: Token::Special(Special::Separator(Separator::Space)),
4373 },
4374 PositionalToken {
4375 source: uws,
4376 offset: 1326,
4377 length: 4,
4378 token: Token::Word(Word::Number(Number::Integer(1988))),
4379 },
4380 PositionalToken {
4381 source: uws,
4382 offset: 1330,
4383 length: 1,
4384 token: Token::Special(Special::Separator(Separator::Space)),
4385 },
4386 PositionalToken {
4387 source: uws,
4388 offset: 1331,
4389 length: 8,
4390 token: Token::Word(Word::Word("года".to_string())),
4391 },
4392 PositionalToken {
4393 source: uws,
4394 offset: 1339,
4395 length: 1,
4396 token: Token::Special(Special::Separator(Separator::Space)),
4397 },
4398 PositionalToken {
4399 source: uws,
4400 offset: 1340,
4401 length: 14,
4402 token: Token::Word(Word::Word("Борисом".to_string())),
4403 },
4404 PositionalToken {
4405 source: uws,
4406 offset: 1354,
4407 length: 1,
4408 token: Token::Special(Special::Separator(Separator::Space)),
4409 },
4410 PositionalToken {
4411 source: uws,
4412 offset: 1355,
4413 length: 16,
4414 token: Token::Word(Word::Word("Ельциным".to_string())),
4415 },
4416 PositionalToken {
4417 source: uws,
4418 offset: 1371,
4419 length: 1,
4420 token: Token::Special(Special::Punctuation('.')),
4421 },
4422 PositionalToken {
4423 source: uws,
4424 offset: 1372,
4425 length: 1,
4426 token: Token::Special(Special::Separator(Separator::Space)),
4427 },
4428 PositionalToken {
4429 source: uws,
4430 offset: 1373,
4431 length: 8,
4432 token: Token::Word(Word::Word("Было".to_string())),
4433 },
4434 PositionalToken {
4435 source: uws,
4436 offset: 1381,
4437 length: 1,
4438 token: Token::Special(Special::Separator(Separator::Space)),
4439 },
4440 PositionalToken {
4441 source: uws,
4442 offset: 1382,
4443 length: 12,
4444 token: Token::Word(Word::Word("решено".to_string())),
4445 },
4446 PositionalToken {
4447 source: uws,
4448 offset: 1394,
4449 length: 1,
4450 token: Token::Special(Special::Punctuation(',')),
4451 },
4452 PositionalToken {
4453 source: uws,
4454 offset: 1395,
4455 length: 1,
4456 token: Token::Special(Special::Separator(Separator::Space)),
4457 },
4458 PositionalToken {
4459 source: uws,
4460 offset: 1396,
4461 length: 6,
4462 token: Token::Word(Word::Word("что".to_string())),
4463 },
4464 PositionalToken {
4465 source: uws,
4466 offset: 1402,
4467 length: 1,
4468 token: Token::Special(Special::Separator(Separator::Space)),
4469 },
4470 PositionalToken {
4471 source: uws,
4472 offset: 1403,
4473 length: 16,
4474 token: Token::Word(Word::Word("ежегодно".to_string())),
4475 },
4476 PositionalToken {
4477 source: uws,
4478 offset: 1419,
4479 length: 1,
4480 token: Token::Special(Special::Separator(Separator::Space)),
4481 },
4482 PositionalToken {
4483 source: uws,
4484 offset: 1420,
4485 length: 2,
4486 token: Token::Word(Word::Word("в".to_string())),
4487 },
4488 PositionalToken {
4489 source: uws,
4490 offset: 1422,
4491 length: 1,
4492 token: Token::Special(Special::Separator(Separator::Space)),
4493 },
4494 PositionalToken {
4495 source: uws,
4496 offset: 1423,
4497 length: 12,
4498 token: Token::Word(Word::Word("России".to_string())),
4499 },
4500 PositionalToken {
4501 source: uws,
4502 offset: 1435,
4503 length: 1,
4504 token: Token::Special(Special::Separator(Separator::Space)),
4505 },
4506 PositionalToken {
4507 source: uws,
4508 offset: 1436,
4509 length: 22,
4510 token: Token::Word(Word::Word("празднество".to_string())),
4511 },
4512 PositionalToken {
4513 source: uws,
4514 offset: 1458,
4515 length: 1,
4516 token: Token::Special(Special::Separator(Separator::Space)),
4517 },
4518 PositionalToken {
4519 source: uws,
4520 offset: 1459,
4521 length: 6,
4522 token: Token::Word(Word::Word("дня".to_string())),
4523 },
4524 PositionalToken {
4525 source: uws,
4526 offset: 1465,
4527 length: 1,
4528 token: Token::Special(Special::Separator(Separator::Space)),
4529 },
4530 PositionalToken {
4531 source: uws,
4532 offset: 1466,
4533 length: 8,
4534 token: Token::Word(Word::Word("мамы".to_string())),
4535 },
4536 PositionalToken {
4537 source: uws,
4538 offset: 1474,
4539 length: 1,
4540 token: Token::Special(Special::Separator(Separator::Space)),
4541 },
4542 PositionalToken {
4543 source: uws,
4544 offset: 1475,
4545 length: 10,
4546 token: Token::Word(Word::Word("будет".to_string())),
4547 },
4548 PositionalToken {
4549 source: uws,
4550 offset: 1485,
4551 length: 1,
4552 token: Token::Special(Special::Separator(Separator::Space)),
4553 },
4554 PositionalToken {
4555 source: uws,
4556 offset: 1486,
4557 length: 16,
4558 token: Token::Word(Word::Word("выпадать".to_string())),
4559 },
4560 PositionalToken {
4561 source: uws,
4562 offset: 1502,
4563 length: 1,
4564 token: Token::Special(Special::Separator(Separator::Space)),
4565 },
4566 PositionalToken {
4567 source: uws,
4568 offset: 1503,
4569 length: 4,
4570 token: Token::Word(Word::Word("на".to_string())),
4571 },
4572 PositionalToken {
4573 source: uws,
4574 offset: 1507,
4575 length: 1,
4576 token: Token::Special(Special::Separator(Separator::Space)),
4577 },
4578 PositionalToken {
4579 source: uws,
4580 offset: 1508,
4581 length: 18,
4582 token: Token::Word(Word::Word("последнее".to_string())),
4583 },
4584 PositionalToken {
4585 source: uws,
4586 offset: 1526,
4587 length: 1,
4588 token: Token::Special(Special::Separator(Separator::Space)),
4589 },
4590 PositionalToken {
4591 source: uws,
4592 offset: 1527,
4593 length: 22,
4594 token: Token::Word(Word::Word("воскресенье".to_string())),
4595 },
4596 PositionalToken {
4597 source: uws,
4598 offset: 1549,
4599 length: 1,
4600 token: Token::Special(Special::Separator(Separator::Space)),
4601 },
4602 PositionalToken {
4603 source: uws,
4604 offset: 1550,
4605 length: 12,
4606 token: Token::Word(Word::Word("ноября".to_string())),
4607 },
4608 PositionalToken {
4609 source: uws,
4610 offset: 1562,
4611 length: 1,
4612 token: Token::Special(Special::Punctuation('.')),
4613 },
4614 PositionalToken {
4615 source: uws,
4616 offset: 1563,
4617 length: 1,
4618 token: Token::Special(Special::Separator(Separator::Space)),
4619 },
4620 PositionalToken {
4621 source: uws,
4622 offset: 1664,
4623 length: 1,
4624 token: Token::Special(Special::Separator(Separator::Newline)),
4625 },
4626 PositionalToken {
4627 source: uws,
4628 offset: 1665,
4629 length: 2,
4630 token: Token::Special(Special::Separator(Separator::Space)),
4631 },
4632 PositionalToken {
4633 source: uws,
4634 offset: 1725,
4635 length: 1,
4636 token: Token::Special(Special::Separator(Separator::Newline)),
4637 },
4638 PositionalToken {
4639 source: uws,
4640 offset: 1726,
4641 length: 4,
4642 token: Token::Special(Special::Separator(Separator::Space)),
4643 },
4644 PositionalToken {
4645 source: uws,
4646 offset: 2725,
4647 length: 1,
4648 token: Token::Special(Special::Separator(Separator::Newline)),
4649 },
4650 PositionalToken {
4651 source: uws,
4652 offset: 2726,
4653 length: 2,
4654 token: Token::Special(Special::Separator(Separator::Space)),
4655 },
4656 PositionalToken {
4657 source: uws,
4658 offset: 2888,
4659 length: 1,
4660 token: Token::Special(Special::Separator(Separator::Newline)),
4661 },
4662 PositionalToken {
4663 source: uws,
4664 offset: 2889,
4665 length: 2,
4666 token: Token::Special(Special::Separator(Separator::Space)),
4667 },
4668 PositionalToken {
4669 source: uws,
4670 offset: 2891,
4671 length: 1,
4672 token: Token::Special(Special::Separator(Separator::Newline)),
4673 },
4674 PositionalToken {
4675 source: uws,
4676 offset: 2904,
4677 length: 1,
4678 token: Token::Special(Special::Separator(Separator::Newline)),
4679 },
4680 PositionalToken {
4681 source: uws,
4682 offset: 2905,
4683 length: 4,
4684 token: Token::Special(Special::Separator(Separator::Space)),
4685 },
4686 ];
4687
4688 let text = Text::new({
4689 uws.into_source()
4690 .pipe(tagger::Builder::new().create().into_breaker())
4691 .pipe(entities::Builder::new().create().into_piped())
4692 .into_separator()
4693 })
4694 .unwrap();
4695
4696 let lib_res = text
4697 .into_tokenizer(TokenizerParams::v1())
4698 .filter_map(|tt| tt.into_original_token_1())
4699 .collect::<Vec<_>>();
4700
4701 check_results(&result, &lib_res, uws);
4702 }
4703
4704 #[test]
4755 fn numerical_no_split() {
4756 let uws = "12.02.18 31.28.34 23.11.2018 123.568.365.234.578 127.0.0.1 1st 1кг 123123афываыв 12321фвафыов234выалфо 12_123_343.4234_4234";
4757 let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
4758 let result = vec![
4760 PositionalToken {
4761 source: uws,
4762 offset: 0,
4763 length: 8,
4764 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4765 "12.02.18".to_string(),
4766 ))),
4767 },
4768 PositionalToken {
4769 source: uws,
4770 offset: 8,
4771 length: 1,
4772 token: Token::Special(Special::Separator(Separator::Space)),
4773 },
4774 PositionalToken {
4775 source: uws,
4776 offset: 9,
4777 length: 8,
4778 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4779 "31.28.34".to_string(),
4780 ))),
4781 },
4782 PositionalToken {
4783 source: uws,
4784 offset: 17,
4785 length: 1,
4786 token: Token::Special(Special::Separator(Separator::Space)),
4787 },
4788 PositionalToken {
4789 source: uws,
4790 offset: 18,
4791 length: 10,
4792 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4793 "23.11.2018".to_string(),
4794 ))),
4795 },
4796 PositionalToken {
4797 source: uws,
4798 offset: 28,
4799 length: 1,
4800 token: Token::Special(Special::Separator(Separator::Space)),
4801 },
4802 PositionalToken {
4803 source: uws,
4804 offset: 29,
4805 length: 19,
4806 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4807 "123.568.365.234.578".to_string(),
4808 ))),
4809 },
4810 PositionalToken {
4811 source: uws,
4812 offset: 48,
4813 length: 1,
4814 token: Token::Special(Special::Separator(Separator::Space)),
4815 },
4816 PositionalToken {
4817 source: uws,
4818 offset: 49,
4819 length: 9,
4820 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4821 "127.0.0.1".to_string(),
4822 ))),
4823 },
4824 PositionalToken {
4825 source: uws,
4826 offset: 58,
4827 length: 1,
4828 token: Token::Special(Special::Separator(Separator::Space)),
4829 },
4830 PositionalToken {
4831 source: uws,
4832 offset: 59,
4833 length: 3,
4834 token: Token::Word(Word::Numerical(Numerical::Measures("1st".to_string()))),
4835 },
4836 PositionalToken {
4837 source: uws,
4838 offset: 62,
4839 length: 1,
4840 token: Token::Special(Special::Separator(Separator::Space)),
4841 },
4842 PositionalToken {
4843 source: uws,
4844 offset: 63,
4845 length: 5,
4846 token: Token::Word(Word::Numerical(Numerical::Measures("1кг".to_string()))),
4847 },
4848 PositionalToken {
4849 source: uws,
4850 offset: 68,
4851 length: 1,
4852 token: Token::Special(Special::Separator(Separator::Space)),
4853 },
4854 PositionalToken {
4855 source: uws,
4856 offset: 69,
4857 length: 20,
4858 token: Token::Word(Word::Numerical(Numerical::Measures(
4859 "123123афываыв".to_string(),
4860 ))),
4861 },
4862 PositionalToken {
4863 source: uws,
4864 offset: 89,
4865 length: 1,
4866 token: Token::Special(Special::Separator(Separator::Space)),
4867 },
4868 PositionalToken {
4869 source: uws,
4870 offset: 90,
4871 length: 34,
4872 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
4873 "12321фвафыов234выалфо".to_string(),
4874 ))),
4875 },
4876 PositionalToken {
4877 source: uws,
4878 offset: 124,
4879 length: 1,
4880 token: Token::Special(Special::Separator(Separator::Space)),
4881 },
4882 PositionalToken {
4883 source: uws,
4884 offset: 125,
4885 length: 20,
4886 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
4887 "12_123_343.4234_4234".to_string(),
4888 ))),
4889 },
4890 ];
4891 check_results(&result, &lib_res, uws);
4892 }
4893
4894 #[test]
4895 fn numerical_default() {
4896 let uws = "12.02.18 31.28.34 23.11.2018 123.568.365.234.578 127.0.0.1 1st 1кг 123123афываыв 12321фвафыов234выалфо 12_123_343.4234_4234";
4897 let lib_res = uws
4898 .into_tokenizer(TokenizerParams::v1())
4899 .collect::<Vec<_>>();
4900 let result = vec![
4902 PositionalToken {
4903 source: uws,
4904 offset: 0,
4905 length: 2,
4906 token: Token::Word(Word::Number(Number::Integer(12))),
4907 },
4908 PositionalToken {
4909 source: uws,
4910 offset: 2,
4911 length: 1,
4912 token: Token::Special(Special::Punctuation('.')),
4913 },
4914 PositionalToken {
4915 source: uws,
4916 offset: 3,
4917 length: 2,
4918 token: Token::Word(Word::Number(Number::Integer(2))),
4919 },
4920 PositionalToken {
4921 source: uws,
4922 offset: 5,
4923 length: 1,
4924 token: Token::Special(Special::Punctuation('.')),
4925 },
4926 PositionalToken {
4927 source: uws,
4928 offset: 6,
4929 length: 2,
4930 token: Token::Word(Word::Number(Number::Integer(18))),
4931 },
4932 PositionalToken {
4933 source: uws,
4934 offset: 8,
4935 length: 1,
4936 token: Token::Special(Special::Separator(Separator::Space)),
4937 },
4938 PositionalToken {
4939 source: uws,
4940 offset: 9,
4941 length: 2,
4942 token: Token::Word(Word::Number(Number::Integer(31))),
4943 },
4944 PositionalToken {
4945 source: uws,
4946 offset: 11,
4947 length: 1,
4948 token: Token::Special(Special::Punctuation('.')),
4949 },
4950 PositionalToken {
4951 source: uws,
4952 offset: 12,
4953 length: 2,
4954 token: Token::Word(Word::Number(Number::Integer(28))),
4955 },
4956 PositionalToken {
4957 source: uws,
4958 offset: 14,
4959 length: 1,
4960 token: Token::Special(Special::Punctuation('.')),
4961 },
4962 PositionalToken {
4963 source: uws,
4964 offset: 15,
4965 length: 2,
4966 token: Token::Word(Word::Number(Number::Integer(34))),
4967 },
4968 PositionalToken {
4969 source: uws,
4970 offset: 17,
4971 length: 1,
4972 token: Token::Special(Special::Separator(Separator::Space)),
4973 },
4974 PositionalToken {
4975 source: uws,
4976 offset: 18,
4977 length: 2,
4978 token: Token::Word(Word::Number(Number::Integer(23))),
4979 },
4980 PositionalToken {
4981 source: uws,
4982 offset: 20,
4983 length: 1,
4984 token: Token::Special(Special::Punctuation('.')),
4985 },
4986 PositionalToken {
4987 source: uws,
4988 offset: 21,
4989 length: 2,
4990 token: Token::Word(Word::Number(Number::Integer(11))),
4991 },
4992 PositionalToken {
4993 source: uws,
4994 offset: 23,
4995 length: 1,
4996 token: Token::Special(Special::Punctuation('.')),
4997 },
4998 PositionalToken {
4999 source: uws,
5000 offset: 24,
5001 length: 4,
5002 token: Token::Word(Word::Number(Number::Integer(2018))),
5003 },
5004 PositionalToken {
5005 source: uws,
5006 offset: 28,
5007 length: 1,
5008 token: Token::Special(Special::Separator(Separator::Space)),
5009 },
5010 PositionalToken {
5011 source: uws,
5012 offset: 29,
5013 length: 3,
5014 token: Token::Word(Word::Number(Number::Integer(123))),
5015 },
5016 PositionalToken {
5017 source: uws,
5018 offset: 32,
5019 length: 1,
5020 token: Token::Special(Special::Punctuation('.')),
5021 },
5022 PositionalToken {
5023 source: uws,
5024 offset: 33,
5025 length: 3,
5026 token: Token::Word(Word::Number(Number::Integer(568))),
5027 },
5028 PositionalToken {
5029 source: uws,
5030 offset: 36,
5031 length: 1,
5032 token: Token::Special(Special::Punctuation('.')),
5033 },
5034 PositionalToken {
5035 source: uws,
5036 offset: 37,
5037 length: 3,
5038 token: Token::Word(Word::Number(Number::Integer(365))),
5039 },
5040 PositionalToken {
5041 source: uws,
5042 offset: 40,
5043 length: 1,
5044 token: Token::Special(Special::Punctuation('.')),
5045 },
5046 PositionalToken {
5047 source: uws,
5048 offset: 41,
5049 length: 3,
5050 token: Token::Word(Word::Number(Number::Integer(234))),
5051 },
5052 PositionalToken {
5053 source: uws,
5054 offset: 44,
5055 length: 1,
5056 token: Token::Special(Special::Punctuation('.')),
5057 },
5058 PositionalToken {
5059 source: uws,
5060 offset: 45,
5061 length: 3,
5062 token: Token::Word(Word::Number(Number::Integer(578))),
5063 },
5064 PositionalToken {
5065 source: uws,
5066 offset: 48,
5067 length: 1,
5068 token: Token::Special(Special::Separator(Separator::Space)),
5069 },
5070 PositionalToken {
5071 source: uws,
5072 offset: 49,
5073 length: 3,
5074 token: Token::Word(Word::Number(Number::Integer(127))),
5075 },
5076 PositionalToken {
5077 source: uws,
5078 offset: 52,
5079 length: 1,
5080 token: Token::Special(Special::Punctuation('.')),
5081 },
5082 PositionalToken {
5083 source: uws,
5084 offset: 53,
5085 length: 1,
5086 token: Token::Word(Word::Number(Number::Integer(0))),
5087 },
5088 PositionalToken {
5089 source: uws,
5090 offset: 54,
5091 length: 1,
5092 token: Token::Special(Special::Punctuation('.')),
5093 },
5094 PositionalToken {
5095 source: uws,
5096 offset: 55,
5097 length: 1,
5098 token: Token::Word(Word::Number(Number::Integer(0))),
5099 },
5100 PositionalToken {
5101 source: uws,
5102 offset: 56,
5103 length: 1,
5104 token: Token::Special(Special::Punctuation('.')),
5105 },
5106 PositionalToken {
5107 source: uws,
5108 offset: 57,
5109 length: 1,
5110 token: Token::Word(Word::Number(Number::Integer(1))),
5111 },
5112 PositionalToken {
5113 source: uws,
5114 offset: 58,
5115 length: 1,
5116 token: Token::Special(Special::Separator(Separator::Space)),
5117 },
5118 PositionalToken {
5119 source: uws,
5120 offset: 59,
5121 length: 3,
5122 token: Token::Word(Word::Numerical(Numerical::Measures("1st".to_string()))),
5123 },
5124 PositionalToken {
5125 source: uws,
5126 offset: 62,
5127 length: 1,
5128 token: Token::Special(Special::Separator(Separator::Space)),
5129 },
5130 PositionalToken {
5131 source: uws,
5132 offset: 63,
5133 length: 5,
5134 token: Token::Word(Word::Numerical(Numerical::Measures("1кг".to_string()))),
5135 },
5136 PositionalToken {
5137 source: uws,
5138 offset: 68,
5139 length: 1,
5140 token: Token::Special(Special::Separator(Separator::Space)),
5141 },
5142 PositionalToken {
5143 source: uws,
5144 offset: 69,
5145 length: 20,
5146 token: Token::Word(Word::Numerical(Numerical::Measures(
5147 "123123афываыв".to_string(),
5148 ))),
5149 },
5150 PositionalToken {
5151 source: uws,
5152 offset: 89,
5153 length: 1,
5154 token: Token::Special(Special::Separator(Separator::Space)),
5155 },
5156 PositionalToken {
5157 source: uws,
5158 offset: 90,
5159 length: 34,
5160 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
5161 "12321фвафыов234выалфо".to_string(),
5162 ))),
5163 },
5164 PositionalToken {
5165 source: uws,
5166 offset: 124,
5167 length: 1,
5168 token: Token::Special(Special::Separator(Separator::Space)),
5169 },
5170 PositionalToken {
5171 source: uws,
5172 offset: 125,
5173 length: 2,
5174 token: Token::Word(Word::Number(Number::Integer(12))),
5175 },
5176 PositionalToken {
5177 source: uws,
5178 offset: 127,
5179 length: 1,
5180 token: Token::Special(Special::Punctuation('_')),
5181 },
5182 PositionalToken {
5183 source: uws,
5184 offset: 128,
5185 length: 3,
5186 token: Token::Word(Word::Number(Number::Integer(123))),
5187 },
5188 PositionalToken {
5189 source: uws,
5190 offset: 131,
5191 length: 1,
5192 token: Token::Special(Special::Punctuation('_')),
5193 },
5194 PositionalToken {
5195 source: uws,
5196 offset: 132,
5197 length: 3,
5198 token: Token::Word(Word::Number(Number::Integer(343))),
5199 },
5200 PositionalToken {
5201 source: uws,
5202 offset: 135,
5203 length: 1,
5204 token: Token::Special(Special::Punctuation('.')),
5205 },
5206 PositionalToken {
5207 source: uws,
5208 offset: 136,
5209 length: 4,
5210 token: Token::Word(Word::Number(Number::Integer(4234))),
5211 },
5212 PositionalToken {
5213 source: uws,
5214 offset: 140,
5215 length: 1,
5216 token: Token::Special(Special::Punctuation('_')),
5217 },
5218 PositionalToken {
5219 source: uws,
5220 offset: 141,
5221 length: 4,
5222 token: Token::Word(Word::Number(Number::Integer(4234))),
5223 },
5224 ];
5225 check_results(&result, &lib_res, uws);
5226 }
5227
    /// Fixture selector for the multilingual tokenizer tests; each variant
    /// picks one text-plus-expected-tokens pair from `get_lang_test`.
    enum Lang {
        // Chinese fixture
        Zho,
        // Japanese fixture
        Jpn,
        // Korean fixture
        Kor,
        // Arabic-script fixture (NOTE(review): the sample text looks Persian — confirm)
        Ara,
        // Greek fixture
        Ell,
    }
5247
5248 #[test]
5249 fn test_lang_zho() {
5250 let (uws, result) = get_lang_test(Lang::Zho);
5251 let lib_res = uws
5252 .into_tokenizer(TokenizerParams::v1())
5253 .collect::<Vec<_>>();
5254 check_results(&result, &lib_res, &uws);
5255 }
5256
5257 #[test]
5258 fn test_lang_jpn() {
5259 let (uws, result) = get_lang_test(Lang::Jpn);
5260 let lib_res = uws
5261 .into_tokenizer(TokenizerParams::v1())
5262 .collect::<Vec<_>>();
5263 check_results(&result, &lib_res, &uws);
5264 }
5265
5266 #[test]
5267 fn test_lang_kor() {
5268 let (uws, result) = get_lang_test(Lang::Kor);
5269 let lib_res = uws
5270 .into_tokenizer(TokenizerParams::v1())
5271 .collect::<Vec<_>>();
5272 check_results(&result, &lib_res, &uws);
5273 }
5274
5275 #[test]
5276 fn test_lang_ara() {
5277 let (uws, result) = get_lang_test(Lang::Ara);
5278 let lib_res = uws
5279 .into_tokenizer(TokenizerParams::v1())
5280 .collect::<Vec<_>>();
5281 check_results(&result, &lib_res, &uws);
5282 }
5283
5284 #[test]
5285 fn test_lang_ell() {
5286 let (uws, result) = get_lang_test(Lang::Ell);
5287 let lib_res = uws
5288 .into_tokenizer(TokenizerParams::v1())
5289 .collect::<Vec<_>>();
5290 check_results(&result, &lib_res, &uws);
5291 }
5292
5293 fn get_lang_test(lng: Lang) -> (String, Vec<PositionalToken>) {
5294 let uws = match lng {
5295 Lang::Zho => "美国电视连续剧《超人前传》的第一集《试播集》于2001年10月16日在電視網首播,剧集主创人阿尔弗雷德·高夫和迈尔斯·米勒編劇,大卫·努特尔执导。这一试播首次向观众引荐了克拉克·肯特一角,他是位拥有超能力的外星孤儿,与家人和朋友一起在堪薩斯州虚构小镇斯莫维尔生活。在这一集里,肯特首度得知自己的来历,同时还需要阻止一位学生试图杀死镇上高中多名学生的报复之举。本集节目里引入了多个之后将贯穿全季甚至整部剧集的主题元素,例如几位主要角色之间的三角恋情。电视剧在加拿大溫哥華取景,旨在选用其“美国中产阶级”景观,主创人花了5个月的时间专门用于为主角物色合适的演员。试播集在所有演员选好4天后正式开拍。由于时间上的限制,剧组无法搭建好实体外景,因此只能使用计算机绘图技术将数字化的外景插入到镜头中。节目一经上映就打破了电视网的多项收视纪录,并且获得了评论员的普遍好评和多个奖项提名,并在其中两项上胜出",
5296 Lang::Kor => "플레이스테이션 은 소니 컴퓨터 엔터테인먼트가 개발한 세 번째 가정용 게임기이다. 마이크로소프트의 엑스박스 360, 닌텐도의 Wii와 경쟁하고 있다. 이전 제품에서 온라인 플레이 기능을 비디오 게임 개발사에 전적으로 의존하던 것과 달리 통합 온라인 게임 서비스인 플레이스테이션 네트워크 서비스를 발매와 함께 시작해 제공하고 있으며, 탄탄한 멀티미디어 재생 기능, 플레이스테이션 포터블과의 연결, 고화질 광학 디스크 포맷인 블루레이 디스크 재생 기능 등의 기능을 갖추고 있다. 2006년 11월 11일에 일본에서 처음으로 출시했으며, 11월 17일에는 북미 지역, 2007년 3월 23일에는 유럽과 오세아니아 지역에서, 대한민국의 경우 6월 5일부터 일주일간 예약판매를 실시해, 매일 준비한 수량이 동이 나는 등 많은 관심을 받았으며 6월 16일에 정식 출시 행사를 열었다",
5297 Lang::Jpn => "熊野三山本願所は、15世紀末以降における熊野三山(熊野本宮、熊野新宮、熊野那智)の造営・修造のための勧進を担った組織の総称。 熊野三山を含めて、日本における古代から中世前半にかけての寺社の造営は、寺社領経営のような恒常的財源、幕府や朝廷などからの一時的な造営料所の寄進、あるいは公権力からの臨時の保護によって行われていた。しかしながら、熊野三山では、これらの財源はすべて15世紀半ばまでに実効性を失った",
5298 Lang::Ara => "لشکرکشیهای روسهای وارنگی به دریای خزر مجموعهای از حملات نظامی در بین سالهای ۸۶۴ تا ۱۰۴۱ میلادی به سواحل دریای خزر بودهاست. روسهای وارنگی ابتدا در قرن نهم میلادی به عنوان بازرگانان پوست، عسل و برده در سرزمینهای اسلامی(سرکلند) ظاهر شدند. این بازرگانان در مسیر تجاری ولگا به خرید و فروش میپرداختند. نخستین حملهٔ آنان در فاصله سالهای ۸۶۴ تا ۸۸۴ میلادی در مقیاسی کوچک علیه علویان طبرستان رخ داد. نخستین یورش بزرگ روسها در سال ۹۱۳ رخ داد و آنان با ۵۰۰ فروند درازکشتی شهر گرگان و اطراف آن را غارت کردند. آنها در این حمله مقداری کالا و برده را به تاراج بردند و در راه بازگشتن به سمت شمال، در دلتای ولگا، مورد حملهٔ خزرهای مسلمان قرار گرفتند و بعضی از آنان موفق به فرار شدند، ولی در میانهٔ ولگا به قتل رسیدند. دومین هجوم بزرگ روسها به دریای خزر در سال ۹۴۳ به وقوع پیوست. در این دوره ایگور یکم، حاکم روس کیف، رهبری روسها را در دست داشت. روسها پس از توافق با دولت خزرها برای عبور امن از منطقه، تا رود کورا و اعماق قفقاز پیش رفتند و در سال ۹۴۳ موفق شدند بندر بردعه، پایتخت اران (جمهوری آذربایجان کنونی)، را تصرف کنند. روسها در آنجا به مدت چند ماه ماندند و بسیاری از ساکنان شهر را کشتند و از راه غارتگری اموالی را به تاراج بردند. تنها دلیل بازگشت آنان ",
5299 Lang::Ell => "Το Πρόγραμμα υλοποιείται εξ ολοκλήρου από απόσταση και μπορεί να συμμετέχει κάθε εμπλεκόμενος στη ή/και ενδιαφερόμενος για τη διδασκαλία της Ελληνικής ως δεύτερης/ξένης γλώσσας στην Ελλάδα και στο εξωτερικό, αρκεί να είναι απόφοιτος ελληνικής φιλολογίας, ξένων φιλολογιών, παιδαγωγικών τμημάτων, θεολογικών σχολών ή άλλων πανεπιστημιακών τμημάτων ελληνικών ή ισότιμων ξένων πανεπιστημίων. Υπό όρους γίνονται δεκτοί υποψήφιοι που δεν έχουν ολοκληρώσει σπουδές τριτοβάθμιας εκπαίδευσης.",
5300 };
5301 let tokens = match lng {
5302 Lang::Zho => vec![
5303 PositionalToken {
5304 source: uws,
5305 offset: 0,
5306 length: 3,
5307 token: Token::Word(Word::Word("美".to_string())),
5308 },
5309 PositionalToken {
5310 source: uws,
5311 offset: 3,
5312 length: 3,
5313 token: Token::Word(Word::Word("国".to_string())),
5314 },
5315 PositionalToken {
5316 source: uws,
5317 offset: 6,
5318 length: 3,
5319 token: Token::Word(Word::Word("电".to_string())),
5320 },
5321 PositionalToken {
5322 source: uws,
5323 offset: 9,
5324 length: 3,
5325 token: Token::Word(Word::Word("视".to_string())),
5326 },
5327 PositionalToken {
5328 source: uws,
5329 offset: 12,
5330 length: 3,
5331 token: Token::Word(Word::Word("连".to_string())),
5332 },
5333 PositionalToken {
5334 source: uws,
5335 offset: 15,
5336 length: 3,
5337 token: Token::Word(Word::Word("续".to_string())),
5338 },
5339 PositionalToken {
5340 source: uws,
5341 offset: 18,
5342 length: 3,
5343 token: Token::Word(Word::Word("剧".to_string())),
5344 },
5345 PositionalToken {
5346 source: uws,
5347 offset: 21,
5348 length: 3,
5349 token: Token::Special(Special::Punctuation('《')),
5350 },
5351 PositionalToken {
5352 source: uws,
5353 offset: 24,
5354 length: 3,
5355 token: Token::Word(Word::Word("超".to_string())),
5356 },
5357 PositionalToken {
5358 source: uws,
5359 offset: 27,
5360 length: 3,
5361 token: Token::Word(Word::Word("人".to_string())),
5362 },
5363 PositionalToken {
5364 source: uws,
5365 offset: 30,
5366 length: 3,
5367 token: Token::Word(Word::Word("前".to_string())),
5368 },
5369 PositionalToken {
5370 source: uws,
5371 offset: 33,
5372 length: 3,
5373 token: Token::Word(Word::Word("传".to_string())),
5374 },
5375 PositionalToken {
5376 source: uws,
5377 offset: 36,
5378 length: 3,
5379 token: Token::Special(Special::Punctuation('》')),
5380 },
5381 PositionalToken {
5382 source: uws,
5383 offset: 39,
5384 length: 3,
5385 token: Token::Word(Word::Word("的".to_string())),
5386 },
5387 PositionalToken {
5388 source: uws,
5389 offset: 42,
5390 length: 3,
5391 token: Token::Word(Word::Word("第".to_string())),
5392 },
5393 PositionalToken {
5394 source: uws,
5395 offset: 45,
5396 length: 3,
5397 token: Token::Word(Word::Word("一".to_string())),
5398 },
5399 PositionalToken {
5400 source: uws,
5401 offset: 48,
5402 length: 3,
5403 token: Token::Word(Word::Word("集".to_string())),
5404 },
5405 PositionalToken {
5406 source: uws,
5407 offset: 51,
5408 length: 3,
5409 token: Token::Special(Special::Punctuation('《')),
5410 },
5411 PositionalToken {
5412 source: uws,
5413 offset: 54,
5414 length: 3,
5415 token: Token::Word(Word::Word("试".to_string())),
5416 },
5417 PositionalToken {
5418 source: uws,
5419 offset: 57,
5420 length: 3,
5421 token: Token::Word(Word::Word("播".to_string())),
5422 },
5423 PositionalToken {
5424 source: uws,
5425 offset: 60,
5426 length: 3,
5427 token: Token::Word(Word::Word("集".to_string())),
5428 },
5429 PositionalToken {
5430 source: uws,
5431 offset: 63,
5432 length: 3,
5433 token: Token::Special(Special::Punctuation('》')),
5434 },
5435 PositionalToken {
5436 source: uws,
5437 offset: 66,
5438 length: 3,
5439 token: Token::Word(Word::Word("于".to_string())),
5440 },
5441 PositionalToken {
5442 source: uws,
5443 offset: 69,
5444 length: 4,
5445 token: Token::Word(Word::Number(Number::Integer(2001))),
5446 },
5447 PositionalToken {
5448 source: uws,
5449 offset: 73,
5450 length: 3,
5451 token: Token::Word(Word::Word("年".to_string())),
5452 },
5453 PositionalToken {
5454 source: uws,
5455 offset: 76,
5456 length: 2,
5457 token: Token::Word(Word::Number(Number::Integer(10))),
5458 },
5459 PositionalToken {
5460 source: uws,
5461 offset: 78,
5462 length: 3,
5463 token: Token::Word(Word::Word("月".to_string())),
5464 },
5465 PositionalToken {
5466 source: uws,
5467 offset: 81,
5468 length: 2,
5469 token: Token::Word(Word::Number(Number::Integer(16))),
5470 },
5471 PositionalToken {
5472 source: uws,
5473 offset: 83,
5474 length: 3,
5475 token: Token::Word(Word::Word("日".to_string())),
5476 },
5477 PositionalToken {
5478 source: uws,
5479 offset: 86,
5480 length: 3,
5481 token: Token::Word(Word::Word("在".to_string())),
5482 },
5483 PositionalToken {
5484 source: uws,
5485 offset: 89,
5486 length: 3,
5487 token: Token::Word(Word::Word("電".to_string())),
5488 },
5489 PositionalToken {
5490 source: uws,
5491 offset: 92,
5492 length: 3,
5493 token: Token::Word(Word::Word("視".to_string())),
5494 },
5495 PositionalToken {
5496 source: uws,
5497 offset: 95,
5498 length: 3,
5499 token: Token::Word(Word::Word("網".to_string())),
5500 },
5501 PositionalToken {
5502 source: uws,
5503 offset: 98,
5504 length: 3,
5505 token: Token::Word(Word::Word("首".to_string())),
5506 },
5507 PositionalToken {
5508 source: uws,
5509 offset: 101,
5510 length: 3,
5511 token: Token::Word(Word::Word("播".to_string())),
5512 },
5513 PositionalToken {
5514 source: uws,
5515 offset: 104,
5516 length: 3,
5517 token: Token::Special(Special::Punctuation(',')),
5518 },
5519 PositionalToken {
5520 source: uws,
5521 offset: 107,
5522 length: 3,
5523 token: Token::Word(Word::Word("剧".to_string())),
5524 },
5525 PositionalToken {
5526 source: uws,
5527 offset: 110,
5528 length: 3,
5529 token: Token::Word(Word::Word("集".to_string())),
5530 },
5531 PositionalToken {
5532 source: uws,
5533 offset: 113,
5534 length: 3,
5535 token: Token::Word(Word::Word("主".to_string())),
5536 },
5537 PositionalToken {
5538 source: uws,
5539 offset: 116,
5540 length: 3,
5541 token: Token::Word(Word::Word("创".to_string())),
5542 },
5543 PositionalToken {
5544 source: uws,
5545 offset: 119,
5546 length: 3,
5547 token: Token::Word(Word::Word("人".to_string())),
5548 },
5549 PositionalToken {
5550 source: uws,
5551 offset: 122,
5552 length: 3,
5553 token: Token::Word(Word::Word("阿".to_string())),
5554 },
5555 PositionalToken {
5556 source: uws,
5557 offset: 125,
5558 length: 3,
5559 token: Token::Word(Word::Word("尔".to_string())),
5560 },
5561 PositionalToken {
5562 source: uws,
5563 offset: 128,
5564 length: 3,
5565 token: Token::Word(Word::Word("弗".to_string())),
5566 },
5567 PositionalToken {
5568 source: uws,
5569 offset: 131,
5570 length: 3,
5571 token: Token::Word(Word::Word("雷".to_string())),
5572 },
5573 PositionalToken {
5574 source: uws,
5575 offset: 134,
5576 length: 3,
5577 token: Token::Word(Word::Word("德".to_string())),
5578 },
5579 PositionalToken {
5580 source: uws,
5581 offset: 137,
5582 length: 2,
5583 token: Token::Special(Special::Punctuation('·')),
5584 },
5585 PositionalToken {
5586 source: uws,
5587 offset: 139,
5588 length: 3,
5589 token: Token::Word(Word::Word("高".to_string())),
5590 },
5591 PositionalToken {
5592 source: uws,
5593 offset: 142,
5594 length: 3,
5595 token: Token::Word(Word::Word("夫".to_string())),
5596 },
5597 PositionalToken {
5598 source: uws,
5599 offset: 145,
5600 length: 3,
5601 token: Token::Word(Word::Word("和".to_string())),
5602 },
5603 PositionalToken {
5604 source: uws,
5605 offset: 148,
5606 length: 3,
5607 token: Token::Word(Word::Word("迈".to_string())),
5608 },
5609 PositionalToken {
5610 source: uws,
5611 offset: 151,
5612 length: 3,
5613 token: Token::Word(Word::Word("尔".to_string())),
5614 },
5615 PositionalToken {
5616 source: uws,
5617 offset: 154,
5618 length: 3,
5619 token: Token::Word(Word::Word("斯".to_string())),
5620 },
5621 PositionalToken {
5622 source: uws,
5623 offset: 157,
5624 length: 2,
5625 token: Token::Special(Special::Punctuation('·')),
5626 },
5627 PositionalToken {
5628 source: uws,
5629 offset: 159,
5630 length: 3,
5631 token: Token::Word(Word::Word("米".to_string())),
5632 },
5633 PositionalToken {
5634 source: uws,
5635 offset: 162,
5636 length: 3,
5637 token: Token::Word(Word::Word("勒".to_string())),
5638 },
5639 PositionalToken {
5640 source: uws,
5641 offset: 165,
5642 length: 3,
5643 token: Token::Word(Word::Word("編".to_string())),
5644 },
5645 PositionalToken {
5646 source: uws,
5647 offset: 168,
5648 length: 3,
5649 token: Token::Word(Word::Word("劇".to_string())),
5650 },
5651 PositionalToken {
5652 source: uws,
5653 offset: 171,
5654 length: 3,
5655 token: Token::Special(Special::Punctuation(',')),
5656 },
5657 PositionalToken {
5658 source: uws,
5659 offset: 174,
5660 length: 3,
5661 token: Token::Word(Word::Word("大".to_string())),
5662 },
5663 PositionalToken {
5664 source: uws,
5665 offset: 177,
5666 length: 3,
5667 token: Token::Word(Word::Word("卫".to_string())),
5668 },
5669 PositionalToken {
5670 source: uws,
5671 offset: 180,
5672 length: 2,
5673 token: Token::Special(Special::Punctuation('·')),
5674 },
5675 PositionalToken {
5676 source: uws,
5677 offset: 182,
5678 length: 3,
5679 token: Token::Word(Word::Word("努".to_string())),
5680 },
5681 PositionalToken {
5682 source: uws,
5683 offset: 185,
5684 length: 3,
5685 token: Token::Word(Word::Word("特".to_string())),
5686 },
5687 PositionalToken {
5688 source: uws,
5689 offset: 188,
5690 length: 3,
5691 token: Token::Word(Word::Word("尔".to_string())),
5692 },
5693 PositionalToken {
5694 source: uws,
5695 offset: 191,
5696 length: 3,
5697 token: Token::Word(Word::Word("执".to_string())),
5698 },
5699 PositionalToken {
5700 source: uws,
5701 offset: 194,
5702 length: 3,
5703 token: Token::Word(Word::Word("导".to_string())),
5704 },
5705 PositionalToken {
5706 source: uws,
5707 offset: 197,
5708 length: 3,
5709 token: Token::Special(Special::Punctuation('。')),
5710 },
5711 PositionalToken {
5712 source: uws,
5713 offset: 200,
5714 length: 3,
5715 token: Token::Word(Word::Word("这".to_string())),
5716 },
5717 PositionalToken {
5718 source: uws,
5719 offset: 203,
5720 length: 3,
5721 token: Token::Word(Word::Word("一".to_string())),
5722 },
5723 PositionalToken {
5724 source: uws,
5725 offset: 206,
5726 length: 3,
5727 token: Token::Word(Word::Word("试".to_string())),
5728 },
5729 PositionalToken {
5730 source: uws,
5731 offset: 209,
5732 length: 3,
5733 token: Token::Word(Word::Word("播".to_string())),
5734 },
5735 PositionalToken {
5736 source: uws,
5737 offset: 212,
5738 length: 3,
5739 token: Token::Word(Word::Word("首".to_string())),
5740 },
5741 PositionalToken {
5742 source: uws,
5743 offset: 215,
5744 length: 3,
5745 token: Token::Word(Word::Word("次".to_string())),
5746 },
5747 PositionalToken {
5748 source: uws,
5749 offset: 218,
5750 length: 3,
5751 token: Token::Word(Word::Word("向".to_string())),
5752 },
5753 PositionalToken {
5754 source: uws,
5755 offset: 221,
5756 length: 3,
5757 token: Token::Word(Word::Word("观".to_string())),
5758 },
5759 PositionalToken {
5760 source: uws,
5761 offset: 224,
5762 length: 3,
5763 token: Token::Word(Word::Word("众".to_string())),
5764 },
5765 PositionalToken {
5766 source: uws,
5767 offset: 227,
5768 length: 3,
5769 token: Token::Word(Word::Word("引".to_string())),
5770 },
5771 PositionalToken {
5772 source: uws,
5773 offset: 230,
5774 length: 3,
5775 token: Token::Word(Word::Word("荐".to_string())),
5776 },
5777 PositionalToken {
5778 source: uws,
5779 offset: 233,
5780 length: 3,
5781 token: Token::Word(Word::Word("了".to_string())),
5782 },
5783 PositionalToken {
5784 source: uws,
5785 offset: 236,
5786 length: 3,
5787 token: Token::Word(Word::Word("克".to_string())),
5788 },
5789 PositionalToken {
5790 source: uws,
5791 offset: 239,
5792 length: 3,
5793 token: Token::Word(Word::Word("拉".to_string())),
5794 },
5795 PositionalToken {
5796 source: uws,
5797 offset: 242,
5798 length: 3,
5799 token: Token::Word(Word::Word("克".to_string())),
5800 },
5801 PositionalToken {
5802 source: uws,
5803 offset: 245,
5804 length: 2,
5805 token: Token::Special(Special::Punctuation('·')),
5806 },
5807 PositionalToken {
5808 source: uws,
5809 offset: 247,
5810 length: 3,
5811 token: Token::Word(Word::Word("肯".to_string())),
5812 },
5813 PositionalToken {
5814 source: uws,
5815 offset: 250,
5816 length: 3,
5817 token: Token::Word(Word::Word("特".to_string())),
5818 },
5819 PositionalToken {
5820 source: uws,
5821 offset: 253,
5822 length: 3,
5823 token: Token::Word(Word::Word("一".to_string())),
5824 },
5825 PositionalToken {
5826 source: uws,
5827 offset: 256,
5828 length: 3,
5829 token: Token::Word(Word::Word("角".to_string())),
5830 },
5831 PositionalToken {
5832 source: uws,
5833 offset: 259,
5834 length: 3,
5835 token: Token::Special(Special::Punctuation(',')),
5836 },
5837 PositionalToken {
5838 source: uws,
5839 offset: 262,
5840 length: 3,
5841 token: Token::Word(Word::Word("他".to_string())),
5842 },
5843 PositionalToken {
5844 source: uws,
5845 offset: 265,
5846 length: 3,
5847 token: Token::Word(Word::Word("是".to_string())),
5848 },
5849 PositionalToken {
5850 source: uws,
5851 offset: 268,
5852 length: 3,
5853 token: Token::Word(Word::Word("位".to_string())),
5854 },
5855 PositionalToken {
5856 source: uws,
5857 offset: 271,
5858 length: 3,
5859 token: Token::Word(Word::Word("拥".to_string())),
5860 },
5861 PositionalToken {
5862 source: uws,
5863 offset: 274,
5864 length: 3,
5865 token: Token::Word(Word::Word("有".to_string())),
5866 },
5867 PositionalToken {
5868 source: uws,
5869 offset: 277,
5870 length: 3,
5871 token: Token::Word(Word::Word("超".to_string())),
5872 },
5873 ],
5874 Lang::Jpn => vec![
5875 PositionalToken {
5876 source: uws,
5877 offset: 0,
5878 length: 3,
5879 token: Token::Word(Word::Word("熊".to_string())),
5880 },
5881 PositionalToken {
5882 source: uws,
5883 offset: 3,
5884 length: 3,
5885 token: Token::Word(Word::Word("野".to_string())),
5886 },
5887 PositionalToken {
5888 source: uws,
5889 offset: 6,
5890 length: 3,
5891 token: Token::Word(Word::Word("三".to_string())),
5892 },
5893 PositionalToken {
5894 source: uws,
5895 offset: 9,
5896 length: 3,
5897 token: Token::Word(Word::Word("山".to_string())),
5898 },
5899 PositionalToken {
5900 source: uws,
5901 offset: 12,
5902 length: 3,
5903 token: Token::Word(Word::Word("本".to_string())),
5904 },
5905 PositionalToken {
5906 source: uws,
5907 offset: 15,
5908 length: 3,
5909 token: Token::Word(Word::Word("願".to_string())),
5910 },
5911 PositionalToken {
5912 source: uws,
5913 offset: 18,
5914 length: 3,
5915 token: Token::Word(Word::Word("所".to_string())),
5916 },
5917 PositionalToken {
5918 source: uws,
5919 offset: 21,
5920 length: 3,
5921 token: Token::Word(Word::Word("は".to_string())),
5922 },
5923 PositionalToken {
5924 source: uws,
5925 offset: 24,
5926 length: 3,
5927 token: Token::Special(Special::Punctuation('、')),
5928 },
5929 PositionalToken {
5930 source: uws,
5931 offset: 27,
5932 length: 2,
5933 token: Token::Word(Word::Number(Number::Integer(15))),
5934 },
5935 PositionalToken {
5936 source: uws,
5937 offset: 29,
5938 length: 3,
5939 token: Token::Word(Word::Word("世".to_string())),
5940 },
5941 PositionalToken {
5942 source: uws,
5943 offset: 32,
5944 length: 3,
5945 token: Token::Word(Word::Word("紀".to_string())),
5946 },
5947 PositionalToken {
5948 source: uws,
5949 offset: 35,
5950 length: 3,
5951 token: Token::Word(Word::Word("末".to_string())),
5952 },
5953 PositionalToken {
5954 source: uws,
5955 offset: 38,
5956 length: 3,
5957 token: Token::Word(Word::Word("以".to_string())),
5958 },
5959 PositionalToken {
5960 source: uws,
5961 offset: 41,
5962 length: 3,
5963 token: Token::Word(Word::Word("降".to_string())),
5964 },
5965 PositionalToken {
5966 source: uws,
5967 offset: 44,
5968 length: 3,
5969 token: Token::Word(Word::Word("に".to_string())),
5970 },
5971 PositionalToken {
5972 source: uws,
5973 offset: 47,
5974 length: 3,
5975 token: Token::Word(Word::Word("お".to_string())),
5976 },
5977 PositionalToken {
5978 source: uws,
5979 offset: 50,
5980 length: 3,
5981 token: Token::Word(Word::Word("け".to_string())),
5982 },
5983 PositionalToken {
5984 source: uws,
5985 offset: 53,
5986 length: 3,
5987 token: Token::Word(Word::Word("る".to_string())),
5988 },
5989 PositionalToken {
5990 source: uws,
5991 offset: 56,
5992 length: 3,
5993 token: Token::Word(Word::Word("熊".to_string())),
5994 },
5995 PositionalToken {
5996 source: uws,
5997 offset: 59,
5998 length: 3,
5999 token: Token::Word(Word::Word("野".to_string())),
6000 },
6001 PositionalToken {
6002 source: uws,
6003 offset: 62,
6004 length: 3,
6005 token: Token::Word(Word::Word("三".to_string())),
6006 },
6007 PositionalToken {
6008 source: uws,
6009 offset: 65,
6010 length: 3,
6011 token: Token::Word(Word::Word("山".to_string())),
6012 },
6013 PositionalToken {
6014 source: uws,
6015 offset: 68,
6016 length: 3,
6017 token: Token::Special(Special::Punctuation('(')),
6018 },
6019 PositionalToken {
6020 source: uws,
6021 offset: 71,
6022 length: 3,
6023 token: Token::Word(Word::Word("熊".to_string())),
6024 },
6025 PositionalToken {
6026 source: uws,
6027 offset: 74,
6028 length: 3,
6029 token: Token::Word(Word::Word("野".to_string())),
6030 },
6031 PositionalToken {
6032 source: uws,
6033 offset: 77,
6034 length: 3,
6035 token: Token::Word(Word::Word("本".to_string())),
6036 },
6037 PositionalToken {
6038 source: uws,
6039 offset: 80,
6040 length: 3,
6041 token: Token::Word(Word::Word("宮".to_string())),
6042 },
6043 PositionalToken {
6044 source: uws,
6045 offset: 83,
6046 length: 3,
6047 token: Token::Special(Special::Punctuation('、')),
6048 },
6049 PositionalToken {
6050 source: uws,
6051 offset: 86,
6052 length: 3,
6053 token: Token::Word(Word::Word("熊".to_string())),
6054 },
6055 PositionalToken {
6056 source: uws,
6057 offset: 89,
6058 length: 3,
6059 token: Token::Word(Word::Word("野".to_string())),
6060 },
6061 PositionalToken {
6062 source: uws,
6063 offset: 92,
6064 length: 3,
6065 token: Token::Word(Word::Word("新".to_string())),
6066 },
6067 PositionalToken {
6068 source: uws,
6069 offset: 95,
6070 length: 3,
6071 token: Token::Word(Word::Word("宮".to_string())),
6072 },
6073 PositionalToken {
6074 source: uws,
6075 offset: 98,
6076 length: 3,
6077 token: Token::Special(Special::Punctuation('、')),
6078 },
6079 PositionalToken {
6080 source: uws,
6081 offset: 101,
6082 length: 3,
6083 token: Token::Word(Word::Word("熊".to_string())),
6084 },
6085 PositionalToken {
6086 source: uws,
6087 offset: 104,
6088 length: 3,
6089 token: Token::Word(Word::Word("野".to_string())),
6090 },
6091 PositionalToken {
6092 source: uws,
6093 offset: 107,
6094 length: 3,
6095 token: Token::Word(Word::Word("那".to_string())),
6096 },
6097 PositionalToken {
6098 source: uws,
6099 offset: 110,
6100 length: 3,
6101 token: Token::Word(Word::Word("智".to_string())),
6102 },
6103 PositionalToken {
6104 source: uws,
6105 offset: 113,
6106 length: 3,
6107 token: Token::Special(Special::Punctuation(')')),
6108 },
6109 PositionalToken {
6110 source: uws,
6111 offset: 116,
6112 length: 3,
6113 token: Token::Word(Word::Word("の".to_string())),
6114 },
6115 PositionalToken {
6116 source: uws,
6117 offset: 119,
6118 length: 3,
6119 token: Token::Word(Word::Word("造".to_string())),
6120 },
6121 PositionalToken {
6122 source: uws,
6123 offset: 122,
6124 length: 3,
6125 token: Token::Word(Word::Word("営".to_string())),
6126 },
6127 PositionalToken {
6128 source: uws,
6129 offset: 125,
6130 length: 3,
6131 token: Token::Special(Special::Punctuation('・')),
6132 },
6133 PositionalToken {
6134 source: uws,
6135 offset: 128,
6136 length: 3,
6137 token: Token::Word(Word::Word("修".to_string())),
6138 },
6139 PositionalToken {
6140 source: uws,
6141 offset: 131,
6142 length: 3,
6143 token: Token::Word(Word::Word("造".to_string())),
6144 },
6145 PositionalToken {
6146 source: uws,
6147 offset: 134,
6148 length: 3,
6149 token: Token::Word(Word::Word("の".to_string())),
6150 },
6151 PositionalToken {
6152 source: uws,
6153 offset: 137,
6154 length: 3,
6155 token: Token::Word(Word::Word("た".to_string())),
6156 },
6157 PositionalToken {
6158 source: uws,
6159 offset: 140,
6160 length: 3,
6161 token: Token::Word(Word::Word("め".to_string())),
6162 },
6163 PositionalToken {
6164 source: uws,
6165 offset: 143,
6166 length: 3,
6167 token: Token::Word(Word::Word("の".to_string())),
6168 },
6169 PositionalToken {
6170 source: uws,
6171 offset: 146,
6172 length: 3,
6173 token: Token::Word(Word::Word("勧".to_string())),
6174 },
6175 PositionalToken {
6176 source: uws,
6177 offset: 149,
6178 length: 3,
6179 token: Token::Word(Word::Word("進".to_string())),
6180 },
6181 PositionalToken {
6182 source: uws,
6183 offset: 152,
6184 length: 3,
6185 token: Token::Word(Word::Word("を".to_string())),
6186 },
6187 PositionalToken {
6188 source: uws,
6189 offset: 155,
6190 length: 3,
6191 token: Token::Word(Word::Word("担".to_string())),
6192 },
6193 PositionalToken {
6194 source: uws,
6195 offset: 158,
6196 length: 3,
6197 token: Token::Word(Word::Word("っ".to_string())),
6198 },
6199 PositionalToken {
6200 source: uws,
6201 offset: 161,
6202 length: 3,
6203 token: Token::Word(Word::Word("た".to_string())),
6204 },
6205 PositionalToken {
6206 source: uws,
6207 offset: 164,
6208 length: 3,
6209 token: Token::Word(Word::Word("組".to_string())),
6210 },
6211 PositionalToken {
6212 source: uws,
6213 offset: 167,
6214 length: 3,
6215 token: Token::Word(Word::Word("織".to_string())),
6216 },
6217 PositionalToken {
6218 source: uws,
6219 offset: 170,
6220 length: 3,
6221 token: Token::Word(Word::Word("の".to_string())),
6222 },
6223 PositionalToken {
6224 source: uws,
6225 offset: 173,
6226 length: 3,
6227 token: Token::Word(Word::Word("総".to_string())),
6228 },
6229 PositionalToken {
6230 source: uws,
6231 offset: 176,
6232 length: 3,
6233 token: Token::Word(Word::Word("称".to_string())),
6234 },
6235 PositionalToken {
6236 source: uws,
6237 offset: 179,
6238 length: 3,
6239 token: Token::Special(Special::Punctuation('。')),
6240 },
6241 PositionalToken {
6242 source: uws,
6243 offset: 182,
6244 length: 1,
6245 token: Token::Special(Special::Separator(Separator::Space)),
6246 },
6247 PositionalToken {
6248 source: uws,
6249 offset: 183,
6250 length: 3,
6251 token: Token::Word(Word::Word("熊".to_string())),
6252 },
6253 PositionalToken {
6254 source: uws,
6255 offset: 186,
6256 length: 3,
6257 token: Token::Word(Word::Word("野".to_string())),
6258 },
6259 PositionalToken {
6260 source: uws,
6261 offset: 189,
6262 length: 3,
6263 token: Token::Word(Word::Word("三".to_string())),
6264 },
6265 PositionalToken {
6266 source: uws,
6267 offset: 192,
6268 length: 3,
6269 token: Token::Word(Word::Word("山".to_string())),
6270 },
6271 PositionalToken {
6272 source: uws,
6273 offset: 195,
6274 length: 3,
6275 token: Token::Word(Word::Word("を".to_string())),
6276 },
6277 PositionalToken {
6278 source: uws,
6279 offset: 198,
6280 length: 3,
6281 token: Token::Word(Word::Word("含".to_string())),
6282 },
6283 PositionalToken {
6284 source: uws,
6285 offset: 201,
6286 length: 3,
6287 token: Token::Word(Word::Word("め".to_string())),
6288 },
6289 PositionalToken {
6290 source: uws,
6291 offset: 204,
6292 length: 3,
6293 token: Token::Word(Word::Word("て".to_string())),
6294 },
6295 PositionalToken {
6296 source: uws,
6297 offset: 207,
6298 length: 3,
6299 token: Token::Special(Special::Punctuation('、')),
6300 },
6301 PositionalToken {
6302 source: uws,
6303 offset: 210,
6304 length: 3,
6305 token: Token::Word(Word::Word("日".to_string())),
6306 },
6307 PositionalToken {
6308 source: uws,
6309 offset: 213,
6310 length: 3,
6311 token: Token::Word(Word::Word("本".to_string())),
6312 },
6313 PositionalToken {
6314 source: uws,
6315 offset: 216,
6316 length: 3,
6317 token: Token::Word(Word::Word("に".to_string())),
6318 },
6319 PositionalToken {
6320 source: uws,
6321 offset: 219,
6322 length: 3,
6323 token: Token::Word(Word::Word("お".to_string())),
6324 },
6325 PositionalToken {
6326 source: uws,
6327 offset: 222,
6328 length: 3,
6329 token: Token::Word(Word::Word("け".to_string())),
6330 },
6331 PositionalToken {
6332 source: uws,
6333 offset: 225,
6334 length: 3,
6335 token: Token::Word(Word::Word("る".to_string())),
6336 },
6337 PositionalToken {
6338 source: uws,
6339 offset: 228,
6340 length: 3,
6341 token: Token::Word(Word::Word("古".to_string())),
6342 },
6343 PositionalToken {
6344 source: uws,
6345 offset: 231,
6346 length: 3,
6347 token: Token::Word(Word::Word("代".to_string())),
6348 },
6349 PositionalToken {
6350 source: uws,
6351 offset: 234,
6352 length: 3,
6353 token: Token::Word(Word::Word("か".to_string())),
6354 },
6355 PositionalToken {
6356 source: uws,
6357 offset: 237,
6358 length: 3,
6359 token: Token::Word(Word::Word("ら".to_string())),
6360 },
6361 PositionalToken {
6362 source: uws,
6363 offset: 240,
6364 length: 3,
6365 token: Token::Word(Word::Word("中".to_string())),
6366 },
6367 PositionalToken {
6368 source: uws,
6369 offset: 243,
6370 length: 3,
6371 token: Token::Word(Word::Word("世".to_string())),
6372 },
6373 PositionalToken {
6374 source: uws,
6375 offset: 246,
6376 length: 3,
6377 token: Token::Word(Word::Word("前".to_string())),
6378 },
6379 PositionalToken {
6380 source: uws,
6381 offset: 249,
6382 length: 3,
6383 token: Token::Word(Word::Word("半".to_string())),
6384 },
6385 PositionalToken {
6386 source: uws,
6387 offset: 252,
6388 length: 3,
6389 token: Token::Word(Word::Word("に".to_string())),
6390 },
6391 PositionalToken {
6392 source: uws,
6393 offset: 255,
6394 length: 3,
6395 token: Token::Word(Word::Word("か".to_string())),
6396 },
6397 PositionalToken {
6398 source: uws,
6399 offset: 258,
6400 length: 3,
6401 token: Token::Word(Word::Word("け".to_string())),
6402 },
6403 PositionalToken {
6404 source: uws,
6405 offset: 261,
6406 length: 3,
6407 token: Token::Word(Word::Word("て".to_string())),
6408 },
6409 PositionalToken {
6410 source: uws,
6411 offset: 264,
6412 length: 3,
6413 token: Token::Word(Word::Word("の".to_string())),
6414 },
6415 PositionalToken {
6416 source: uws,
6417 offset: 267,
6418 length: 3,
6419 token: Token::Word(Word::Word("寺".to_string())),
6420 },
6421 PositionalToken {
6422 source: uws,
6423 offset: 270,
6424 length: 3,
6425 token: Token::Word(Word::Word("社".to_string())),
6426 },
6427 PositionalToken {
6428 source: uws,
6429 offset: 273,
6430 length: 3,
6431 token: Token::Word(Word::Word("の".to_string())),
6432 },
6433 PositionalToken {
6434 source: uws,
6435 offset: 276,
6436 length: 3,
6437 token: Token::Word(Word::Word("造".to_string())),
6438 },
6439 PositionalToken {
6440 source: uws,
6441 offset: 279,
6442 length: 3,
6443 token: Token::Word(Word::Word("営".to_string())),
6444 },
6445 PositionalToken {
6446 source: uws,
6447 offset: 282,
6448 length: 3,
6449 token: Token::Word(Word::Word("は".to_string())),
6450 },
6451 PositionalToken {
6452 source: uws,
6453 offset: 285,
6454 length: 3,
6455 token: Token::Special(Special::Punctuation('、')),
6456 },
6457 PositionalToken {
6458 source: uws,
6459 offset: 288,
6460 length: 3,
6461 token: Token::Word(Word::Word("寺".to_string())),
6462 },
6463 PositionalToken {
6464 source: uws,
6465 offset: 291,
6466 length: 3,
6467 token: Token::Word(Word::Word("社".to_string())),
6468 },
6469 ],
6470 Lang::Kor => vec![
6471 PositionalToken {
6472 source: uws,
6473 offset: 0,
6474 length: 21,
6475 token: Token::Word(Word::Word("플레이스테이션".to_string())),
6476 },
6477 PositionalToken {
6478 source: uws,
6479 offset: 21,
6480 length: 1,
6481 token: Token::Special(Special::Separator(Separator::Space)),
6482 },
6483 PositionalToken {
6484 source: uws,
6485 offset: 22,
6486 length: 3,
6487 token: Token::Word(Word::Word("은".to_string())),
6488 },
6489 PositionalToken {
6490 source: uws,
6491 offset: 25,
6492 length: 1,
6493 token: Token::Special(Special::Separator(Separator::Space)),
6494 },
6495 PositionalToken {
6496 source: uws,
6497 offset: 26,
6498 length: 6,
6499 token: Token::Word(Word::Word("소니".to_string())),
6500 },
6501 PositionalToken {
6502 source: uws,
6503 offset: 32,
6504 length: 1,
6505 token: Token::Special(Special::Separator(Separator::Space)),
6506 },
6507 PositionalToken {
6508 source: uws,
6509 offset: 33,
6510 length: 9,
6511 token: Token::Word(Word::Word("컴퓨터".to_string())),
6512 },
6513 PositionalToken {
6514 source: uws,
6515 offset: 42,
6516 length: 1,
6517 token: Token::Special(Special::Separator(Separator::Space)),
6518 },
6519 PositionalToken {
6520 source: uws,
6521 offset: 43,
6522 length: 21,
6523 token: Token::Word(Word::Word("엔터테인먼트가".to_string())),
6524 },
6525 PositionalToken {
6526 source: uws,
6527 offset: 64,
6528 length: 1,
6529 token: Token::Special(Special::Separator(Separator::Space)),
6530 },
6531 PositionalToken {
6532 source: uws,
6533 offset: 65,
6534 length: 9,
6535 token: Token::Word(Word::Word("개발한".to_string())),
6536 },
6537 PositionalToken {
6538 source: uws,
6539 offset: 74,
6540 length: 1,
6541 token: Token::Special(Special::Separator(Separator::Space)),
6542 },
6543 PositionalToken {
6544 source: uws,
6545 offset: 75,
6546 length: 3,
6547 token: Token::Word(Word::Word("세".to_string())),
6548 },
6549 PositionalToken {
6550 source: uws,
6551 offset: 78,
6552 length: 1,
6553 token: Token::Special(Special::Separator(Separator::Space)),
6554 },
6555 PositionalToken {
6556 source: uws,
6557 offset: 79,
6558 length: 6,
6559 token: Token::Word(Word::Word("번째".to_string())),
6560 },
6561 PositionalToken {
6562 source: uws,
6563 offset: 85,
6564 length: 1,
6565 token: Token::Special(Special::Separator(Separator::Space)),
6566 },
6567 PositionalToken {
6568 source: uws,
6569 offset: 86,
6570 length: 9,
6571 token: Token::Word(Word::Word("가정용".to_string())),
6572 },
6573 PositionalToken {
6574 source: uws,
6575 offset: 95,
6576 length: 1,
6577 token: Token::Special(Special::Separator(Separator::Space)),
6578 },
6579 PositionalToken {
6580 source: uws,
6581 offset: 96,
6582 length: 15,
6583 token: Token::Word(Word::Word("게임기이다".to_string())),
6584 },
6585 PositionalToken {
6586 source: uws,
6587 offset: 111,
6588 length: 1,
6589 token: Token::Special(Special::Punctuation('.')),
6590 },
6591 PositionalToken {
6592 source: uws,
6593 offset: 112,
6594 length: 1,
6595 token: Token::Special(Special::Separator(Separator::Space)),
6596 },
6597 PositionalToken {
6598 source: uws,
6599 offset: 113,
6600 length: 24,
6601 token: Token::Word(Word::Word("마이크로소프트의".to_string())),
6602 },
6603 PositionalToken {
6604 source: uws,
6605 offset: 137,
6606 length: 1,
6607 token: Token::Special(Special::Separator(Separator::Space)),
6608 },
6609 PositionalToken {
6610 source: uws,
6611 offset: 138,
6612 length: 12,
6613 token: Token::Word(Word::Word("엑스박스".to_string())),
6614 },
6615 PositionalToken {
6616 source: uws,
6617 offset: 150,
6618 length: 1,
6619 token: Token::Special(Special::Separator(Separator::Space)),
6620 },
6621 PositionalToken {
6622 source: uws,
6623 offset: 151,
6624 length: 3,
6625 token: Token::Word(Word::Number(Number::Integer(360))),
6626 },
6627 PositionalToken {
6628 source: uws,
6629 offset: 154,
6630 length: 1,
6631 token: Token::Special(Special::Punctuation(',')),
6632 },
6633 PositionalToken {
6634 source: uws,
6635 offset: 155,
6636 length: 1,
6637 token: Token::Special(Special::Separator(Separator::Space)),
6638 },
6639 PositionalToken {
6640 source: uws,
6641 offset: 156,
6642 length: 12,
6643 token: Token::Word(Word::Word("닌텐도의".to_string())),
6644 },
6645 PositionalToken {
6646 source: uws,
6647 offset: 168,
6648 length: 1,
6649 token: Token::Special(Special::Separator(Separator::Space)),
6650 },
6651 PositionalToken {
6652 source: uws,
6653 offset: 169,
6654 length: 6,
6655 token: Token::Word(Word::Word("Wii와".to_string())),
6656 },
6657 PositionalToken {
6658 source: uws,
6659 offset: 175,
6660 length: 1,
6661 token: Token::Special(Special::Separator(Separator::Space)),
6662 },
6663 PositionalToken {
6664 source: uws,
6665 offset: 176,
6666 length: 12,
6667 token: Token::Word(Word::Word("경쟁하고".to_string())),
6668 },
6669 PositionalToken {
6670 source: uws,
6671 offset: 188,
6672 length: 1,
6673 token: Token::Special(Special::Separator(Separator::Space)),
6674 },
6675 PositionalToken {
6676 source: uws,
6677 offset: 189,
6678 length: 6,
6679 token: Token::Word(Word::Word("있다".to_string())),
6680 },
6681 PositionalToken {
6682 source: uws,
6683 offset: 195,
6684 length: 1,
6685 token: Token::Special(Special::Punctuation('.')),
6686 },
6687 PositionalToken {
6688 source: uws,
6689 offset: 196,
6690 length: 1,
6691 token: Token::Special(Special::Separator(Separator::Space)),
6692 },
6693 PositionalToken {
6694 source: uws,
6695 offset: 197,
6696 length: 6,
6697 token: Token::Word(Word::Word("이전".to_string())),
6698 },
6699 PositionalToken {
6700 source: uws,
6701 offset: 203,
6702 length: 1,
6703 token: Token::Special(Special::Separator(Separator::Space)),
6704 },
6705 PositionalToken {
6706 source: uws,
6707 offset: 204,
6708 length: 12,
6709 token: Token::Word(Word::Word("제품에서".to_string())),
6710 },
6711 PositionalToken {
6712 source: uws,
6713 offset: 216,
6714 length: 1,
6715 token: Token::Special(Special::Separator(Separator::Space)),
6716 },
6717 PositionalToken {
6718 source: uws,
6719 offset: 217,
6720 length: 9,
6721 token: Token::Word(Word::Word("온라인".to_string())),
6722 },
6723 PositionalToken {
6724 source: uws,
6725 offset: 226,
6726 length: 1,
6727 token: Token::Special(Special::Separator(Separator::Space)),
6728 },
6729 PositionalToken {
6730 source: uws,
6731 offset: 227,
6732 length: 9,
6733 token: Token::Word(Word::Word("플레이".to_string())),
6734 },
6735 PositionalToken {
6736 source: uws,
6737 offset: 236,
6738 length: 1,
6739 token: Token::Special(Special::Separator(Separator::Space)),
6740 },
6741 PositionalToken {
6742 source: uws,
6743 offset: 237,
6744 length: 3,
6745 token: Token::Word(Word::Word("기".to_string())),
6746 },
6747 ],
6748 Lang::Ara => vec![
6749 PositionalToken {
6750 source: uws,
6751 offset: 0,
6752 length: 14,
6753 token: Token::Word(Word::Word("لشکرکشی".to_string())),
6754 },
6755 PositionalToken {
6756 source: uws,
6757 offset: 14,
6758 length: 3,
6759 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6760 },
6761 PositionalToken {
6762 source: uws,
6763 offset: 17,
6764 length: 6,
6765 token: Token::Word(Word::Word("های".to_string())),
6766 },
6767 PositionalToken {
6768 source: uws,
6769 offset: 23,
6770 length: 1,
6771 token: Token::Special(Special::Separator(Separator::Space)),
6772 },
6773 PositionalToken {
6774 source: uws,
6775 offset: 24,
6776 length: 6,
6777 token: Token::Word(Word::Word("روس".to_string())),
6778 },
6779 PositionalToken {
6780 source: uws,
6781 offset: 30,
6782 length: 3,
6783 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6784 },
6785 PositionalToken {
6786 source: uws,
6787 offset: 33,
6788 length: 6,
6789 token: Token::Word(Word::Word("های".to_string())),
6790 },
6791 PositionalToken {
6792 source: uws,
6793 offset: 39,
6794 length: 1,
6795 token: Token::Special(Special::Separator(Separator::Space)),
6796 },
6797 PositionalToken {
6798 source: uws,
6799 offset: 40,
6800 length: 12,
6801 token: Token::Word(Word::Word("وارنگی".to_string())),
6802 },
6803 PositionalToken {
6804 source: uws,
6805 offset: 52,
6806 length: 1,
6807 token: Token::Special(Special::Separator(Separator::Space)),
6808 },
6809 PositionalToken {
6810 source: uws,
6811 offset: 53,
6812 length: 4,
6813 token: Token::Word(Word::Word("به".to_string())),
6814 },
6815 PositionalToken {
6816 source: uws,
6817 offset: 57,
6818 length: 1,
6819 token: Token::Special(Special::Separator(Separator::Space)),
6820 },
6821 PositionalToken {
6822 source: uws,
6823 offset: 58,
6824 length: 10,
6825 token: Token::Word(Word::Word("دریای".to_string())),
6826 },
6827 PositionalToken {
6828 source: uws,
6829 offset: 68,
6830 length: 1,
6831 token: Token::Special(Special::Separator(Separator::Space)),
6832 },
6833 PositionalToken {
6834 source: uws,
6835 offset: 69,
6836 length: 6,
6837 token: Token::Word(Word::Word("خزر".to_string())),
6838 },
6839 PositionalToken {
6840 source: uws,
6841 offset: 75,
6842 length: 1,
6843 token: Token::Special(Special::Separator(Separator::Space)),
6844 },
6845 PositionalToken {
6846 source: uws,
6847 offset: 76,
6848 length: 12,
6849 token: Token::Word(Word::Word("مجموعه".to_string())),
6850 },
6851 PositionalToken {
6852 source: uws,
6853 offset: 88,
6854 length: 3,
6855 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6856 },
6857 PositionalToken {
6858 source: uws,
6859 offset: 91,
6860 length: 4,
6861 token: Token::Word(Word::Word("ای".to_string())),
6862 },
6863 PositionalToken {
6864 source: uws,
6865 offset: 95,
6866 length: 1,
6867 token: Token::Special(Special::Separator(Separator::Space)),
6868 },
6869 PositionalToken {
6870 source: uws,
6871 offset: 96,
6872 length: 4,
6873 token: Token::Word(Word::Word("از".to_string())),
6874 },
6875 PositionalToken {
6876 source: uws,
6877 offset: 100,
6878 length: 1,
6879 token: Token::Special(Special::Separator(Separator::Space)),
6880 },
6881 PositionalToken {
6882 source: uws,
6883 offset: 101,
6884 length: 10,
6885 token: Token::Word(Word::Word("حملات".to_string())),
6886 },
6887 PositionalToken {
6888 source: uws,
6889 offset: 111,
6890 length: 1,
6891 token: Token::Special(Special::Separator(Separator::Space)),
6892 },
6893 PositionalToken {
6894 source: uws,
6895 offset: 112,
6896 length: 10,
6897 token: Token::Word(Word::Word("نظامی".to_string())),
6898 },
6899 PositionalToken {
6900 source: uws,
6901 offset: 122,
6902 length: 1,
6903 token: Token::Special(Special::Separator(Separator::Space)),
6904 },
6905 PositionalToken {
6906 source: uws,
6907 offset: 123,
6908 length: 4,
6909 token: Token::Word(Word::Word("در".to_string())),
6910 },
6911 PositionalToken {
6912 source: uws,
6913 offset: 127,
6914 length: 1,
6915 token: Token::Special(Special::Separator(Separator::Space)),
6916 },
6917 PositionalToken {
6918 source: uws,
6919 offset: 128,
6920 length: 6,
6921 token: Token::Word(Word::Word("بین".to_string())),
6922 },
6923 PositionalToken {
6924 source: uws,
6925 offset: 134,
6926 length: 1,
6927 token: Token::Special(Special::Separator(Separator::Space)),
6928 },
6929 PositionalToken {
6930 source: uws,
6931 offset: 135,
6932 length: 6,
6933 token: Token::Word(Word::Word("سال".to_string())),
6934 },
6935 PositionalToken {
6936 source: uws,
6937 offset: 141,
6938 length: 3,
6939 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6940 },
6941 PositionalToken {
6942 source: uws,
6943 offset: 144,
6944 length: 6,
6945 token: Token::Word(Word::Word("های".to_string())),
6946 },
6947 PositionalToken {
6948 source: uws,
6949 offset: 150,
6950 length: 1,
6951 token: Token::Special(Special::Separator(Separator::Space)),
6952 },
6953 PositionalToken {
6954 source: uws,
6955 offset: 151,
6956 length: 6,
6957 token: Token::Word(Word::StrangeWord("۸۶۴".to_string())),
6958 },
6959 PositionalToken {
6960 source: uws,
6961 offset: 157,
6962 length: 1,
6963 token: Token::Special(Special::Separator(Separator::Space)),
6964 },
6965 PositionalToken {
6966 source: uws,
6967 offset: 158,
6968 length: 4,
6969 token: Token::Word(Word::Word("تا".to_string())),
6970 },
6971 PositionalToken {
6972 source: uws,
6973 offset: 162,
6974 length: 1,
6975 token: Token::Special(Special::Separator(Separator::Space)),
6976 },
6977 PositionalToken {
6978 source: uws,
6979 offset: 163,
6980 length: 8,
6981 token: Token::Word(Word::StrangeWord("۱۰۴۱".to_string())),
6982 },
6983 PositionalToken {
6984 source: uws,
6985 offset: 171,
6986 length: 1,
6987 token: Token::Special(Special::Separator(Separator::Space)),
6988 },
6989 PositionalToken {
6990 source: uws,
6991 offset: 172,
6992 length: 12,
6993 token: Token::Word(Word::Word("میلادی".to_string())),
6994 },
6995 PositionalToken {
6996 source: uws,
6997 offset: 184,
6998 length: 1,
6999 token: Token::Special(Special::Separator(Separator::Space)),
7000 },
7001 PositionalToken {
7002 source: uws,
7003 offset: 185,
7004 length: 2,
7005 token: Token::Word(Word::Word("ب".to_string())),
7006 },
7007 ],
7008 Lang::Ell => vec![
7009 PositionalToken {
7010 source: uws,
7011 offset: 0,
7012 length: 4,
7013 token: Token::Word(Word::Word("Το".to_string())),
7014 },
7015 PositionalToken {
7016 source: uws,
7017 offset: 4,
7018 length: 1,
7019 token: Token::Special(Special::Separator(Separator::Space)),
7020 },
7021 PositionalToken {
7022 source: uws,
7023 offset: 5,
7024 length: 18,
7025 token: Token::Word(Word::Word("Πρόγραμμα".to_string())),
7026 },
7027 PositionalToken {
7028 source: uws,
7029 offset: 23,
7030 length: 1,
7031 token: Token::Special(Special::Separator(Separator::Space)),
7032 },
7033 PositionalToken {
7034 source: uws,
7035 offset: 24,
7036 length: 22,
7037 token: Token::Word(Word::Word("υλοποιείται".to_string())),
7038 },
7039 PositionalToken {
7040 source: uws,
7041 offset: 46,
7042 length: 1,
7043 token: Token::Special(Special::Separator(Separator::Space)),
7044 },
7045 PositionalToken {
7046 source: uws,
7047 offset: 47,
7048 length: 4,
7049 token: Token::Word(Word::Word("εξ".to_string())),
7050 },
7051 PositionalToken {
7052 source: uws,
7053 offset: 51,
7054 length: 1,
7055 token: Token::Special(Special::Separator(Separator::Space)),
7056 },
7057 PositionalToken {
7058 source: uws,
7059 offset: 52,
7060 length: 18,
7061 token: Token::Word(Word::Word("ολοκλήρου".to_string())),
7062 },
7063 PositionalToken {
7064 source: uws,
7065 offset: 70,
7066 length: 1,
7067 token: Token::Special(Special::Separator(Separator::Space)),
7068 },
7069 PositionalToken {
7070 source: uws,
7071 offset: 71,
7072 length: 6,
7073 token: Token::Word(Word::Word("από".to_string())),
7074 },
7075 PositionalToken {
7076 source: uws,
7077 offset: 77,
7078 length: 1,
7079 token: Token::Special(Special::Separator(Separator::Space)),
7080 },
7081 PositionalToken {
7082 source: uws,
7083 offset: 78,
7084 length: 16,
7085 token: Token::Word(Word::Word("απόσταση".to_string())),
7086 },
7087 PositionalToken {
7088 source: uws,
7089 offset: 94,
7090 length: 1,
7091 token: Token::Special(Special::Separator(Separator::Space)),
7092 },
7093 PositionalToken {
7094 source: uws,
7095 offset: 95,
7096 length: 6,
7097 token: Token::Word(Word::Word("και".to_string())),
7098 },
7099 PositionalToken {
7100 source: uws,
7101 offset: 101,
7102 length: 1,
7103 token: Token::Special(Special::Separator(Separator::Space)),
7104 },
7105 PositionalToken {
7106 source: uws,
7107 offset: 102,
7108 length: 12,
7109 token: Token::Word(Word::Word("μπορεί".to_string())),
7110 },
7111 PositionalToken {
7112 source: uws,
7113 offset: 114,
7114 length: 1,
7115 token: Token::Special(Special::Separator(Separator::Space)),
7116 },
7117 PositionalToken {
7118 source: uws,
7119 offset: 115,
7120 length: 4,
7121 token: Token::Word(Word::Word("να".to_string())),
7122 },
7123 PositionalToken {
7124 source: uws,
7125 offset: 119,
7126 length: 1,
7127 token: Token::Special(Special::Separator(Separator::Space)),
7128 },
7129 PositionalToken {
7130 source: uws,
7131 offset: 120,
7132 length: 20,
7133 token: Token::Word(Word::Word("συμμετέχει".to_string())),
7134 },
7135 PositionalToken {
7136 source: uws,
7137 offset: 140,
7138 length: 1,
7139 token: Token::Special(Special::Separator(Separator::Space)),
7140 },
7141 PositionalToken {
7142 source: uws,
7143 offset: 141,
7144 length: 8,
7145 token: Token::Word(Word::Word("κάθε".to_string())),
7146 },
7147 PositionalToken {
7148 source: uws,
7149 offset: 149,
7150 length: 1,
7151 token: Token::Special(Special::Separator(Separator::Space)),
7152 },
7153 PositionalToken {
7154 source: uws,
7155 offset: 150,
7156 length: 24,
7157 token: Token::Word(Word::Word("εμπλεκόμενος".to_string())),
7158 },
7159 PositionalToken {
7160 source: uws,
7161 offset: 174,
7162 length: 1,
7163 token: Token::Special(Special::Separator(Separator::Space)),
7164 },
7165 PositionalToken {
7166 source: uws,
7167 offset: 175,
7168 length: 6,
7169 token: Token::Word(Word::Word("στη".to_string())),
7170 },
7171 PositionalToken {
7172 source: uws,
7173 offset: 181,
7174 length: 1,
7175 token: Token::Special(Special::Separator(Separator::Space)),
7176 },
7177 PositionalToken {
7178 source: uws,
7179 offset: 182,
7180 length: 2,
7181 token: Token::Word(Word::Word("ή".to_string())),
7182 },
7183 PositionalToken {
7184 source: uws,
7185 offset: 184,
7186 length: 1,
7187 token: Token::Special(Special::Punctuation('/')),
7188 },
7189 ],
7190 };
7191 (
7192 uws.chars()
7193 .take(100)
7194 .fold(String::new(), |acc, c| acc + &format!("{}", c)),
7195 tokens,
7196 )
7197 }
7198}