1use std::sync::Arc;
2use text_parsing::{Breaker, IntoSource, Local, Localize, Snip, Source, SourceEvent};
3
4mod emoji;
5pub use emoji::EMOJIMAP;
6
7mod breakers;
8pub use breakers::{SentenceBreaker, UnicodeSentenceBreaker};
9
10mod wordbreaker;
11
12mod options;
13pub use options::{IntoTokenizer, TokenizerOptions, TokenizerParams};
14
15mod tokens;
16pub use tokens::Tokens;
17
18mod text_tokens;
19use text_tokens::InnerBound;
20pub use text_tokens::TextTokens;
21
/// Crate-level error type.
#[derive(Debug)]
pub enum Error {
    /// An error bubbled up from the underlying `text_parsing` source.
    TextParser(text_parsing::Error),
}

// Tolerance used by `Number`'s `Ord` impl: `f64` values whose difference is
// smaller than this are considered equal.
const EPS: f64 = 1e-8;
28
// Numeric token payload. With the "strings" feature, `ZeroInteger` also keeps
// the original spelling `s` alongside the parsed value (presumably to retain
// leading zeros — TODO confirm); without it, only the value is kept and the
// type is `Copy`.
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd)]
pub enum Number {
    Integer(i64),
    Float(f64),
    /// Parsed value `i` plus the original textual form `s`.
    ZeroInteger { i: i64, s: String },
}

#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
pub enum Number {
    Integer(i64),
    Float(f64),
    /// Parsed value only; the original spelling is dropped in this build.
    ZeroInteger { i: i64 },
}
45
46impl Number {
47 pub fn as_f64(&self) -> f64 {
48 match self {
49 Number::Integer(i) => *i as f64,
50 Number::Float(f) => *f,
51 Number::ZeroInteger { i, .. } => *i as f64,
52 }
53 }
54}
55impl Ord for Number {
56 fn cmp(&self, other: &Number) -> std::cmp::Ordering {
57 let s = self.as_f64();
58 let o = other.as_f64();
59 let d = s - o;
60 match d.abs() < EPS {
61 true => std::cmp::Ordering::Equal,
62 false => {
63 if d > 0.0 {
64 return std::cmp::Ordering::Greater;
65 }
66 if d < 0.0 {
67 return std::cmp::Ordering::Less;
68 }
69 std::cmp::Ordering::Equal
70 }
71 }
72 }
73}
74impl Eq for Number {}
75
/// Whitespace-like separator tokens.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
pub enum Separator {
    Space,
    Tab,
    Newline,
    /// Any other separator character.
    Char(char),
}

/// Unicode formatting characters encountered in the input.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
pub enum Formatter {
    Char(char),
    Joiner,
}

/// Non-word single-character tokens.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Ord, PartialOrd)]
pub enum Special {
    Currency(char),
    Punctuation(char),
    Symbol(char),
    Separator(Separator),
}
97
// Word-like tokens — "strings" build: each variant carries the matched text.
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Word {
    Word(String),
    /// A word that is not a plain word (mixed content — see e.g. the
    /// `mixed_but_word` test, where "L’Oreal" is classified here).
    StrangeWord(String),
    Numerical(Numerical),
    Number(Number),
    /// Emoji, as a static name (presumably resolved via `EMOJIMAP` — confirm).
    Emoji(&'static str),
}

/// Number-containing word shapes ("strings" build keeps the matched text).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub enum Numerical {
    /// e.g. version-like strings ("1.2.3").
    DotSeparated(String),
    /// Number followed by a unit-like tail (e.g. "4pda" in the tests).
    Measures(String),
    /// Mixed letters and digits (e.g. "hashtag2").
    Alphanumeric(String),
}

/// Structured social-media tokens ("strings" build keeps the body text
/// without the leading `#`/`@`).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Struct {
    Hashtag(String),
    Mention(String),
}

/// Other Unicode content.
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Unicode {
    String(String),
    Formatter(Formatter),
}
133
// Payload-free counterparts of the "strings" enums: same variants, but no
// owned text, which makes all of them `Copy`.
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Word {
    Word,
    StrangeWord,
    Numerical(Numerical),
    Number(Number),
    /// Emoji name is static, so it is kept even in the `Copy` build.
    Emoji(&'static str),
}

#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Numerical {
    DotSeparated,
    Measures,
    Alphanumeric,
}

#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Struct {
    Hashtag,
    Mention,
}

#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Unicode {
    String,
    Formatter(Formatter),
}
169
// Top-level token produced by the tokenizer. (`Token2` is the superset that
// additionally carries `Bound` entries.) The two cfg variants differ only in
// whether the payload types allow `Copy`.
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),
}

#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),
}
187
188#[derive(Debug)]
202pub struct TextStr<'s> {
203 buffer: &'s str,
204 localities: Arc<Vec<TextLocality>>,
205 breakers: Arc<Vec<InnerBound>>,
206}
207impl<'s> TextStr<'s> {
208 pub fn new<'a>(s: &'a str) -> Result<TextStr<'a>, Error> {
209 let text = inner_new(s.into_source(), false)?;
210 Ok(TextStr {
211 buffer: s,
212 localities: text.localities,
213 breakers: text.breakers,
214 })
215 }
216}
217
/// Shared constructor behind `Text::new`, `TextStr::new` and the `TryFrom`
/// impls: drains `source`, recording one `TextLocality` per produced char
/// and an `InnerBound` per word/sentence-level breaker.
///
/// `with_buffer` controls whether the normalized characters are accumulated
/// into the returned `Text`'s buffer; callers that keep the original text
/// themselves pass `false` and get an empty buffer (byte offsets stay
/// correct either way via `buffer_len`).
fn inner_new<S: Source>(mut source: S, with_buffer: bool) -> Result<Text, Error> {
    let mut buffer = String::new();
    let mut localities = Vec::new();
    let mut breakers = Vec::new();
    // Byte length the buffer WOULD have; tracked separately so offsets are
    // valid even when `with_buffer` is false.
    let mut buffer_len = 0;

    while let Some(local_se) = source.next_char().map_err(Error::TextParser)? {
        let (local, se) = local_se.into_inner();
        let c = match se {
            SourceEvent::Char(c) => match c {
                // Normalize backtick (U+0060) to apostrophe (U+0027);
                // exercised by the `apostrophe` test.
                '\u{0060}' => '\u{0027}',
                _ => c,
            },
            SourceEvent::Breaker(b) => {
                // Map each breaker to a placeholder char; word- and
                // sentence-level breakers additionally record an InnerBound.
                let (c, opt_b) = match b {
                    Breaker::None => continue,
                    Breaker::Space => (' ', None),
                    Breaker::Line => ('\n', None),
                    // U+200B ZERO WIDTH SPACE stands in for a word breaker.
                    Breaker::Word => ('\u{200B}', Some(b)),
                    Breaker::Sentence | Breaker::Paragraph | Breaker::Section => ('\n', Some(b)),
                };
                if let Some(b) = opt_b {
                    // Bound located at the placeholder's position in the
                    // normalized buffer; `original` keeps the source span.
                    let br = InnerBound {
                        bytes: Snip {
                            offset: buffer_len,
                            length: c.len_utf8(),
                        },
                        chars: Snip {
                            offset: localities.len(),
                            length: 1,
                        },
                        breaker: b,
                        original: Some(local),
                    };
                    breakers.push(br);
                }
                c
            }
        };

        // Locality of `c` in the normalized buffer: char index + byte span.
        let buf_local = ().localize(
            Snip {
                offset: localities.len(),
                length: 1,
            },
            Snip {
                offset: buffer_len,
                length: c.len_utf8(),
            },
        );
        if with_buffer {
            buffer.push(c);
        }
        buffer_len += c.len_utf8();
        localities.push(TextLocality {
            buffer: buf_local,
            original: local,
        });
    }
    Ok(Text {
        buffer: Arc::new(buffer),
        localities: Arc::new(localities),
        breakers: Arc::new(breakers),
    })
}
286
/// Per-character mapping between the normalized buffer and the original input.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub struct TextLocality {
    /// Location (char index + byte span) in the normalized buffer.
    pub buffer: Local<()>,
    /// Location of the same character in the original source.
    pub original: Local<()>,
}

/// Owned, normalized text plus the metadata needed to map tokens back to the
/// original input. Fields are `Arc`-shared so `shared_text()` is cheap.
#[derive(Debug)]
pub struct Text {
    buffer: Arc<String>,
    localities: Arc<Vec<TextLocality>>,
    breakers: Arc<Vec<InnerBound>>,
}
299impl Text {
300 pub fn new<S: Source>(source: S) -> Result<Text, Error> {
301 inner_new(source, true)
302 }
303 pub fn token_text<'s>(&'s self, token: &TextToken) -> &'s str {
304 let Snip {
305 offset: begin,
306 length: len,
307 } = token.locality.bytes();
308 let end = begin + len;
309 &self.buffer[begin..end]
310 }
311 pub fn text(&self) -> &str {
312 self.buffer.as_ref()
313 }
314 pub fn original_locality(&self, idx: usize) -> Option<Local<()>> {
315 self.localities.get(idx).map(|tl| tl.original)
316 }
317 pub fn localities(&self) -> &Vec<TextLocality> {
318 self.localities.as_ref()
319 }
320 pub fn shared_text(&self) -> Text {
321 Text {
322 buffer: self.buffer.clone(),
323 localities: self.localities.clone(),
324 breakers: self.breakers.clone(),
325 }
326 }
327}
328
impl TryFrom<String> for Text {
    type Error = Error;

    /// Builds a `Text` from an owned `String`, reusing `s` itself as the
    /// buffer instead of materializing the normalized copy.
    ///
    /// NOTE(review): unlike `Text::new` / `TryFrom<&str>`, the stored buffer
    /// here is the ORIGINAL text, so characters normalized during parsing
    /// (e.g. backtick -> apostrophe in `inner_new`) are not reflected in
    /// `text()` / `token_text()`. Confirm this asymmetry is intentional.
    fn try_from(s: String) -> Result<Text, Error> {
        let mut text = inner_new((&s).into_source(), false)?;
        text.buffer = Arc::new(s);
        Ok(text)
    }
}
338
339impl TryFrom<&str> for Text {
340 type Error = Error;
341
342 fn try_from(s: &str) -> Result<Text, Error> {
343 Text::new(s.into_source())
344 }
345}
346
/// Sentence-level boundary kinds surfaced to token consumers
/// (see `Token2::Bound` and `TextToken::try_as_token`).
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Bound {
    Sentence,
    Paragraph,
    Section,
}
353
// A token plus where it sits: `locality` addresses the normalized buffer,
// `original` (when known) addresses the source text. The two cfg variants
// differ only in `Copy`-ability of the payload.
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub struct TextToken {
    locality: Local<()>,
    original: Option<Local<()>>,
    pub token: Token2,
}

#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub struct TextToken {
    locality: Local<()>,
    original: Option<Local<()>>,
    pub token: Token2,
}
369
#[cfg(test)]
impl TextToken {
    /// Test helper: the plain token tagged with its original locality, or
    /// `None` if the locality is missing or the entry is a `Bound`.
    fn into_original_token_1(self) -> Option<Local<Token>> {
        let original = self.original?;
        self.token.into_token().map(|t| original.local(t))
    }
}
379
380impl TextToken {
381 pub fn local(&self) -> Local<()> {
382 self.locality
383 }
384 pub fn original(&self) -> Option<Local<()>> {
385 self.original
386 }
387 pub fn into_position(mut self) -> TextToken {
388 self.locality = self.locality.into_position();
389 self.original = self.original.map(|or| or.into_position());
390 self
391 }
392 pub fn try_as_token(&self) -> Result<Token, Bound> {
393 self.token.try_as_token()
394 }
395 pub fn as_original_token(&self) -> Option<Local<&Token2>> {
396 self.original.map(|original| original.local(&self.token))
397 }
398 pub fn into_original_token(self) -> Option<Local<Token2>> {
399 self.original.map(|original| original.local(self.token))
400 }
401 pub fn original_str<'s>(&self, original: &'s str) -> Result<&'s str, OriginalError> {
402 match self.original {
403 Some(local) => {
404 let Snip {
405 offset: begin,
406 length: len,
407 } = local.bytes();
408 let end = begin + len;
409 match original.get(begin..end) {
410 Some(s) => Ok(s),
411 None => Err(OriginalError::InvalidSnip),
412 }
413 }
414 None => Err(OriginalError::NoOriginal),
415 }
416 }
417
418 pub fn test_token(lt: Local<Token2>) -> TextToken {
419 let (local, token) = lt.into_inner();
420 TextToken {
421 locality: local,
422 original: Some(local.local(())),
423 token,
424 }
425 }
426 pub fn test_new(token: Token2, local: Local<()>, original: Option<Local<()>>) -> TextToken {
427 TextToken {
428 locality: local,
429 original,
430 token,
431 }
432 }
433}
434
/// Failure modes of [`TextToken::original_str`].
#[derive(Debug)]
pub enum OriginalError {
    /// The token carries no original locality.
    NoOriginal,
    /// The recorded byte range does not address the given string validly.
    InvalidSnip,
}
466
// Superset of `Token` that can also carry sentence/paragraph `Bound` entries
// in the token stream. Same cfg split as `Token` (Copy vs owned payloads).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token2 {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),

    Bound(Bound),
}
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token2 {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),

    Bound(Bound),
}
494impl From<Token> for Token2 {
495 fn from(t: Token) -> Token2 {
496 match t {
497 Token::Word(w) => Token2::Word(w),
498 Token::Struct(s) => Token2::Struct(s),
499 Token::Special(s) => Token2::Special(s),
500 Token::Unicode(u) => Token2::Unicode(u),
501 }
502 }
503}
impl Token2 {
    /// Non-consuming form of `try_into_token`. Without "strings" the enum is
    /// `Copy`, so it is copied; with "strings" it must be cloned.
    #[cfg(not(feature = "strings"))]
    fn try_as_token(&self) -> Result<Token, Bound> {
        (*self).try_into_token()
    }

    #[cfg(feature = "strings")]
    fn try_as_token(&self) -> Result<Token, Bound> {
        self.clone().try_into_token()
    }

    /// Splits the union back apart: a real `Token` on `Ok`, the boundary
    /// kind on `Err`.
    fn try_into_token(self) -> Result<Token, Bound> {
        match self {
            Token2::Word(w) => Ok(Token::Word(w)),
            Token2::Struct(s) => Ok(Token::Struct(s)),
            Token2::Special(s) => Ok(Token::Special(s)),
            Token2::Unicode(u) => Ok(Token::Unicode(u)),
            Token2::Bound(b) => Err(b),
        }
    }
}
#[cfg(test)]
impl Token2 {
    /// Test-only convenience: drop `Bound` entries, keep real tokens.
    fn into_token(self) -> Option<Token> {
        // Same mapping as `try_into_token`, with the `Bound` arm discarded.
        self.try_into_token().ok()
    }
}
537
#[cfg(test)]
#[cfg(not(feature = "strings"))]
mod test {
    use super::*;
    use text_parsing::{
        IntoPipeParser, IntoSource, Localize, ParserExt, SourceExt, entities, tagger,
    };

    // Pairwise comparison of expected tokens against tokenizer output.
    fn check_results(result: &Vec<Local<Token>>, lib_res: &Vec<Local<Token>>, _uws: &str) {
        assert_eq!(result.len(), lib_res.len());
        for i in 0..result.len() {
            let res: Local<Token> = result[i].clone().into();
            assert_eq!(res, lib_res[i]);
        }
    }

    // NOTE(review): debugging scaffold, not a test — it has no `#[test]`
    // attribute and ends in an unconditional `panic!()`, so it would always
    // fail if it ever ran. Consider deleting or finishing it.
    fn symbols() {
        let uws = "Сибирь Арене 17 30 от 2560₽ 😀";
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        for t in lib_res {
            println!("{:?}", t);
        }
        panic!()
    }
}
569
#[cfg(test)]
mod test_v0_5 {
    use super::*;
    use text_parsing::{IntoPipeParser, IntoSource, ParserExt, SourceExt, entities, tagger};

    // NOTE(review): debugging scaffold, not a test — no `#[test]` attribute
    // and it ends in an unconditional `panic!()`. It exercises the full
    // HTML-tag + entity pipeline and prints every token with its original
    // char/byte spans.
    fn basic() {
        let uws = "<p>Oxana Putan shared the quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc.</p><p> qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово</p>";
        let text = Text::new({
            uws.into_source()
                .pipe(tagger::Builder::new().create().into_breaker())
                .pipe(entities::Builder::new().create().into_piped())
                .into_separator()
        })
        .unwrap();
        let lib_res = text
            .into_tokenizer({
                TokenizerParams::default()
                    .add_option(TokenizerOptions::SplitDot)
                    .add_option(TokenizerOptions::SplitUnderscore)
                    .add_option(TokenizerOptions::SplitColon)
                    .with_default_sentences()
            })
            .collect::<Vec<_>>();

        for tok in lib_res {
            println!(
                "C{:?}, B{:?}, {:?} -> {:?}",
                tok.original.map(|loc| loc.chars()),
                tok.original.map(|loc| loc.bytes()),
                tok.token,
                tok.original_str(uws)
            );
        }

        panic!()
    }
}
622
623#[cfg(test)]
624#[cfg(feature = "strings")]
625mod test {
626 use super::*;
627 use text_parsing::{
628 IntoPipeParser, IntoSource, Localize, ParserExt, SourceExt, entities, tagger,
629 };
630
    /// Expected token with explicit char- AND byte-based spans; used where
    /// the two diverge (multi-byte input).
    #[derive(Debug, Clone)]
    struct CharToken {
        byte_offset: usize,
        byte_length: usize,
        char_offset: usize,
        char_length: usize,
        token: Token,
    }
699 impl Into<Local<Token>> for CharToken {
700 fn into(self) -> Local<Token> {
701 self.token.localize(
702 Snip {
703 offset: self.char_offset,
704 length: self.char_length,
705 },
706 Snip {
707 offset: self.byte_offset,
708 length: self.byte_length,
709 },
710 )
711 }
712 }
713
    /// Expected token addressed by byte offset/length into `source`; the char
    /// span is derived on conversion to `Local<Token>`.
    #[derive(Debug, Clone)]
    struct PositionalToken {
        source: &'static str,
        offset: usize,
        length: usize,
        token: Token,
    }
721 impl Into<Local<Token>> for PositionalToken {
722 fn into(self) -> Local<Token> {
723 self.token.localize(
724 Snip {
725 offset: self.source[..self.offset].chars().count(),
726 length: self.source[self.offset..self.offset + self.length]
727 .chars()
728 .count(),
729 },
730 Snip {
731 offset: self.offset,
732 length: self.length,
733 },
734 )
735 }
736 }
737
738 fn check_results(result: &Vec<PositionalToken>, lib_res: &Vec<Local<Token>>, _uws: &str) {
739 assert_eq!(result.len(), lib_res.len());
740 for i in 0..result.len() {
741 let res: Local<Token> = result[i].clone().into();
742 assert_eq!(res, lib_res[i]);
743 }
744 }
745
746 fn check_cresults(result: &Vec<CharToken>, lib_res: &Vec<Local<Token>>, _uws: &str) {
747 assert_eq!(result.len(), lib_res.len());
748 for i in 0..result.len() {
749 let res: Local<Token> = result[i].clone().into();
750 assert_eq!(res, lib_res[i]);
751 }
752 }
753
754 fn check<T: Clone + std::fmt::Debug + Into<Local<Token>>>(
755 res: &Vec<T>,
756 lib: &Vec<Local<Token>>,
757 _uws: &str,
758 ) {
759 let mut lib = lib.iter();
760 let mut res = res.iter().map(|r| {
761 let res: Local<Token> = r.clone().into();
762 res
763 });
764 let mut diff = Vec::new();
765 loop {
766 match (lib.next(), res.next()) {
767 (Some(lw), Some(rw)) => {
768 if *lw != rw {
769 diff.push(format!("LIB: {:?}", lw));
770 diff.push(format!("TEST: {:?}", rw));
771 diff.push("".to_string())
772 }
773 }
774 (Some(lw), None) => {
775 diff.push(format!("LIB: {:?}", lw));
776 diff.push("TEST: ----".to_string());
777 diff.push("".to_string())
778 }
779 (None, Some(rw)) => {
780 diff.push("LIB: ----".to_string());
781 diff.push(format!("TEST: {:?}", rw));
782 diff.push("".to_string())
783 }
784 (None, None) => break,
785 }
786 }
787 if diff.len() > 0 {
788 for ln in &diff {
789 println!("{}", ln);
790 }
791 panic!("Diff count: {}", diff.len() / 3);
792 }
793 }
794
795 #[test]
796 fn spaces() {
797 let uws = " spaces too many apces ";
798 let result = vec![
799 PositionalToken {
800 source: uws,
801 offset: 0,
802 length: 4,
803 token: Token::Special(Special::Separator(Separator::Space)),
804 },
805 PositionalToken {
806 source: uws,
807 offset: 4,
808 length: 6,
809 token: Token::Word(Word::Word("spaces".to_string())),
810 },
811 PositionalToken {
812 source: uws,
813 offset: 10,
814 length: 4,
815 token: Token::Special(Special::Separator(Separator::Space)),
816 },
817 PositionalToken {
818 source: uws,
819 offset: 14,
820 length: 3,
821 token: Token::Word(Word::Word("too".to_string())),
822 },
823 PositionalToken {
824 source: uws,
825 offset: 17,
826 length: 3,
827 token: Token::Special(Special::Separator(Separator::Space)),
828 },
829 PositionalToken {
830 source: uws,
831 offset: 20,
832 length: 4,
833 token: Token::Word(Word::Word("many".to_string())),
834 },
835 PositionalToken {
836 source: uws,
837 offset: 24,
838 length: 3,
839 token: Token::Special(Special::Separator(Separator::Space)),
840 },
841 PositionalToken {
842 source: uws,
843 offset: 27,
844 length: 5,
845 token: Token::Word(Word::Word("apces".to_string())),
846 },
847 PositionalToken {
848 source: uws,
849 offset: 32,
850 length: 3,
851 token: Token::Special(Special::Separator(Separator::Space)),
852 },
853 ];
854 let lib_res = uws
855 .into_tokenizer(TokenizerParams::v1())
856 .collect::<Vec<_>>();
857 check_results(&result, &lib_res, uws);
858 }
860
861 #[test]
862 fn numbers() {
863 let uws = "(() -2\n() -2";
864 let result = vec![
865 PositionalToken {
866 source: uws,
867 offset: 0,
868 length: 1,
869 token: Token::Special(Special::Punctuation('(')),
870 },
871 PositionalToken {
872 source: uws,
873 offset: 1,
874 length: 1,
875 token: Token::Special(Special::Punctuation('(')),
876 },
877 PositionalToken {
878 source: uws,
879 offset: 2,
880 length: 1,
881 token: Token::Special(Special::Punctuation(')')),
882 },
883 PositionalToken {
884 source: uws,
885 offset: 3,
886 length: 1,
887 token: Token::Special(Special::Separator(Separator::Space)),
888 },
889 PositionalToken {
890 source: uws,
891 offset: 4,
892 length: 2,
893 token: Token::Word(Word::Number(Number::Integer(-2))),
894 },
895 PositionalToken {
896 source: uws,
897 offset: 6,
898 length: 1,
899 token: Token::Special(Special::Separator(Separator::Newline)),
900 },
901 PositionalToken {
902 source: uws,
903 offset: 7,
904 length: 1,
905 token: Token::Special(Special::Punctuation('(')),
906 },
907 PositionalToken {
908 source: uws,
909 offset: 8,
910 length: 1,
911 token: Token::Special(Special::Punctuation(')')),
912 },
913 PositionalToken {
914 source: uws,
915 offset: 9,
916 length: 2,
917 token: Token::Special(Special::Separator(Separator::Space)),
918 },
919 PositionalToken {
920 source: uws,
921 offset: 11,
922 length: 2,
923 token: Token::Word(Word::Number(Number::Integer(-2))),
924 },
925 ];
926 let lib_res = uws
927 .into_tokenizer({
928 TokenizerParams::default()
929 .add_option(TokenizerOptions::SplitDot)
930 .add_option(TokenizerOptions::SplitUnderscore)
931 .add_option(TokenizerOptions::SplitColon)
932 .add_option(TokenizerOptions::MergeWhites)
933 })
934 .collect::<Vec<_>>();
935 check_results(&result, &lib_res, uws);
936 }
937
938 #[test]
939 fn word_with_inner_hyphens() {
940 let uws = "Опросы показывают";
941 let result = vec![
942 PositionalToken {
943 source: uws,
944 offset: 0,
945 length: 14,
946 token: Token::Word(Word::StrangeWord("Опросы".to_string())),
947 },
948 PositionalToken {
949 source: uws,
950 offset: 14,
951 length: 1,
952 token: Token::Special(Special::Separator(Separator::Space)),
953 },
954 PositionalToken {
955 source: uws,
956 offset: 15,
957 length: 28,
958 token: Token::Word(Word::StrangeWord("показывают".to_string())),
959 },
960 ];
961 let lib_res = uws
962 .into_tokenizer(TokenizerParams::v1())
963 .collect::<Vec<_>>();
964 check_results(&result, &lib_res, uws);
965 }
966
    // A word containing a curly apostrophe (U+2019, 3 bytes — hence length 9)
    // is classified as StrangeWord rather than split apart.
    #[test]
    fn mixed_but_word() {
        let uws = "L’Oreal";
        let result = vec![PositionalToken {
            source: uws,
            offset: 0,
            length: 9,
            token: Token::Word(Word::StrangeWord("L’Oreal".to_string())),
        }];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
981
    // Without the StructTokens option, '#' stays a separate punctuation token
    // and the tail is a plain word / alphanumeric (contrast with `hashtags2`).
    #[test]
    fn hashtags() {
        let uws = "#hashtag#hashtag2";
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 1,
                token: Token::Special(Special::Punctuation('#')),
            },
            PositionalToken {
                source: uws,
                offset: 1,
                length: 7,
                token: Token::Word(Word::Word("hashtag".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 8,
                length: 1,
                token: Token::Special(Special::Punctuation('#')),
            },
            // Trailing digit makes this Alphanumeric rather than Word.
            PositionalToken {
                source: uws,
                offset: 9,
                length: 8,
                token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
                    "hashtag2".to_string(),
                ))),
            },
        ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
1018
    // With StructTokens, '#' merges with the following word into a Hashtag
    // token whose span includes the '#' but whose payload does not.
    #[test]
    fn hashtags2() {
        let uws = "#hashtag#hashtag2 #hash_tag";
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 8,
                token: Token::Struct(Struct::Hashtag("hashtag".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 8,
                length: 9,
                token: Token::Struct(Struct::Hashtag("hashtag2".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 17,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 18,
                length: 9,
                token: Token::Struct(Struct::Hashtag("hash_tag".to_string())),
            },
        ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1().add_option(TokenizerOptions::StructTokens))
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
1053
    // Mirror of `hashtags2` for '@' mentions under StructTokens.
    #[test]
    fn mention2() {
        let uws = "@hashtag@hashtag2 @hash_tag";
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 8,
                token: Token::Struct(Struct::Mention("hashtag".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 8,
                length: 9,
                token: Token::Struct(Struct::Mention("hashtag2".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 17,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 18,
                length: 9,
                token: Token::Struct(Struct::Mention("hash_tag".to_string())),
            },
        ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1().add_option(TokenizerOptions::StructTokens))
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
1088
    // Backtick (U+0060) is normalized to an apostrophe by `inner_new`, so
    // both halves tokenize to the identical word "l'oreal". Uses the
    // `Text::new` path plus `into_original_token_1` to surface plain tokens.
    #[test]
    fn apostrophe() {
        let uws = "l'oreal; l\u{0060}oreal";
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 7,
                token: Token::Word(Word::Word("l'oreal".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 7,
                length: 1,
                token: Token::Special(Special::Punctuation(';')),
            },
            PositionalToken {
                source: uws,
                offset: 8,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            // The backtick variant yields the same normalized payload.
            PositionalToken {
                source: uws,
                offset: 9,
                length: 7,
                token: Token::Word(Word::Word("l'oreal".to_string())),
            },
        ];
        let text = Text::new(uws.into_source()).unwrap();
        let lib_res = text
            .into_tokenizer(TokenizerParams::v1())
            .filter_map(|tt| tt.into_original_token_1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
1125
1126 #[test]
1127 fn char_tokens() {
1128 let uws = "[Oxana Putan|1712640565] shared the quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово";
1129 let result = vec![
1130 CharToken {
1131 byte_offset: 0,
1132 byte_length: 1,
1133 char_offset: 0,
1134 char_length: 1,
1135 token: Token::Special(Special::Punctuation('[')),
1136 },
1137 CharToken {
1138 byte_offset: 1,
1139 byte_length: 5,
1140 char_offset: 1,
1141 char_length: 5,
1142 token: Token::Word(Word::Word("Oxana".to_string())),
1143 },
1144 CharToken {
1145 byte_offset: 6,
1146 byte_length: 1,
1147 char_offset: 6,
1148 char_length: 1,
1149 token: Token::Special(Special::Separator(Separator::Space)),
1150 },
1151 CharToken {
1152 byte_offset: 7,
1153 byte_length: 5,
1154 char_offset: 7,
1155 char_length: 5,
1156 token: Token::Word(Word::Word("Putan".to_string())),
1157 },
1158 CharToken {
1159 byte_offset: 12,
1160 byte_length: 1,
1161 char_offset: 12,
1162 char_length: 1,
1163 token: Token::Special(Special::Punctuation('|')),
1164 },
1165 CharToken {
1166 byte_offset: 13,
1167 byte_length: 10,
1168 char_offset: 13,
1169 char_length: 10,
1170 token: Token::Word(Word::Number(Number::Integer(1712640565))),
1171 },
1172 CharToken {
1173 byte_offset: 23,
1174 byte_length: 1,
1175 char_offset: 23,
1176 char_length: 1,
1177 token: Token::Special(Special::Punctuation(']')),
1178 },
1179 CharToken {
1187 byte_offset: 24,
1188 byte_length: 1,
1189 char_offset: 24,
1190 char_length: 1,
1191 token: Token::Special(Special::Separator(Separator::Space)),
1192 },
1193 CharToken {
1194 byte_offset: 25,
1195 byte_length: 6,
1196 char_offset: 25,
1197 char_length: 6,
1198 token: Token::Word(Word::Word("shared".to_string())),
1199 },
1200 CharToken {
1201 byte_offset: 31,
1202 byte_length: 1,
1203 char_offset: 31,
1204 char_length: 1,
1205 token: Token::Special(Special::Separator(Separator::Space)),
1206 },
1207 CharToken {
1208 byte_offset: 32,
1209 byte_length: 3,
1210 char_offset: 32,
1211 char_length: 3,
1212 token: Token::Word(Word::Word("the".to_string())),
1213 },
1214 CharToken {
1215 byte_offset: 35,
1216 byte_length: 1,
1217 char_offset: 35,
1218 char_length: 1,
1219 token: Token::Special(Special::Separator(Separator::Space)),
1220 },
1221 CharToken {
1222 byte_offset: 36,
1223 byte_length: 5,
1224 char_offset: 36,
1225 char_length: 5,
1226 token: Token::Word(Word::Word("quick".to_string())),
1227 },
1228 CharToken {
1229 byte_offset: 41,
1230 byte_length: 1,
1231 char_offset: 41,
1232 char_length: 1,
1233 token: Token::Special(Special::Separator(Separator::Space)),
1234 },
1235 CharToken {
1236 byte_offset: 42,
1237 byte_length: 1,
1238 char_offset: 42,
1239 char_length: 1,
1240 token: Token::Special(Special::Punctuation('(')),
1241 },
1242 CharToken {
1243 byte_offset: 43,
1244 byte_length: 1,
1245 char_offset: 43,
1246 char_length: 1,
1247 token: Token::Special(Special::Punctuation('"')),
1248 },
1249 CharToken {
1250 byte_offset: 44,
1251 byte_length: 5,
1252 char_offset: 44,
1253 char_length: 5,
1254 token: Token::Word(Word::Word("brown".to_string())),
1255 },
1256 CharToken {
1257 byte_offset: 49,
1258 byte_length: 1,
1259 char_offset: 49,
1260 char_length: 1,
1261 token: Token::Special(Special::Punctuation('"')),
1262 },
1263 CharToken {
1264 byte_offset: 50,
1265 byte_length: 1,
1266 char_offset: 50,
1267 char_length: 1,
1268 token: Token::Special(Special::Punctuation(')')),
1269 },
1270 CharToken {
1271 byte_offset: 51,
1272 byte_length: 1,
1273 char_offset: 51,
1274 char_length: 1,
1275 token: Token::Special(Special::Separator(Separator::Space)),
1276 },
1277 CharToken {
1278 byte_offset: 52,
1279 byte_length: 3,
1280 char_offset: 52,
1281 char_length: 3,
1282 token: Token::Word(Word::Word("fox".to_string())),
1283 },
1284 CharToken {
1285 byte_offset: 55,
1286 byte_length: 1,
1287 char_offset: 55,
1288 char_length: 1,
1289 token: Token::Special(Special::Separator(Separator::Space)),
1290 },
1291 CharToken {
1292 byte_offset: 56,
1293 byte_length: 5,
1294 char_offset: 56,
1295 char_length: 5,
1296 token: Token::Word(Word::Word("can\'t".to_string())),
1297 },
1298 CharToken {
1299 byte_offset: 61,
1300 byte_length: 1,
1301 char_offset: 61,
1302 char_length: 1,
1303 token: Token::Special(Special::Separator(Separator::Space)),
1304 },
1305 CharToken {
1306 byte_offset: 62,
1307 byte_length: 4,
1308 char_offset: 62,
1309 char_length: 4,
1310 token: Token::Word(Word::Word("jump".to_string())),
1311 },
1312 CharToken {
1313 byte_offset: 66,
1314 byte_length: 1,
1315 char_offset: 66,
1316 char_length: 1,
1317 token: Token::Special(Special::Separator(Separator::Space)),
1318 },
1319 CharToken {
1320 byte_offset: 67,
1321 byte_length: 4,
1322 char_offset: 67,
1323 char_length: 4,
1324 token: Token::Word(Word::Number(Number::Float(32.3))),
1325 },
1326 CharToken {
1327 byte_offset: 71,
1328 byte_length: 1,
1329 char_offset: 71,
1330 char_length: 1,
1331 token: Token::Special(Special::Separator(Separator::Space)),
1332 },
1333 CharToken {
1334 byte_offset: 72,
1335 byte_length: 4,
1336 char_offset: 72,
1337 char_length: 4,
1338 token: Token::Word(Word::Word("feet".to_string())),
1339 },
1340 CharToken {
1341 byte_offset: 76,
1342 byte_length: 1,
1343 char_offset: 76,
1344 char_length: 1,
1345 token: Token::Special(Special::Punctuation(',')),
1346 },
1347 CharToken {
1348 byte_offset: 77,
1349 byte_length: 1,
1350 char_offset: 77,
1351 char_length: 1,
1352 token: Token::Special(Special::Separator(Separator::Space)),
1353 },
1354 CharToken {
1355 byte_offset: 78,
1356 byte_length: 5,
1357 char_offset: 78,
1358 char_length: 5,
1359 token: Token::Word(Word::Word("right".to_string())),
1360 },
1361 CharToken {
1362 byte_offset: 83,
1363 byte_length: 1,
1364 char_offset: 83,
1365 char_length: 1,
1366 token: Token::Special(Special::Punctuation('?')),
1367 },
1368 CharToken {
1369 byte_offset: 84,
1370 byte_length: 1,
1371 char_offset: 84,
1372 char_length: 1,
1373 token: Token::Special(Special::Separator(Separator::Space)),
1374 },
1375 CharToken {
1376 byte_offset: 85,
1377 byte_length: 4,
1378 char_offset: 85,
1379 char_length: 4,
1380 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
1381 },
1382 CharToken {
1383 byte_offset: 89,
1384 byte_length: 1,
1385 char_offset: 89,
1386 char_length: 1,
1387 token: Token::Special(Special::Separator(Separator::Space)),
1388 },
1389 CharToken {
1390 byte_offset: 90,
1391 byte_length: 3,
1392 char_offset: 90,
1393 char_length: 3,
1394 token: Token::Word(Word::Word("etc".to_string())),
1395 },
1396 CharToken {
1397 byte_offset: 93,
1398 byte_length: 1,
1399 char_offset: 93,
1400 char_length: 1,
1401 token: Token::Special(Special::Punctuation('.')),
1402 },
1403 CharToken {
1404 byte_offset: 94,
1405 byte_length: 1,
1406 char_offset: 94,
1407 char_length: 1,
1408 token: Token::Special(Special::Separator(Separator::Space)),
1409 },
1410 CharToken {
1411 byte_offset: 95,
1412 byte_length: 3,
1413 char_offset: 95,
1414 char_length: 3,
1415 token: Token::Word(Word::Word("qeq".to_string())),
1416 },
1417 CharToken {
1418 byte_offset: 98,
1419 byte_length: 1,
1420 char_offset: 98,
1421 char_length: 1,
1422 token: Token::Special(Special::Separator(Separator::Space)),
1423 },
1424 CharToken {
1425 byte_offset: 99,
1426 byte_length: 5,
1427 char_offset: 99,
1428 char_length: 5,
1429 token: Token::Word(Word::Word("U.S.A".to_string())),
1430 },
1431 CharToken {
1432 byte_offset: 104,
1433 byte_length: 2,
1434 char_offset: 104,
1435 char_length: 2,
1436 token: Token::Special(Special::Separator(Separator::Space)),
1437 },
1438 CharToken {
1439 byte_offset: 106,
1440 byte_length: 3,
1441 char_offset: 106,
1442 char_length: 3,
1443 token: Token::Word(Word::Word("asd".to_string())),
1444 },
1445 CharToken {
1446 byte_offset: 109,
1447 byte_length: 3,
1448 char_offset: 109,
1449 char_length: 3,
1450 token: Token::Special(Special::Separator(Separator::Newline)),
1451 },
1452 CharToken {
1453 byte_offset: 112,
1454 byte_length: 3,
1455 char_offset: 112,
1456 char_length: 3,
1457 token: Token::Word(Word::Word("Brr".to_string())),
1458 },
1459 CharToken {
1460 byte_offset: 115,
1461 byte_length: 1,
1462 char_offset: 115,
1463 char_length: 1,
1464 token: Token::Special(Special::Punctuation(',')),
1465 },
1466 CharToken {
1467 byte_offset: 116,
1468 byte_length: 1,
1469 char_offset: 116,
1470 char_length: 1,
1471 token: Token::Special(Special::Separator(Separator::Space)),
1472 },
1473 CharToken {
1474 byte_offset: 117,
1475 byte_length: 4,
1476 char_offset: 117,
1477 char_length: 4,
1478 token: Token::Word(Word::Word("it\'s".to_string())),
1479 },
1480 CharToken {
1481 byte_offset: 121,
1482 byte_length: 1,
1483 char_offset: 121,
1484 char_length: 1,
1485 token: Token::Special(Special::Separator(Separator::Space)),
1486 },
1487 CharToken {
1488 byte_offset: 122,
1489 byte_length: 4,
1490 char_offset: 122,
1491 char_length: 4,
1492 token: Token::Word(Word::Number(Number::Float(29.3))),
1493 },
1494 CharToken {
1495 byte_offset: 126,
1496 byte_length: 2,
1497 char_offset: 126,
1498 char_length: 1,
1499 token: Token::Special(Special::Symbol('°')),
1500 },
1501 CharToken {
1502 byte_offset: 128,
1503 byte_length: 1,
1504 char_offset: 127,
1505 char_length: 1,
1506 token: Token::Word(Word::Word("F".to_string())),
1507 },
1508 CharToken {
1509 byte_offset: 129,
1510 byte_length: 1,
1511 char_offset: 128,
1512 char_length: 1,
1513 token: Token::Special(Special::Punctuation('!')),
1514 },
1515 CharToken {
1516 byte_offset: 130,
1517 byte_length: 1,
1518 char_offset: 129,
1519 char_length: 1,
1520 token: Token::Special(Special::Separator(Separator::Newline)),
1521 },
1522 CharToken {
1523 byte_offset: 131,
1524 byte_length: 1,
1525 char_offset: 130,
1526 char_length: 1,
1527 token: Token::Special(Special::Separator(Separator::Space)),
1528 },
1529 CharToken {
1530 byte_offset: 132,
1531 byte_length: 14,
1532 char_offset: 131,
1533 char_length: 7,
1534 token: Token::Word(Word::Word("Русское".to_string())),
1535 },
1536 CharToken {
1537 byte_offset: 146,
1538 byte_length: 1,
1539 char_offset: 138,
1540 char_length: 1,
1541 token: Token::Special(Special::Separator(Separator::Space)),
1542 },
1543 CharToken {
1544 byte_offset: 147,
1545 byte_length: 22,
1546 char_offset: 139,
1547 char_length: 11,
1548 token: Token::Word(Word::Word("предложение".to_string())),
1549 },
1550 CharToken {
1551 byte_offset: 169,
1552 byte_length: 1,
1553 char_offset: 150,
1554 char_length: 1,
1555 token: Token::Special(Special::Separator(Separator::Space)),
1556 },
1557 CharToken {
1558 byte_offset: 170,
1559 byte_length: 5,
1560 char_offset: 151,
1561 char_length: 5,
1562 token: Token::Struct(Struct::Hashtag("36.6".to_string())),
1563 },
1564 CharToken {
1565 byte_offset: 175,
1566 byte_length: 1,
1567 char_offset: 156,
1568 char_length: 1,
1569 token: Token::Special(Special::Separator(Separator::Space)),
1570 },
1571 CharToken {
1572 byte_offset: 176,
1573 byte_length: 6,
1574 char_offset: 157,
1575 char_length: 3,
1576 token: Token::Word(Word::Word("для".to_string())),
1577 },
1578 CharToken {
1579 byte_offset: 182,
1580 byte_length: 1,
1581 char_offset: 160,
1582 char_length: 1,
1583 token: Token::Special(Special::Separator(Separator::Space)),
1584 },
1585 CharToken {
1586 byte_offset: 183,
1587 byte_length: 24,
1588 char_offset: 161,
1589 char_length: 12,
1590 token: Token::Word(Word::Word("тестирования".to_string())),
1591 },
1592 CharToken {
1593 byte_offset: 207,
1594 byte_length: 1,
1595 char_offset: 173,
1596 char_length: 1,
1597 token: Token::Special(Special::Separator(Separator::Space)),
1598 },
1599 CharToken {
1600 byte_offset: 208,
1601 byte_length: 14,
1602 char_offset: 174,
1603 char_length: 7,
1604 token: Token::Word(Word::Word("деления".to_string())),
1605 },
1606 CharToken {
1607 byte_offset: 222,
1608 byte_length: 1,
1609 char_offset: 181,
1610 char_length: 1,
1611 token: Token::Special(Special::Separator(Separator::Space)),
1612 },
1613 CharToken {
1614 byte_offset: 223,
1615 byte_length: 4,
1616 char_offset: 182,
1617 char_length: 2,
1618 token: Token::Word(Word::Word("по".to_string())),
1619 },
1620 CharToken {
1621 byte_offset: 227,
1622 byte_length: 1,
1623 char_offset: 184,
1624 char_length: 1,
1625 token: Token::Special(Special::Separator(Separator::Space)),
1626 },
1627 CharToken {
1628 byte_offset: 228,
1629 byte_length: 12,
1630 char_offset: 185,
1631 char_length: 6,
1632 token: Token::Word(Word::Word("юникод".to_string())),
1633 },
1634 CharToken {
1635 byte_offset: 240,
1636 byte_length: 1,
1637 char_offset: 191,
1638 char_length: 1,
1639 token: Token::Special(Special::Punctuation('-')),
1640 },
1641 CharToken {
1642 byte_offset: 241,
1643 byte_length: 12,
1644 char_offset: 192,
1645 char_length: 6,
1646 token: Token::Word(Word::Word("словам".to_string())),
1647 },
1648 CharToken {
1649 byte_offset: 253,
1650 byte_length: 3,
1651 char_offset: 198,
1652 char_length: 3,
1653 token: Token::Special(Special::Punctuation('.')),
1654 },
1655 CharToken {
1656 byte_offset: 256,
1657 byte_length: 1,
1658 char_offset: 201,
1659 char_length: 1,
1660 token: Token::Special(Special::Separator(Separator::Newline)),
1661 },
1662 CharToken {
1663 byte_offset: 257,
1664 byte_length: 8,
1665 char_offset: 202,
1666 char_length: 2,
1667 token: Token::Word(Word::Emoji("russia")),
1668 },
1669 CharToken {
1670 byte_offset: 265,
1671 byte_length: 1,
1672 char_offset: 204,
1673 char_length: 1,
1674 token: Token::Special(Special::Separator(Separator::Space)),
1675 },
1676 CharToken {
1677 byte_offset: 266,
1678 byte_length: 8,
1679 char_offset: 205,
1680 char_length: 2,
1681 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
1682 },
1683 CharToken {
1684 byte_offset: 274,
1685 byte_length: 1,
1686 char_offset: 207,
1687 char_length: 1,
1688 token: Token::Special(Special::Separator(Separator::Newline)),
1689 },
1690 CharToken {
1691 byte_offset: 275,
1692 byte_length: 8,
1693 char_offset: 208,
1694 char_length: 2,
1695 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
1696 },
1697 CharToken {
1698 byte_offset: 283,
1699 byte_length: 8,
1700 char_offset: 210,
1701 char_length: 2,
1702 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
1703 },
1704 CharToken {
1705 byte_offset: 291,
1706 byte_length: 8,
1707 char_offset: 212,
1708 char_length: 2,
1709 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
1710 },
1711 CharToken {
1712 byte_offset: 299,
1713 byte_length: 1,
1714 char_offset: 214,
1715 char_length: 1,
1716 token: Token::Special(Special::Separator(Separator::Newline)),
1717 },
1718 CharToken {
1719 byte_offset: 300,
1720 byte_length: 1,
1721 char_offset: 215,
1722 char_length: 1,
1723 token: Token::Special(Special::Punctuation('+')),
1724 },
1725 CharToken {
1726 byte_offset: 301,
1727 byte_length: 4,
1728 char_offset: 216,
1729 char_length: 4,
1730 token: Token::Word(Word::Word("Done".to_string())),
1731 },
1732 CharToken {
1733 byte_offset: 305,
1734 byte_length: 1,
1735 char_offset: 220,
1736 char_length: 1,
1737 token: Token::Special(Special::Punctuation('!')),
1738 },
1739 CharToken {
1740 byte_offset: 306,
1741 byte_length: 1,
1742 char_offset: 221,
1743 char_length: 1,
1744 token: Token::Special(Special::Separator(Separator::Space)),
1745 },
1746 CharToken {
1747 byte_offset: 307,
1748 byte_length: 12,
1749 char_offset: 222,
1750 char_length: 6,
1751 token: Token::Word(Word::Word("Готово".to_string())),
1752 },
1753 ];
1754
1755 let lib_res = uws
1756 .into_tokenizer(TokenizerParams::complex())
1757 .collect::<Vec<_>>();
1758
1759 check_cresults(&result, &lib_res, uws);
1761 }
1762
1763 #[test]
1764 fn general_default() {
1765 let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
1766 let result = vec![
1767 PositionalToken {
1768 source: uws,
1769 offset: 0,
1770 length: 3,
1771 token: Token::Word(Word::Word("The".to_string())),
1772 },
1773 PositionalToken {
1774 source: uws,
1775 offset: 3,
1776 length: 1,
1777 token: Token::Special(Special::Separator(Separator::Space)),
1778 },
1779 PositionalToken {
1780 source: uws,
1781 offset: 4,
1782 length: 5,
1783 token: Token::Word(Word::Word("quick".to_string())),
1784 },
1785 PositionalToken {
1786 source: uws,
1787 offset: 9,
1788 length: 1,
1789 token: Token::Special(Special::Separator(Separator::Space)),
1790 },
1791 PositionalToken {
1792 source: uws,
1793 offset: 10,
1794 length: 1,
1795 token: Token::Special(Special::Punctuation('(')),
1796 },
1797 PositionalToken {
1798 source: uws,
1799 offset: 11,
1800 length: 1,
1801 token: Token::Special(Special::Punctuation('"')),
1802 },
1803 PositionalToken {
1804 source: uws,
1805 offset: 12,
1806 length: 5,
1807 token: Token::Word(Word::Word("brown".to_string())),
1808 },
1809 PositionalToken {
1810 source: uws,
1811 offset: 17,
1812 length: 1,
1813 token: Token::Special(Special::Punctuation('"')),
1814 },
1815 PositionalToken {
1816 source: uws,
1817 offset: 18,
1818 length: 1,
1819 token: Token::Special(Special::Punctuation(')')),
1820 },
1821 PositionalToken {
1822 source: uws,
1823 offset: 19,
1824 length: 1,
1825 token: Token::Special(Special::Separator(Separator::Space)),
1826 },
1827 PositionalToken {
1828 source: uws,
1829 offset: 20,
1830 length: 3,
1831 token: Token::Word(Word::Word("fox".to_string())),
1832 },
1833 PositionalToken {
1834 source: uws,
1835 offset: 23,
1836 length: 1,
1837 token: Token::Special(Special::Separator(Separator::Space)),
1838 },
1839 PositionalToken {
1840 source: uws,
1841 offset: 24,
1842 length: 5,
1843 token: Token::Word(Word::Word("can\'t".to_string())),
1844 },
1845 PositionalToken {
1846 source: uws,
1847 offset: 29,
1848 length: 1,
1849 token: Token::Special(Special::Separator(Separator::Space)),
1850 },
1851 PositionalToken {
1852 source: uws,
1853 offset: 30,
1854 length: 4,
1855 token: Token::Word(Word::Word("jump".to_string())),
1856 },
1857 PositionalToken {
1858 source: uws,
1859 offset: 34,
1860 length: 1,
1861 token: Token::Special(Special::Separator(Separator::Space)),
1862 },
1863 PositionalToken {
1864 source: uws,
1865 offset: 35,
1866 length: 4,
1867 token: Token::Word(Word::Number(Number::Float(32.3))),
1868 },
1869 PositionalToken {
1870 source: uws,
1871 offset: 39,
1872 length: 1,
1873 token: Token::Special(Special::Separator(Separator::Space)),
1874 },
1875 PositionalToken {
1876 source: uws,
1877 offset: 40,
1878 length: 4,
1879 token: Token::Word(Word::Word("feet".to_string())),
1880 },
1881 PositionalToken {
1882 source: uws,
1883 offset: 44,
1884 length: 1,
1885 token: Token::Special(Special::Punctuation(',')),
1886 },
1887 PositionalToken {
1888 source: uws,
1889 offset: 45,
1890 length: 1,
1891 token: Token::Special(Special::Separator(Separator::Space)),
1892 },
1893 PositionalToken {
1894 source: uws,
1895 offset: 46,
1896 length: 5,
1897 token: Token::Word(Word::Word("right".to_string())),
1898 },
1899 PositionalToken {
1900 source: uws,
1901 offset: 51,
1902 length: 1,
1903 token: Token::Special(Special::Punctuation('?')),
1904 },
1905 PositionalToken {
1906 source: uws,
1907 offset: 52,
1908 length: 1,
1909 token: Token::Special(Special::Separator(Separator::Space)),
1910 },
1911 PositionalToken {
1912 source: uws,
1913 offset: 53,
1914 length: 4,
1915 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
1916 }, PositionalToken {
1918 source: uws,
1919 offset: 57,
1920 length: 1,
1921 token: Token::Special(Special::Separator(Separator::Space)),
1922 },
1923 PositionalToken {
1924 source: uws,
1925 offset: 58,
1926 length: 3,
1927 token: Token::Word(Word::Word("etc".to_string())),
1928 },
1929 PositionalToken {
1930 source: uws,
1931 offset: 61,
1932 length: 1,
1933 token: Token::Special(Special::Punctuation('.')),
1934 },
1935 PositionalToken {
1936 source: uws,
1937 offset: 62,
1938 length: 1,
1939 token: Token::Special(Special::Separator(Separator::Space)),
1940 },
1941 PositionalToken {
1942 source: uws,
1943 offset: 63,
1944 length: 3,
1945 token: Token::Word(Word::Word("qeq".to_string())),
1946 },
1947 PositionalToken {
1948 source: uws,
1949 offset: 66,
1950 length: 1,
1951 token: Token::Special(Special::Separator(Separator::Space)),
1952 },
1953 PositionalToken {
1954 source: uws,
1955 offset: 67,
1956 length: 1,
1957 token: Token::Word(Word::Word("U".to_string())),
1958 },
1959 PositionalToken {
1960 source: uws,
1961 offset: 68,
1962 length: 1,
1963 token: Token::Special(Special::Punctuation('.')),
1964 },
1965 PositionalToken {
1966 source: uws,
1967 offset: 69,
1968 length: 1,
1969 token: Token::Word(Word::Word("S".to_string())),
1970 },
1971 PositionalToken {
1972 source: uws,
1973 offset: 70,
1974 length: 1,
1975 token: Token::Special(Special::Punctuation('.')),
1976 },
1977 PositionalToken {
1978 source: uws,
1979 offset: 71,
1980 length: 1,
1981 token: Token::Word(Word::Word("A".to_string())),
1982 },
1983 PositionalToken {
1984 source: uws,
1985 offset: 72,
1986 length: 2,
1987 token: Token::Special(Special::Separator(Separator::Space)),
1988 },
1989 PositionalToken {
1990 source: uws,
1991 offset: 74,
1992 length: 3,
1993 token: Token::Word(Word::Word("asd".to_string())),
1994 },
1995 PositionalToken {
1996 source: uws,
1997 offset: 77,
1998 length: 3,
1999 token: Token::Special(Special::Separator(Separator::Newline)),
2000 },
2001 PositionalToken {
2002 source: uws,
2003 offset: 80,
2004 length: 3,
2005 token: Token::Word(Word::Word("Brr".to_string())),
2006 },
2007 PositionalToken {
2008 source: uws,
2009 offset: 83,
2010 length: 1,
2011 token: Token::Special(Special::Punctuation(',')),
2012 },
2013 PositionalToken {
2014 source: uws,
2015 offset: 84,
2016 length: 1,
2017 token: Token::Special(Special::Separator(Separator::Space)),
2018 },
2019 PositionalToken {
2020 source: uws,
2021 offset: 85,
2022 length: 4,
2023 token: Token::Word(Word::Word("it\'s".to_string())),
2024 },
2025 PositionalToken {
2026 source: uws,
2027 offset: 89,
2028 length: 1,
2029 token: Token::Special(Special::Separator(Separator::Space)),
2030 },
2031 PositionalToken {
2032 source: uws,
2033 offset: 90,
2034 length: 4,
2035 token: Token::Word(Word::Number(Number::Float(29.3))),
2036 },
2037 PositionalToken {
2038 source: uws,
2039 offset: 94,
2040 length: 2,
2041 token: Token::Special(Special::Symbol('°')),
2042 },
2043 PositionalToken {
2044 source: uws,
2045 offset: 96,
2046 length: 1,
2047 token: Token::Word(Word::Word("F".to_string())),
2048 },
2049 PositionalToken {
2050 source: uws,
2051 offset: 97,
2052 length: 1,
2053 token: Token::Special(Special::Punctuation('!')),
2054 },
2055 PositionalToken {
2056 source: uws,
2057 offset: 98,
2058 length: 1,
2059 token: Token::Special(Special::Separator(Separator::Newline)),
2060 },
2061 PositionalToken {
2062 source: uws,
2063 offset: 99,
2064 length: 1,
2065 token: Token::Special(Special::Separator(Separator::Space)),
2066 },
2067 PositionalToken {
2068 source: uws,
2069 offset: 100,
2070 length: 14,
2071 token: Token::Word(Word::Word("Русское".to_string())),
2072 },
2073 PositionalToken {
2074 source: uws,
2075 offset: 114,
2076 length: 1,
2077 token: Token::Special(Special::Separator(Separator::Space)),
2078 },
2079 PositionalToken {
2080 source: uws,
2081 offset: 115,
2082 length: 22,
2083 token: Token::Word(Word::Word("предложение".to_string())),
2084 },
2085 PositionalToken {
2086 source: uws,
2087 offset: 137,
2088 length: 1,
2089 token: Token::Special(Special::Separator(Separator::Space)),
2090 },
2091 PositionalToken {
2092 source: uws,
2093 offset: 138,
2094 length: 1,
2095 token: Token::Special(Special::Punctuation('#')),
2096 },
2097 PositionalToken {
2098 source: uws,
2099 offset: 139,
2100 length: 4,
2101 token: Token::Word(Word::Number(Number::Float(36.6))),
2102 },
2103 PositionalToken {
2104 source: uws,
2105 offset: 143,
2106 length: 1,
2107 token: Token::Special(Special::Separator(Separator::Space)),
2108 },
2109 PositionalToken {
2110 source: uws,
2111 offset: 144,
2112 length: 6,
2113 token: Token::Word(Word::Word("для".to_string())),
2114 },
2115 PositionalToken {
2116 source: uws,
2117 offset: 150,
2118 length: 1,
2119 token: Token::Special(Special::Separator(Separator::Space)),
2120 },
2121 PositionalToken {
2122 source: uws,
2123 offset: 151,
2124 length: 24,
2125 token: Token::Word(Word::Word("тестирования".to_string())),
2126 },
2127 PositionalToken {
2128 source: uws,
2129 offset: 175,
2130 length: 1,
2131 token: Token::Special(Special::Separator(Separator::Space)),
2132 },
2133 PositionalToken {
2134 source: uws,
2135 offset: 176,
2136 length: 14,
2137 token: Token::Word(Word::Word("деления".to_string())),
2138 },
2139 PositionalToken {
2140 source: uws,
2141 offset: 190,
2142 length: 1,
2143 token: Token::Special(Special::Separator(Separator::Space)),
2144 },
2145 PositionalToken {
2146 source: uws,
2147 offset: 191,
2148 length: 4,
2149 token: Token::Word(Word::Word("по".to_string())),
2150 },
2151 PositionalToken {
2152 source: uws,
2153 offset: 195,
2154 length: 1,
2155 token: Token::Special(Special::Separator(Separator::Space)),
2156 },
2157 PositionalToken {
2158 source: uws,
2159 offset: 196,
2160 length: 12,
2161 token: Token::Word(Word::Word("юникод".to_string())),
2162 },
2163 PositionalToken {
2164 source: uws,
2165 offset: 208,
2166 length: 1,
2167 token: Token::Special(Special::Punctuation('-')),
2168 },
2169 PositionalToken {
2170 source: uws,
2171 offset: 209,
2172 length: 12,
2173 token: Token::Word(Word::Word("словам".to_string())),
2174 },
2175 PositionalToken {
2176 source: uws,
2177 offset: 221,
2178 length: 3,
2179 token: Token::Special(Special::Punctuation('.')),
2180 },
2181 PositionalToken {
2182 source: uws,
2183 offset: 224,
2184 length: 1,
2185 token: Token::Special(Special::Separator(Separator::Newline)),
2186 },
2187 ];
2188 let lib_res = uws
2189 .into_tokenizer(TokenizerParams::v1())
2190 .collect::<Vec<_>>();
2191 check_results(&result, &lib_res, uws);
2192 }
2193
2194 #[test]
2195 fn general_no_split() {
2196 let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
2197 let result = vec![
2198 PositionalToken {
2199 source: uws,
2200 offset: 0,
2201 length: 3,
2202 token: Token::Word(Word::Word("The".to_string())),
2203 },
2204 PositionalToken {
2205 source: uws,
2206 offset: 3,
2207 length: 1,
2208 token: Token::Special(Special::Separator(Separator::Space)),
2209 },
2210 PositionalToken {
2211 source: uws,
2212 offset: 4,
2213 length: 5,
2214 token: Token::Word(Word::Word("quick".to_string())),
2215 },
2216 PositionalToken {
2217 source: uws,
2218 offset: 9,
2219 length: 1,
2220 token: Token::Special(Special::Separator(Separator::Space)),
2221 },
2222 PositionalToken {
2223 source: uws,
2224 offset: 10,
2225 length: 1,
2226 token: Token::Special(Special::Punctuation('(')),
2227 },
2228 PositionalToken {
2229 source: uws,
2230 offset: 11,
2231 length: 1,
2232 token: Token::Special(Special::Punctuation('"')),
2233 },
2234 PositionalToken {
2235 source: uws,
2236 offset: 12,
2237 length: 5,
2238 token: Token::Word(Word::Word("brown".to_string())),
2239 },
2240 PositionalToken {
2241 source: uws,
2242 offset: 17,
2243 length: 1,
2244 token: Token::Special(Special::Punctuation('"')),
2245 },
2246 PositionalToken {
2247 source: uws,
2248 offset: 18,
2249 length: 1,
2250 token: Token::Special(Special::Punctuation(')')),
2251 },
2252 PositionalToken {
2253 source: uws,
2254 offset: 19,
2255 length: 1,
2256 token: Token::Special(Special::Separator(Separator::Space)),
2257 },
2258 PositionalToken {
2259 source: uws,
2260 offset: 20,
2261 length: 3,
2262 token: Token::Word(Word::Word("fox".to_string())),
2263 },
2264 PositionalToken {
2265 source: uws,
2266 offset: 23,
2267 length: 1,
2268 token: Token::Special(Special::Separator(Separator::Space)),
2269 },
2270 PositionalToken {
2271 source: uws,
2272 offset: 24,
2273 length: 5,
2274 token: Token::Word(Word::Word("can\'t".to_string())),
2275 },
2276 PositionalToken {
2277 source: uws,
2278 offset: 29,
2279 length: 1,
2280 token: Token::Special(Special::Separator(Separator::Space)),
2281 },
2282 PositionalToken {
2283 source: uws,
2284 offset: 30,
2285 length: 4,
2286 token: Token::Word(Word::Word("jump".to_string())),
2287 },
2288 PositionalToken {
2289 source: uws,
2290 offset: 34,
2291 length: 1,
2292 token: Token::Special(Special::Separator(Separator::Space)),
2293 },
2294 PositionalToken {
2295 source: uws,
2296 offset: 35,
2297 length: 4,
2298 token: Token::Word(Word::Number(Number::Float(32.3))),
2299 },
2300 PositionalToken {
2301 source: uws,
2302 offset: 39,
2303 length: 1,
2304 token: Token::Special(Special::Separator(Separator::Space)),
2305 },
2306 PositionalToken {
2307 source: uws,
2308 offset: 40,
2309 length: 4,
2310 token: Token::Word(Word::Word("feet".to_string())),
2311 },
2312 PositionalToken {
2313 source: uws,
2314 offset: 44,
2315 length: 1,
2316 token: Token::Special(Special::Punctuation(',')),
2317 },
2318 PositionalToken {
2319 source: uws,
2320 offset: 45,
2321 length: 1,
2322 token: Token::Special(Special::Separator(Separator::Space)),
2323 },
2324 PositionalToken {
2325 source: uws,
2326 offset: 46,
2327 length: 5,
2328 token: Token::Word(Word::Word("right".to_string())),
2329 },
2330 PositionalToken {
2331 source: uws,
2332 offset: 51,
2333 length: 1,
2334 token: Token::Special(Special::Punctuation('?')),
2335 },
2336 PositionalToken {
2337 source: uws,
2338 offset: 52,
2339 length: 1,
2340 token: Token::Special(Special::Separator(Separator::Space)),
2341 },
2342 PositionalToken {
2343 source: uws,
2344 offset: 53,
2345 length: 4,
2346 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
2347 }, PositionalToken {
2349 source: uws,
2350 offset: 57,
2351 length: 1,
2352 token: Token::Special(Special::Separator(Separator::Space)),
2353 },
2354 PositionalToken {
2355 source: uws,
2356 offset: 58,
2357 length: 3,
2358 token: Token::Word(Word::Word("etc".to_string())),
2359 },
2360 PositionalToken {
2361 source: uws,
2362 offset: 61,
2363 length: 1,
2364 token: Token::Special(Special::Punctuation('.')),
2365 },
2366 PositionalToken {
2367 source: uws,
2368 offset: 62,
2369 length: 1,
2370 token: Token::Special(Special::Separator(Separator::Space)),
2371 },
2372 PositionalToken {
2373 source: uws,
2374 offset: 63,
2375 length: 3,
2376 token: Token::Word(Word::Word("qeq".to_string())),
2377 },
2378 PositionalToken {
2379 source: uws,
2380 offset: 66,
2381 length: 1,
2382 token: Token::Special(Special::Separator(Separator::Space)),
2383 },
2384 PositionalToken {
2385 source: uws,
2386 offset: 67,
2387 length: 5,
2388 token: Token::Word(Word::Word("U.S.A".to_string())),
2389 },
2390 PositionalToken {
2391 source: uws,
2392 offset: 72,
2393 length: 1,
2394 token: Token::Special(Special::Separator(Separator::Space)),
2395 },
2396 PositionalToken {
2397 source: uws,
2398 offset: 73,
2399 length: 1,
2400 token: Token::Special(Special::Separator(Separator::Space)),
2401 },
2402 PositionalToken {
2403 source: uws,
2404 offset: 74,
2405 length: 3,
2406 token: Token::Word(Word::Word("asd".to_string())),
2407 },
2408 PositionalToken {
2409 source: uws,
2410 offset: 77,
2411 length: 1,
2412 token: Token::Special(Special::Separator(Separator::Newline)),
2413 },
2414 PositionalToken {
2415 source: uws,
2416 offset: 78,
2417 length: 1,
2418 token: Token::Special(Special::Separator(Separator::Newline)),
2419 },
2420 PositionalToken {
2421 source: uws,
2422 offset: 79,
2423 length: 1,
2424 token: Token::Special(Special::Separator(Separator::Newline)),
2425 },
2426 PositionalToken {
2427 source: uws,
2428 offset: 80,
2429 length: 3,
2430 token: Token::Word(Word::Word("Brr".to_string())),
2431 },
2432 PositionalToken {
2433 source: uws,
2434 offset: 83,
2435 length: 1,
2436 token: Token::Special(Special::Punctuation(',')),
2437 },
2438 PositionalToken {
2439 source: uws,
2440 offset: 84,
2441 length: 1,
2442 token: Token::Special(Special::Separator(Separator::Space)),
2443 },
2444 PositionalToken {
2445 source: uws,
2446 offset: 85,
2447 length: 4,
2448 token: Token::Word(Word::Word("it\'s".to_string())),
2449 },
2450 PositionalToken {
2451 source: uws,
2452 offset: 89,
2453 length: 1,
2454 token: Token::Special(Special::Separator(Separator::Space)),
2455 },
2456 PositionalToken {
2457 source: uws,
2458 offset: 90,
2459 length: 4,
2460 token: Token::Word(Word::Number(Number::Float(29.3))),
2461 },
2462 PositionalToken {
2463 source: uws,
2464 offset: 94,
2465 length: 2,
2466 token: Token::Special(Special::Symbol('°')),
2467 },
2468 PositionalToken {
2469 source: uws,
2470 offset: 96,
2471 length: 1,
2472 token: Token::Word(Word::Word("F".to_string())),
2473 },
2474 PositionalToken {
2475 source: uws,
2476 offset: 97,
2477 length: 1,
2478 token: Token::Special(Special::Punctuation('!')),
2479 },
2480 PositionalToken {
2481 source: uws,
2482 offset: 98,
2483 length: 1,
2484 token: Token::Special(Special::Separator(Separator::Newline)),
2485 },
2486 PositionalToken {
2487 source: uws,
2488 offset: 99,
2489 length: 1,
2490 token: Token::Special(Special::Separator(Separator::Space)),
2491 },
2492 PositionalToken {
2493 source: uws,
2494 offset: 100,
2495 length: 14,
2496 token: Token::Word(Word::Word("Русское".to_string())),
2497 },
2498 PositionalToken {
2499 source: uws,
2500 offset: 114,
2501 length: 1,
2502 token: Token::Special(Special::Separator(Separator::Space)),
2503 },
2504 PositionalToken {
2505 source: uws,
2506 offset: 115,
2507 length: 22,
2508 token: Token::Word(Word::Word("предложение".to_string())),
2509 },
2510 PositionalToken {
2511 source: uws,
2512 offset: 137,
2513 length: 1,
2514 token: Token::Special(Special::Separator(Separator::Space)),
2515 },
2516 PositionalToken {
2517 source: uws,
2518 offset: 138,
2519 length: 1,
2520 token: Token::Special(Special::Punctuation('#')),
2521 },
2522 PositionalToken {
2523 source: uws,
2524 offset: 139,
2525 length: 4,
2526 token: Token::Word(Word::Number(Number::Float(36.6))),
2527 },
2528 PositionalToken {
2529 source: uws,
2530 offset: 143,
2531 length: 1,
2532 token: Token::Special(Special::Separator(Separator::Space)),
2533 },
2534 PositionalToken {
2535 source: uws,
2536 offset: 144,
2537 length: 6,
2538 token: Token::Word(Word::Word("для".to_string())),
2539 },
2540 PositionalToken {
2541 source: uws,
2542 offset: 150,
2543 length: 1,
2544 token: Token::Special(Special::Separator(Separator::Space)),
2545 },
2546 PositionalToken {
2547 source: uws,
2548 offset: 151,
2549 length: 24,
2550 token: Token::Word(Word::Word("тестирования".to_string())),
2551 },
2552 PositionalToken {
2553 source: uws,
2554 offset: 175,
2555 length: 1,
2556 token: Token::Special(Special::Separator(Separator::Space)),
2557 },
2558 PositionalToken {
2559 source: uws,
2560 offset: 176,
2561 length: 14,
2562 token: Token::Word(Word::Word("деления".to_string())),
2563 },
2564 PositionalToken {
2565 source: uws,
2566 offset: 190,
2567 length: 1,
2568 token: Token::Special(Special::Separator(Separator::Space)),
2569 },
2570 PositionalToken {
2571 source: uws,
2572 offset: 191,
2573 length: 4,
2574 token: Token::Word(Word::Word("по".to_string())),
2575 },
2576 PositionalToken {
2577 source: uws,
2578 offset: 195,
2579 length: 1,
2580 token: Token::Special(Special::Separator(Separator::Space)),
2581 },
2582 PositionalToken {
2583 source: uws,
2584 offset: 196,
2585 length: 12,
2586 token: Token::Word(Word::Word("юникод".to_string())),
2587 },
2588 PositionalToken {
2589 source: uws,
2590 offset: 208,
2591 length: 1,
2592 token: Token::Special(Special::Punctuation('-')),
2593 },
2594 PositionalToken {
2595 source: uws,
2596 offset: 209,
2597 length: 12,
2598 token: Token::Word(Word::Word("словам".to_string())),
2599 },
2600 PositionalToken {
2601 source: uws,
2602 offset: 221,
2603 length: 1,
2604 token: Token::Special(Special::Punctuation('.')),
2605 },
2606 PositionalToken {
2607 source: uws,
2608 offset: 222,
2609 length: 1,
2610 token: Token::Special(Special::Punctuation('.')),
2611 },
2612 PositionalToken {
2613 source: uws,
2614 offset: 223,
2615 length: 1,
2616 token: Token::Special(Special::Punctuation('.')),
2617 },
2618 PositionalToken {
2619 source: uws,
2620 offset: 224,
2621 length: 1,
2622 token: Token::Special(Special::Separator(Separator::Newline)),
2623 },
2624 ];
2625 let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
2626 check_results(&result, &lib_res, uws);
2627 }
2628
2629 #[test]
2630 fn general_complex() {
2631 let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
2632 let result = vec![
2633 PositionalToken {
2634 source: uws,
2635 offset: 0,
2636 length: 3,
2637 token: Token::Word(Word::Word("The".to_string())),
2638 },
2639 PositionalToken {
2640 source: uws,
2641 offset: 3,
2642 length: 1,
2643 token: Token::Special(Special::Separator(Separator::Space)),
2644 },
2645 PositionalToken {
2646 source: uws,
2647 offset: 4,
2648 length: 5,
2649 token: Token::Word(Word::Word("quick".to_string())),
2650 },
2651 PositionalToken {
2652 source: uws,
2653 offset: 9,
2654 length: 1,
2655 token: Token::Special(Special::Separator(Separator::Space)),
2656 },
2657 PositionalToken {
2658 source: uws,
2659 offset: 10,
2660 length: 1,
2661 token: Token::Special(Special::Punctuation('(')),
2662 },
2663 PositionalToken {
2664 source: uws,
2665 offset: 11,
2666 length: 1,
2667 token: Token::Special(Special::Punctuation('"')),
2668 },
2669 PositionalToken {
2670 source: uws,
2671 offset: 12,
2672 length: 5,
2673 token: Token::Word(Word::Word("brown".to_string())),
2674 },
2675 PositionalToken {
2676 source: uws,
2677 offset: 17,
2678 length: 1,
2679 token: Token::Special(Special::Punctuation('"')),
2680 },
2681 PositionalToken {
2682 source: uws,
2683 offset: 18,
2684 length: 1,
2685 token: Token::Special(Special::Punctuation(')')),
2686 },
2687 PositionalToken {
2688 source: uws,
2689 offset: 19,
2690 length: 1,
2691 token: Token::Special(Special::Separator(Separator::Space)),
2692 },
2693 PositionalToken {
2694 source: uws,
2695 offset: 20,
2696 length: 3,
2697 token: Token::Word(Word::Word("fox".to_string())),
2698 },
2699 PositionalToken {
2700 source: uws,
2701 offset: 23,
2702 length: 1,
2703 token: Token::Special(Special::Separator(Separator::Space)),
2704 },
2705 PositionalToken {
2706 source: uws,
2707 offset: 24,
2708 length: 5,
2709 token: Token::Word(Word::Word("can\'t".to_string())),
2710 },
2711 PositionalToken {
2712 source: uws,
2713 offset: 29,
2714 length: 1,
2715 token: Token::Special(Special::Separator(Separator::Space)),
2716 },
2717 PositionalToken {
2718 source: uws,
2719 offset: 30,
2720 length: 4,
2721 token: Token::Word(Word::Word("jump".to_string())),
2722 },
2723 PositionalToken {
2724 source: uws,
2725 offset: 34,
2726 length: 1,
2727 token: Token::Special(Special::Separator(Separator::Space)),
2728 },
2729 PositionalToken {
2730 source: uws,
2731 offset: 35,
2732 length: 4,
2733 token: Token::Word(Word::Number(Number::Float(32.3))),
2734 },
2735 PositionalToken {
2736 source: uws,
2737 offset: 39,
2738 length: 1,
2739 token: Token::Special(Special::Separator(Separator::Space)),
2740 },
2741 PositionalToken {
2742 source: uws,
2743 offset: 40,
2744 length: 4,
2745 token: Token::Word(Word::Word("feet".to_string())),
2746 },
2747 PositionalToken {
2748 source: uws,
2749 offset: 44,
2750 length: 1,
2751 token: Token::Special(Special::Punctuation(',')),
2752 },
2753 PositionalToken {
2754 source: uws,
2755 offset: 45,
2756 length: 1,
2757 token: Token::Special(Special::Separator(Separator::Space)),
2758 },
2759 PositionalToken {
2760 source: uws,
2761 offset: 46,
2762 length: 5,
2763 token: Token::Word(Word::Word("right".to_string())),
2764 },
2765 PositionalToken {
2766 source: uws,
2767 offset: 51,
2768 length: 1,
2769 token: Token::Special(Special::Punctuation('?')),
2770 },
2771 PositionalToken {
2772 source: uws,
2773 offset: 52,
2774 length: 1,
2775 token: Token::Special(Special::Separator(Separator::Space)),
2776 },
2777 PositionalToken {
2778 source: uws,
2779 offset: 53,
2780 length: 4,
2781 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
2782 }, PositionalToken {
2784 source: uws,
2785 offset: 57,
2786 length: 1,
2787 token: Token::Special(Special::Separator(Separator::Space)),
2788 },
2789 PositionalToken {
2790 source: uws,
2791 offset: 58,
2792 length: 3,
2793 token: Token::Word(Word::Word("etc".to_string())),
2794 },
2795 PositionalToken {
2796 source: uws,
2797 offset: 61,
2798 length: 1,
2799 token: Token::Special(Special::Punctuation('.')),
2800 },
2801 PositionalToken {
2802 source: uws,
2803 offset: 62,
2804 length: 1,
2805 token: Token::Special(Special::Separator(Separator::Space)),
2806 },
2807 PositionalToken {
2808 source: uws,
2809 offset: 63,
2810 length: 3,
2811 token: Token::Word(Word::Word("qeq".to_string())),
2812 },
2813 PositionalToken {
2814 source: uws,
2815 offset: 66,
2816 length: 1,
2817 token: Token::Special(Special::Separator(Separator::Space)),
2818 },
2819 PositionalToken {
2820 source: uws,
2821 offset: 67,
2822 length: 5,
2823 token: Token::Word(Word::Word("U.S.A".to_string())),
2824 },
2825 PositionalToken {
2826 source: uws,
2827 offset: 72,
2828 length: 2,
2829 token: Token::Special(Special::Separator(Separator::Space)),
2830 },
2831 PositionalToken {
2832 source: uws,
2833 offset: 74,
2834 length: 3,
2835 token: Token::Word(Word::Word("asd".to_string())),
2836 },
2837 PositionalToken {
2838 source: uws,
2839 offset: 77,
2840 length: 3,
2841 token: Token::Special(Special::Separator(Separator::Newline)),
2842 },
2843 PositionalToken {
2844 source: uws,
2845 offset: 80,
2846 length: 3,
2847 token: Token::Word(Word::Word("Brr".to_string())),
2848 },
2849 PositionalToken {
2850 source: uws,
2851 offset: 83,
2852 length: 1,
2853 token: Token::Special(Special::Punctuation(',')),
2854 },
2855 PositionalToken {
2856 source: uws,
2857 offset: 84,
2858 length: 1,
2859 token: Token::Special(Special::Separator(Separator::Space)),
2860 },
2861 PositionalToken {
2862 source: uws,
2863 offset: 85,
2864 length: 4,
2865 token: Token::Word(Word::Word("it\'s".to_string())),
2866 },
2867 PositionalToken {
2868 source: uws,
2869 offset: 89,
2870 length: 1,
2871 token: Token::Special(Special::Separator(Separator::Space)),
2872 },
2873 PositionalToken {
2874 source: uws,
2875 offset: 90,
2876 length: 4,
2877 token: Token::Word(Word::Number(Number::Float(29.3))),
2878 },
2879 PositionalToken {
2880 source: uws,
2881 offset: 94,
2882 length: 2,
2883 token: Token::Special(Special::Symbol('°')),
2884 },
2885 PositionalToken {
2886 source: uws,
2887 offset: 96,
2888 length: 1,
2889 token: Token::Word(Word::Word("F".to_string())),
2890 },
2891 PositionalToken {
2892 source: uws,
2893 offset: 97,
2894 length: 1,
2895 token: Token::Special(Special::Punctuation('!')),
2896 },
2897 PositionalToken {
2898 source: uws,
2899 offset: 98,
2900 length: 1,
2901 token: Token::Special(Special::Separator(Separator::Newline)),
2902 },
2903 PositionalToken {
2904 source: uws,
2905 offset: 99,
2906 length: 1,
2907 token: Token::Special(Special::Separator(Separator::Space)),
2908 },
2909 PositionalToken {
2910 source: uws,
2911 offset: 100,
2912 length: 14,
2913 token: Token::Word(Word::Word("Русское".to_string())),
2914 },
2915 PositionalToken {
2916 source: uws,
2917 offset: 114,
2918 length: 1,
2919 token: Token::Special(Special::Separator(Separator::Space)),
2920 },
2921 PositionalToken {
2922 source: uws,
2923 offset: 115,
2924 length: 22,
2925 token: Token::Word(Word::Word("предложение".to_string())),
2926 },
2927 PositionalToken {
2928 source: uws,
2929 offset: 137,
2930 length: 1,
2931 token: Token::Special(Special::Separator(Separator::Space)),
2932 },
2933 PositionalToken {
2934 source: uws,
2935 offset: 138,
2936 length: 5,
2937 token: Token::Struct(Struct::Hashtag("36.6".to_string())),
2938 },
2939 PositionalToken {
2940 source: uws,
2941 offset: 143,
2942 length: 1,
2943 token: Token::Special(Special::Separator(Separator::Space)),
2944 },
2945 PositionalToken {
2946 source: uws,
2947 offset: 144,
2948 length: 6,
2949 token: Token::Word(Word::Word("для".to_string())),
2950 },
2951 PositionalToken {
2952 source: uws,
2953 offset: 150,
2954 length: 1,
2955 token: Token::Special(Special::Separator(Separator::Space)),
2956 },
2957 PositionalToken {
2958 source: uws,
2959 offset: 151,
2960 length: 24,
2961 token: Token::Word(Word::Word("тестирования".to_string())),
2962 },
2963 PositionalToken {
2964 source: uws,
2965 offset: 175,
2966 length: 1,
2967 token: Token::Special(Special::Separator(Separator::Space)),
2968 },
2969 PositionalToken {
2970 source: uws,
2971 offset: 176,
2972 length: 14,
2973 token: Token::Word(Word::Word("деления".to_string())),
2974 },
2975 PositionalToken {
2976 source: uws,
2977 offset: 190,
2978 length: 1,
2979 token: Token::Special(Special::Separator(Separator::Space)),
2980 },
2981 PositionalToken {
2982 source: uws,
2983 offset: 191,
2984 length: 4,
2985 token: Token::Word(Word::Word("по".to_string())),
2986 },
2987 PositionalToken {
2988 source: uws,
2989 offset: 195,
2990 length: 1,
2991 token: Token::Special(Special::Separator(Separator::Space)),
2992 },
2993 PositionalToken {
2994 source: uws,
2995 offset: 196,
2996 length: 12,
2997 token: Token::Word(Word::Word("юникод".to_string())),
2998 },
2999 PositionalToken {
3000 source: uws,
3001 offset: 208,
3002 length: 1,
3003 token: Token::Special(Special::Punctuation('-')),
3004 },
3005 PositionalToken {
3006 source: uws,
3007 offset: 209,
3008 length: 12,
3009 token: Token::Word(Word::Word("словам".to_string())),
3010 },
3011 PositionalToken {
3012 source: uws,
3013 offset: 221,
3014 length: 3,
3015 token: Token::Special(Special::Punctuation('.')),
3016 },
3017 PositionalToken {
3018 source: uws,
3019 offset: 224,
3020 length: 1,
3021 token: Token::Special(Special::Separator(Separator::Newline)),
3022 },
3023 ];
3024 let lib_res = uws
3025 .into_tokenizer(TokenizerParams::complex())
3026 .collect::<Vec<_>>();
3027 check_results(&result, &lib_res, uws);
3028 }
3029
3030 #[test]
3031 fn plus_minus() {
3032 let uws = "+23 -4.5 -34 +25.7 - 2 + 5.6";
3033 let result = vec![
3034 PositionalToken {
3035 source: uws,
3036 offset: 0,
3037 length: 3,
3038 token: Token::Word(Word::Number(Number::Integer(23))),
3039 },
3040 PositionalToken {
3041 source: uws,
3042 offset: 3,
3043 length: 1,
3044 token: Token::Special(Special::Separator(Separator::Space)),
3045 },
3046 PositionalToken {
3047 source: uws,
3048 offset: 4,
3049 length: 4,
3050 token: Token::Word(Word::Number(Number::Float(-4.5))),
3051 },
3052 PositionalToken {
3053 source: uws,
3054 offset: 8,
3055 length: 1,
3056 token: Token::Special(Special::Separator(Separator::Space)),
3057 },
3058 PositionalToken {
3059 source: uws,
3060 offset: 9,
3061 length: 3,
3062 token: Token::Word(Word::Number(Number::Integer(-34))),
3063 },
3064 PositionalToken {
3065 source: uws,
3066 offset: 12,
3067 length: 1,
3068 token: Token::Special(Special::Separator(Separator::Space)),
3069 },
3070 PositionalToken {
3071 source: uws,
3072 offset: 13,
3073 length: 5,
3074 token: Token::Word(Word::Number(Number::Float(25.7))),
3075 },
3076 PositionalToken {
3077 source: uws,
3078 offset: 18,
3079 length: 1,
3080 token: Token::Special(Special::Separator(Separator::Space)),
3081 },
3082 PositionalToken {
3083 source: uws,
3084 offset: 19,
3085 length: 1,
3086 token: Token::Special(Special::Punctuation('-')),
3087 },
3088 PositionalToken {
3089 source: uws,
3090 offset: 20,
3091 length: 1,
3092 token: Token::Special(Special::Separator(Separator::Space)),
3093 },
3094 PositionalToken {
3095 source: uws,
3096 offset: 21,
3097 length: 1,
3098 token: Token::Word(Word::Number(Number::Integer(2))),
3099 },
3100 PositionalToken {
3101 source: uws,
3102 offset: 22,
3103 length: 1,
3104 token: Token::Special(Special::Separator(Separator::Space)),
3105 },
3106 PositionalToken {
3107 source: uws,
3108 offset: 23,
3109 length: 1,
3110 token: Token::Special(Special::Punctuation('+')),
3111 },
3112 PositionalToken {
3113 source: uws,
3114 offset: 24,
3115 length: 1,
3116 token: Token::Special(Special::Separator(Separator::Space)),
3117 },
3118 PositionalToken {
3119 source: uws,
3120 offset: 25,
3121 length: 3,
3122 token: Token::Word(Word::Number(Number::Float(5.6))),
3123 },
3124 ];
3125 let lib_res = uws
3126 .into_tokenizer(TokenizerParams::v1())
3127 .collect::<Vec<_>>();
3128 check(&result, &lib_res, uws);
3129 }
3131
3132 #[test]
3133 #[ignore]
3134 fn woman_bouncing_ball() {
3135 let uws = "\u{26f9}\u{200d}\u{2640}";
3136 let result = vec![PositionalToken {
3137 source: uws,
3138 offset: 0,
3139 length: 9,
3140 token: Token::Word(Word::Emoji("woman_bouncing_ball")),
3141 }];
3142 let lib_res = uws
3143 .into_tokenizer(TokenizerParams::v1())
3144 .collect::<Vec<_>>();
3145 check_results(&result, &lib_res, uws);
3146 }
3148
3149 #[test]
3150 fn emoji_and_rusabbr_default() {
3151 let uws = "🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n👱\nС.С.С.Р.\n👨👩👦👦\n🧠\n";
3152 let result = vec![
3153 PositionalToken {
3154 source: uws,
3155 offset: 0,
3156 length: 8,
3157 token: Token::Word(Word::Emoji("russia")),
3158 },
3159 PositionalToken {
3160 source: uws,
3161 offset: 8,
3162 length: 1,
3163 token: Token::Special(Special::Separator(Separator::Space)),
3164 },
3165 PositionalToken {
3166 source: uws,
3167 offset: 9,
3168 length: 8,
3169 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
3170 },
3171 PositionalToken {
3172 source: uws,
3173 offset: 17,
3174 length: 1,
3175 token: Token::Special(Special::Separator(Separator::Newline)),
3176 },
3177 PositionalToken {
3178 source: uws,
3179 offset: 18,
3180 length: 8,
3181 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
3182 },
3183 PositionalToken {
3184 source: uws,
3185 offset: 26,
3186 length: 8,
3187 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
3188 },
3189 PositionalToken {
3190 source: uws,
3191 offset: 34,
3192 length: 8,
3193 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
3194 },
3195 PositionalToken {
3196 source: uws,
3197 offset: 42,
3198 length: 1,
3199 token: Token::Special(Special::Separator(Separator::Newline)),
3200 },
3201 PositionalToken {
3202 source: uws,
3203 offset: 43,
3204 length: 4,
3205 token: Token::Word(Word::Emoji("blond_haired_person")),
3206 },
3207 PositionalToken {
3208 source: uws,
3209 offset: 47,
3210 length: 1,
3211 token: Token::Special(Special::Separator(Separator::Newline)),
3212 },
3213 PositionalToken {
3214 source: uws,
3215 offset: 48,
3216 length: 2,
3217 token: Token::Word(Word::Word("С".to_string())),
3218 },
3219 PositionalToken {
3220 source: uws,
3221 offset: 50,
3222 length: 1,
3223 token: Token::Special(Special::Punctuation('.')),
3224 },
3225 PositionalToken {
3226 source: uws,
3227 offset: 51,
3228 length: 2,
3229 token: Token::Word(Word::Word("С".to_string())),
3230 },
3231 PositionalToken {
3232 source: uws,
3233 offset: 53,
3234 length: 1,
3235 token: Token::Special(Special::Punctuation('.')),
3236 },
3237 PositionalToken {
3238 source: uws,
3239 offset: 54,
3240 length: 2,
3241 token: Token::Word(Word::Word("С".to_string())),
3242 },
3243 PositionalToken {
3244 source: uws,
3245 offset: 56,
3246 length: 1,
3247 token: Token::Special(Special::Punctuation('.')),
3248 },
3249 PositionalToken {
3250 source: uws,
3251 offset: 57,
3252 length: 2,
3253 token: Token::Word(Word::Word("Р".to_string())),
3254 },
3255 PositionalToken {
3256 source: uws,
3257 offset: 59,
3258 length: 1,
3259 token: Token::Special(Special::Punctuation('.')),
3260 },
3261 PositionalToken {
3262 source: uws,
3263 offset: 60,
3264 length: 1,
3265 token: Token::Special(Special::Separator(Separator::Newline)),
3266 },
3267 PositionalToken {
3268 source: uws,
3269 offset: 61,
3270 length: 25,
3271 token: Token::Word(Word::Emoji("family_man_woman_boy_boy")),
3272 },
3273 PositionalToken {
3274 source: uws,
3275 offset: 86,
3276 length: 1,
3277 token: Token::Special(Special::Separator(Separator::Newline)),
3278 },
3279 PositionalToken {
3280 source: uws,
3281 offset: 87,
3282 length: 4,
3283 token: Token::Word(Word::Emoji("brain")),
3284 },
3285 PositionalToken {
3286 source: uws,
3287 offset: 91,
3288 length: 1,
3289 token: Token::Special(Special::Separator(Separator::Newline)),
3290 },
3291 ];
3292
3293 let lib_res = uws
3294 .into_tokenizer(TokenizerParams::v1())
3295 .collect::<Vec<_>>();
3296 check_results(&result, &lib_res, uws);
3297 }
3299
3300 #[test]
3301 fn emoji_and_rusabbr_no_split() {
3302 let uws = "🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n👱\nС.С.С.Р.\n👨👩👦👦\n🧠\n";
3303 let result = vec![
3304 PositionalToken {
3305 source: uws,
3306 offset: 0,
3307 length: 8,
3308 token: Token::Word(Word::Emoji("russia")),
3309 },
3310 PositionalToken {
3311 source: uws,
3312 offset: 8,
3313 length: 1,
3314 token: Token::Special(Special::Separator(Separator::Space)),
3315 },
3316 PositionalToken {
3317 source: uws,
3318 offset: 9,
3319 length: 8,
3320 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
3321 },
3322 PositionalToken {
3323 source: uws,
3324 offset: 17,
3325 length: 1,
3326 token: Token::Special(Special::Separator(Separator::Newline)),
3327 },
3328 PositionalToken {
3329 source: uws,
3330 offset: 18,
3331 length: 8,
3332 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
3333 },
3334 PositionalToken {
3335 source: uws,
3336 offset: 26,
3337 length: 8,
3338 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
3339 },
3340 PositionalToken {
3341 source: uws,
3342 offset: 34,
3343 length: 8,
3344 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
3345 },
3346 PositionalToken {
3347 source: uws,
3348 offset: 42,
3349 length: 1,
3350 token: Token::Special(Special::Separator(Separator::Newline)),
3351 },
3352 PositionalToken {
3353 source: uws,
3354 offset: 43,
3355 length: 4,
3356 token: Token::Word(Word::Emoji("blond_haired_person")),
3357 },
3358 PositionalToken {
3359 source: uws,
3360 offset: 47,
3361 length: 1,
3362 token: Token::Special(Special::Separator(Separator::Newline)),
3363 },
3364 PositionalToken {
3365 source: uws,
3366 offset: 48,
3367 length: 11,
3368 token: Token::Word(Word::Word("С.С.С.Р".to_string())),
3369 },
3370 PositionalToken {
3371 source: uws,
3372 offset: 59,
3373 length: 1,
3374 token: Token::Special(Special::Punctuation('.')),
3375 },
3376 PositionalToken {
3377 source: uws,
3378 offset: 60,
3379 length: 1,
3380 token: Token::Special(Special::Separator(Separator::Newline)),
3381 },
3382 PositionalToken {
3383 source: uws,
3384 offset: 61,
3385 length: 25,
3386 token: Token::Word(Word::Emoji("family_man_woman_boy_boy")),
3387 },
3388 PositionalToken {
3389 source: uws,
3390 offset: 86,
3391 length: 1,
3392 token: Token::Special(Special::Separator(Separator::Newline)),
3393 },
3394 PositionalToken {
3395 source: uws,
3396 offset: 87,
3397 length: 4,
3398 token: Token::Word(Word::Emoji("brain")),
3399 },
3400 PositionalToken {
3401 source: uws,
3402 offset: 91,
3403 length: 1,
3404 token: Token::Special(Special::Separator(Separator::Newline)),
3405 },
3406 ];
3407
3408 let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
3409 check_results(&result, &lib_res, uws);
3410 }
3412
3413 #[test]
3637 fn html() {
3638 let uws = "<div class=\"article article_view \" id=\"article_view_-113039156_9551\" data-article-url=\"/@chaibuket-o-chem-ne-zabyt-25-noyabrya\" data-audio-context=\"article:-113039156_9551\"><h1 class=\"article_decoration_first article_decoration_last\" >День Мамы </h1><p class=\"article_decoration_first article_decoration_last\" >День, когда поздравляют мам, бабушек, сестер и жён — это всемирный праздник, называемый «День Мамы». В настоящее время его отмечают почти в каждой стране, просто везде разные даты и способы празднования. </p><h3 class=\"article_decoration_first article_decoration_last\" ><span class='article_anchor_title'>\n <span class='article_anchor_button' id='pochemu-my-ego-prazdnuem'></span>\n <span class='article_anchor_fsymbol'>П</span>\n</span>ПОЧЕМУ МЫ ЕГО ПРАЗДНУЕМ</h3><p class=\"article_decoration_first article_decoration_last article_decoration_before\" >В 1987 году комитет госдумы по делам женщин, семьи и молодежи выступил с предложением учредить «День мамы», а сам приказ был подписан уже 30 января 1988 года Борисом Ельциным. Было решено, что ежегодно в России празднество дня мамы будет выпадать на последнее воскресенье ноября. 
</p><figure data-type=\"101\" data-mode=\"\" class=\"article_decoration_first article_decoration_last\" >\n <div class=\"article_figure_content\" style=\"width: 1125px\">\n <div class=\"article_figure_sizer_content\"><div class=\"article_object_sizer_wrap\" data-sizes=\"[{"s":["https://pp.userapi.com/c849128/v849128704/c0ffd/pcNJaBH3NDo.jpg",75,50],"m":["https://pp.userapi.com/c849128/v849128704/c0ffe/ozCLs2kHtRY.jpg",130,87],"x":["https://pp.userapi.com/c849128/v849128704/c0fff/E4KtTNDydzE.jpg",604,403],"y":["https://pp.userapi.com/c849128/v849128704/c1000/1nLxpYKavzU.jpg",807,538],"z":["https://pp.userapi.com/c849128/v849128704/c1001/IgEODe90yEk.jpg",1125,750],"o":["https://pp.userapi.com/c849128/v849128704/c1002/01faNwVZ2_E.jpg",130,87],"p":["https://pp.userapi.com/c849128/v849128704/c1003/baDFzbdRP2s.jpg",200,133],"q":["https://pp.userapi.com/c849128/v849128704/c1004/CY4khI6KJKA.jpg",320,213],"r":["https://pp.userapi.com/c849128/v849128704/c1005/NOvAJ6-VltY.jpg",510,340]}]\">\n <img class=\"article_object_sizer_inner article_object_photo__image_blur\" src=\"https://pp.userapi.com/c849128/v849128704/c0ffd/pcNJaBH3NDo.jpg\" data-baseurl=\"\"/>\n \n</div></div>\n <div class=\"article_figure_sizer\" style=\"padding-bottom: 66.666666666667%\"></div>";
3639 let result = vec![
3640 PositionalToken {
3641 source: uws,
3642 offset: 236,
3643 length: 8,
3644 token: Token::Word(Word::Word("День".to_string())),
3645 },
3646 PositionalToken {
3647 source: uws,
3648 offset: 244,
3649 length: 1,
3650 token: Token::Special(Special::Separator(Separator::Space)),
3651 },
3652 PositionalToken {
3653 source: uws,
3654 offset: 245,
3655 length: 8,
3656 token: Token::Word(Word::Word("Мамы".to_string())),
3657 },
3658 PositionalToken {
3659 source: uws,
3660 offset: 253,
3661 length: 1,
3662 token: Token::Special(Special::Separator(Separator::Space)),
3663 },
3664 PositionalToken {
3665 source: uws,
3666 offset: 321,
3667 length: 8,
3668 token: Token::Word(Word::Word("День".to_string())),
3669 },
3670 PositionalToken {
3671 source: uws,
3672 offset: 329,
3673 length: 1,
3674 token: Token::Special(Special::Punctuation(',')),
3675 },
3676 PositionalToken {
3677 source: uws,
3678 offset: 330,
3679 length: 1,
3680 token: Token::Special(Special::Separator(Separator::Space)),
3681 },
3682 PositionalToken {
3683 source: uws,
3684 offset: 331,
3685 length: 10,
3686 token: Token::Word(Word::Word("когда".to_string())),
3687 },
3688 PositionalToken {
3689 source: uws,
3690 offset: 341,
3691 length: 1,
3692 token: Token::Special(Special::Separator(Separator::Space)),
3693 },
3694 PositionalToken {
3695 source: uws,
3696 offset: 342,
3697 length: 22,
3698 token: Token::Word(Word::Word("поздравляют".to_string())),
3699 },
3700 PositionalToken {
3701 source: uws,
3702 offset: 364,
3703 length: 1,
3704 token: Token::Special(Special::Separator(Separator::Space)),
3705 },
3706 PositionalToken {
3707 source: uws,
3708 offset: 365,
3709 length: 6,
3710 token: Token::Word(Word::Word("мам".to_string())),
3711 },
3712 PositionalToken {
3713 source: uws,
3714 offset: 371,
3715 length: 1,
3716 token: Token::Special(Special::Punctuation(',')),
3717 },
3718 PositionalToken {
3719 source: uws,
3720 offset: 372,
3721 length: 1,
3722 token: Token::Special(Special::Separator(Separator::Space)),
3723 },
3724 PositionalToken {
3725 source: uws,
3726 offset: 373,
3727 length: 14,
3728 token: Token::Word(Word::Word("бабушек".to_string())),
3729 },
3730 PositionalToken {
3731 source: uws,
3732 offset: 387,
3733 length: 1,
3734 token: Token::Special(Special::Punctuation(',')),
3735 },
3736 PositionalToken {
3737 source: uws,
3738 offset: 388,
3739 length: 1,
3740 token: Token::Special(Special::Separator(Separator::Space)),
3741 },
3742 PositionalToken {
3743 source: uws,
3744 offset: 389,
3745 length: 12,
3746 token: Token::Word(Word::Word("сестер".to_string())),
3747 },
3748 PositionalToken {
3749 source: uws,
3750 offset: 401,
3751 length: 1,
3752 token: Token::Special(Special::Separator(Separator::Space)),
3753 },
3754 PositionalToken {
3755 source: uws,
3756 offset: 402,
3757 length: 2,
3758 token: Token::Word(Word::Word("и".to_string())),
3759 },
3760 PositionalToken {
3761 source: uws,
3762 offset: 404,
3763 length: 1,
3764 token: Token::Special(Special::Separator(Separator::Space)),
3765 },
3766 PositionalToken {
3767 source: uws,
3768 offset: 405,
3769 length: 6,
3770 token: Token::Word(Word::Word("жён".to_string())),
3771 },
3772 PositionalToken {
3773 source: uws,
3774 offset: 411,
3775 length: 1,
3776 token: Token::Special(Special::Separator(Separator::Space)),
3777 },
3778 PositionalToken {
3779 source: uws,
3780 offset: 412,
3781 length: 3,
3782 token: Token::Special(Special::Punctuation('—')),
3783 },
3784 PositionalToken {
3785 source: uws,
3786 offset: 415,
3787 length: 1,
3788 token: Token::Special(Special::Separator(Separator::Space)),
3789 },
3790 PositionalToken {
3791 source: uws,
3792 offset: 416,
3793 length: 6,
3794 token: Token::Word(Word::Word("это".to_string())),
3795 },
3796 PositionalToken {
3797 source: uws,
3798 offset: 422,
3799 length: 1,
3800 token: Token::Special(Special::Separator(Separator::Space)),
3801 },
3802 PositionalToken {
3803 source: uws,
3804 offset: 423,
3805 length: 18,
3806 token: Token::Word(Word::Word("всемирный".to_string())),
3807 },
3808 PositionalToken {
3809 source: uws,
3810 offset: 441,
3811 length: 1,
3812 token: Token::Special(Special::Separator(Separator::Space)),
3813 },
3814 PositionalToken {
3815 source: uws,
3816 offset: 442,
3817 length: 16,
3818 token: Token::Word(Word::Word("праздник".to_string())),
3819 },
3820 PositionalToken {
3821 source: uws,
3822 offset: 458,
3823 length: 1,
3824 token: Token::Special(Special::Punctuation(',')),
3825 },
3826 PositionalToken {
3827 source: uws,
3828 offset: 459,
3829 length: 1,
3830 token: Token::Special(Special::Separator(Separator::Space)),
3831 },
3832 PositionalToken {
3833 source: uws,
3834 offset: 460,
3835 length: 20,
3836 token: Token::Word(Word::Word("называемый".to_string())),
3837 },
3838 PositionalToken {
3839 source: uws,
3840 offset: 480,
3841 length: 1,
3842 token: Token::Special(Special::Separator(Separator::Space)),
3843 },
3844 PositionalToken {
3845 source: uws,
3846 offset: 481,
3847 length: 2,
3848 token: Token::Special(Special::Punctuation('«')),
3849 },
3850 PositionalToken {
3851 source: uws,
3852 offset: 483,
3853 length: 8,
3854 token: Token::Word(Word::Word("День".to_string())),
3855 },
3856 PositionalToken {
3857 source: uws,
3858 offset: 491,
3859 length: 1,
3860 token: Token::Special(Special::Separator(Separator::Space)),
3861 },
3862 PositionalToken {
3863 source: uws,
3864 offset: 492,
3865 length: 8,
3866 token: Token::Word(Word::Word("Мамы".to_string())),
3867 },
3868 PositionalToken {
3869 source: uws,
3870 offset: 500,
3871 length: 2,
3872 token: Token::Special(Special::Punctuation('»')),
3873 },
3874 PositionalToken {
3875 source: uws,
3876 offset: 502,
3877 length: 1,
3878 token: Token::Special(Special::Punctuation('.')),
3879 },
3880 PositionalToken {
3881 source: uws,
3882 offset: 503,
3883 length: 1,
3884 token: Token::Special(Special::Separator(Separator::Space)),
3885 },
3886 PositionalToken {
3887 source: uws,
3888 offset: 504,
3889 length: 2,
3890 token: Token::Word(Word::Word("В".to_string())),
3891 },
3892 PositionalToken {
3893 source: uws,
3894 offset: 506,
3895 length: 1,
3896 token: Token::Special(Special::Separator(Separator::Space)),
3897 },
3898 PositionalToken {
3899 source: uws,
3900 offset: 507,
3901 length: 18,
3902 token: Token::Word(Word::Word("настоящее".to_string())),
3903 },
3904 PositionalToken {
3905 source: uws,
3906 offset: 525,
3907 length: 1,
3908 token: Token::Special(Special::Separator(Separator::Space)),
3909 },
3910 PositionalToken {
3911 source: uws,
3912 offset: 526,
3913 length: 10,
3914 token: Token::Word(Word::Word("время".to_string())),
3915 },
3916 PositionalToken {
3917 source: uws,
3918 offset: 536,
3919 length: 1,
3920 token: Token::Special(Special::Separator(Separator::Space)),
3921 },
3922 PositionalToken {
3923 source: uws,
3924 offset: 537,
3925 length: 6,
3926 token: Token::Word(Word::Word("его".to_string())),
3927 },
3928 PositionalToken {
3929 source: uws,
3930 offset: 543,
3931 length: 1,
3932 token: Token::Special(Special::Separator(Separator::Space)),
3933 },
3934 PositionalToken {
3935 source: uws,
3936 offset: 544,
3937 length: 16,
3938 token: Token::Word(Word::Word("отмечают".to_string())),
3939 },
3940 PositionalToken {
3941 source: uws,
3942 offset: 560,
3943 length: 1,
3944 token: Token::Special(Special::Separator(Separator::Space)),
3945 },
3946 PositionalToken {
3947 source: uws,
3948 offset: 561,
3949 length: 10,
3950 token: Token::Word(Word::Word("почти".to_string())),
3951 },
3952 PositionalToken {
3953 source: uws,
3954 offset: 571,
3955 length: 1,
3956 token: Token::Special(Special::Separator(Separator::Space)),
3957 },
3958 PositionalToken {
3959 source: uws,
3960 offset: 572,
3961 length: 2,
3962 token: Token::Word(Word::Word("в".to_string())),
3963 },
3964 PositionalToken {
3965 source: uws,
3966 offset: 574,
3967 length: 1,
3968 token: Token::Special(Special::Separator(Separator::Space)),
3969 },
3970 PositionalToken {
3971 source: uws,
3972 offset: 575,
3973 length: 12,
3974 token: Token::Word(Word::Word("каждой".to_string())),
3975 },
3976 PositionalToken {
3977 source: uws,
3978 offset: 587,
3979 length: 1,
3980 token: Token::Special(Special::Separator(Separator::Space)),
3981 },
3982 PositionalToken {
3983 source: uws,
3984 offset: 588,
3985 length: 12,
3986 token: Token::Word(Word::Word("стране".to_string())),
3987 },
3988 PositionalToken {
3989 source: uws,
3990 offset: 600,
3991 length: 1,
3992 token: Token::Special(Special::Punctuation(',')),
3993 },
3994 PositionalToken {
3995 source: uws,
3996 offset: 601,
3997 length: 1,
3998 token: Token::Special(Special::Separator(Separator::Space)),
3999 },
4000 PositionalToken {
4001 source: uws,
4002 offset: 602,
4003 length: 12,
4004 token: Token::Word(Word::Word("просто".to_string())),
4005 },
4006 PositionalToken {
4007 source: uws,
4008 offset: 614,
4009 length: 1,
4010 token: Token::Special(Special::Separator(Separator::Space)),
4011 },
4012 PositionalToken {
4013 source: uws,
4014 offset: 615,
4015 length: 10,
4016 token: Token::Word(Word::Word("везде".to_string())),
4017 },
4018 PositionalToken {
4019 source: uws,
4020 offset: 625,
4021 length: 1,
4022 token: Token::Special(Special::Separator(Separator::Space)),
4023 },
4024 PositionalToken {
4025 source: uws,
4026 offset: 626,
4027 length: 12,
4028 token: Token::Word(Word::Word("разные".to_string())),
4029 },
4030 PositionalToken {
4031 source: uws,
4032 offset: 638,
4033 length: 1,
4034 token: Token::Special(Special::Separator(Separator::Space)),
4035 },
4036 PositionalToken {
4037 source: uws,
4038 offset: 639,
4039 length: 8,
4040 token: Token::Word(Word::Word("даты".to_string())),
4041 },
4042 PositionalToken {
4043 source: uws,
4044 offset: 647,
4045 length: 1,
4046 token: Token::Special(Special::Separator(Separator::Space)),
4047 },
4048 PositionalToken {
4049 source: uws,
4050 offset: 648,
4051 length: 2,
4052 token: Token::Word(Word::Word("и".to_string())),
4053 },
4054 PositionalToken {
4055 source: uws,
4056 offset: 650,
4057 length: 1,
4058 token: Token::Special(Special::Separator(Separator::Space)),
4059 },
4060 PositionalToken {
4061 source: uws,
4062 offset: 651,
4063 length: 14,
4064 token: Token::Word(Word::Word("способы".to_string())),
4065 },
4066 PositionalToken {
4067 source: uws,
4068 offset: 665,
4069 length: 1,
4070 token: Token::Special(Special::Separator(Separator::Space)),
4071 },
4072 PositionalToken {
4073 source: uws,
4074 offset: 666,
4075 length: 24,
4076 token: Token::Word(Word::Word("празднования".to_string())),
4077 },
4078 PositionalToken {
4079 source: uws,
4080 offset: 690,
4081 length: 1,
4082 token: Token::Special(Special::Punctuation('.')),
4083 },
4084 PositionalToken {
4085 source: uws,
4086 offset: 691,
4087 length: 1,
4088 token: Token::Special(Special::Separator(Separator::Space)),
4089 },
4090 PositionalToken {
4091 source: uws,
4092 offset: 794,
4093 length: 1,
4094 token: Token::Special(Special::Separator(Separator::Newline)),
4095 },
4096 PositionalToken {
4097 source: uws,
4098 offset: 795,
4099 length: 2,
4100 token: Token::Special(Special::Separator(Separator::Space)),
4101 },
4102 PositionalToken {
4103 source: uws,
4104 offset: 870,
4105 length: 1,
4106 token: Token::Special(Special::Separator(Separator::Newline)),
4107 },
4108 PositionalToken {
4109 source: uws,
4110 offset: 871,
4111 length: 2,
4112 token: Token::Special(Special::Separator(Separator::Space)),
4113 },
4114 PositionalToken {
4115 source: uws,
4116 offset: 910,
4117 length: 2,
4118 token: Token::Word(Word::Word("П".to_string())),
4119 },
4120 PositionalToken {
4121 source: uws,
4122 offset: 919,
4123 length: 1,
4124 token: Token::Special(Special::Separator(Separator::Newline)),
4125 },
4126 PositionalToken {
4127 source: uws,
4128 offset: 927,
4129 length: 12,
4130 token: Token::Word(Word::Word("ПОЧЕМУ".to_string())),
4131 },
4132 PositionalToken {
4133 source: uws,
4134 offset: 939,
4135 length: 1,
4136 token: Token::Special(Special::Separator(Separator::Space)),
4137 },
4138 PositionalToken {
4139 source: uws,
4140 offset: 940,
4141 length: 4,
4142 token: Token::Word(Word::Word("МЫ".to_string())),
4143 },
4144 PositionalToken {
4145 source: uws,
4146 offset: 944,
4147 length: 1,
4148 token: Token::Special(Special::Separator(Separator::Space)),
4149 },
4150 PositionalToken {
4151 source: uws,
4152 offset: 945,
4153 length: 6,
4154 token: Token::Word(Word::Word("ЕГО".to_string())),
4155 },
4156 PositionalToken {
4157 source: uws,
4158 offset: 951,
4159 length: 1,
4160 token: Token::Special(Special::Separator(Separator::Space)),
4161 },
4162 PositionalToken {
4163 source: uws,
4164 offset: 952,
4165 length: 18,
4166 token: Token::Word(Word::Word("ПРАЗДНУЕМ".to_string())),
4167 },
4168 PositionalToken {
4169 source: uws,
4170 offset: 1063,
4171 length: 2,
4172 token: Token::Word(Word::Word("В".to_string())),
4173 },
4174 PositionalToken {
4175 source: uws,
4176 offset: 1065,
4177 length: 1,
4178 token: Token::Special(Special::Separator(Separator::Space)),
4179 },
4180 PositionalToken {
4181 source: uws,
4182 offset: 1066,
4183 length: 4,
4184 token: Token::Word(Word::Number(Number::Integer(1987))),
4185 },
4186 PositionalToken {
4187 source: uws,
4188 offset: 1070,
4189 length: 1,
4190 token: Token::Special(Special::Separator(Separator::Space)),
4191 },
4192 PositionalToken {
4193 source: uws,
4194 offset: 1071,
4195 length: 8,
4196 token: Token::Word(Word::Word("году".to_string())),
4197 },
4198 PositionalToken {
4199 source: uws,
4200 offset: 1079,
4201 length: 1,
4202 token: Token::Special(Special::Separator(Separator::Space)),
4203 },
4204 PositionalToken {
4205 source: uws,
4206 offset: 1080,
4207 length: 14,
4208 token: Token::Word(Word::Word("комитет".to_string())),
4209 },
4210 PositionalToken {
4211 source: uws,
4212 offset: 1094,
4213 length: 1,
4214 token: Token::Special(Special::Separator(Separator::Space)),
4215 },
4216 PositionalToken {
4217 source: uws,
4218 offset: 1095,
4219 length: 14,
4220 token: Token::Word(Word::Word("госдумы".to_string())),
4221 },
4222 PositionalToken {
4223 source: uws,
4224 offset: 1109,
4225 length: 1,
4226 token: Token::Special(Special::Separator(Separator::Space)),
4227 },
4228 PositionalToken {
4229 source: uws,
4230 offset: 1110,
4231 length: 4,
4232 token: Token::Word(Word::Word("по".to_string())),
4233 },
4234 PositionalToken {
4235 source: uws,
4236 offset: 1114,
4237 length: 1,
4238 token: Token::Special(Special::Separator(Separator::Space)),
4239 },
4240 PositionalToken {
4241 source: uws,
4242 offset: 1115,
4243 length: 10,
4244 token: Token::Word(Word::Word("делам".to_string())),
4245 },
4246 PositionalToken {
4247 source: uws,
4248 offset: 1125,
4249 length: 1,
4250 token: Token::Special(Special::Separator(Separator::Space)),
4251 },
4252 PositionalToken {
4253 source: uws,
4254 offset: 1126,
4255 length: 12,
4256 token: Token::Word(Word::Word("женщин".to_string())),
4257 },
4258 PositionalToken {
4259 source: uws,
4260 offset: 1138,
4261 length: 1,
4262 token: Token::Special(Special::Punctuation(',')),
4263 },
4264 PositionalToken {
4265 source: uws,
4266 offset: 1139,
4267 length: 1,
4268 token: Token::Special(Special::Separator(Separator::Space)),
4269 },
4270 PositionalToken {
4271 source: uws,
4272 offset: 1140,
4273 length: 10,
4274 token: Token::Word(Word::Word("семьи".to_string())),
4275 },
4276 PositionalToken {
4277 source: uws,
4278 offset: 1150,
4279 length: 1,
4280 token: Token::Special(Special::Separator(Separator::Space)),
4281 },
4282 PositionalToken {
4283 source: uws,
4284 offset: 1151,
4285 length: 2,
4286 token: Token::Word(Word::Word("и".to_string())),
4287 },
4288 PositionalToken {
4289 source: uws,
4290 offset: 1153,
4291 length: 1,
4292 token: Token::Special(Special::Separator(Separator::Space)),
4293 },
4294 PositionalToken {
4295 source: uws,
4296 offset: 1154,
4297 length: 16,
4298 token: Token::Word(Word::Word("молодежи".to_string())),
4299 },
4300 PositionalToken {
4301 source: uws,
4302 offset: 1170,
4303 length: 1,
4304 token: Token::Special(Special::Separator(Separator::Space)),
4305 },
4306 PositionalToken {
4307 source: uws,
4308 offset: 1171,
4309 length: 16,
4310 token: Token::Word(Word::Word("выступил".to_string())),
4311 },
4312 PositionalToken {
4313 source: uws,
4314 offset: 1187,
4315 length: 1,
4316 token: Token::Special(Special::Separator(Separator::Space)),
4317 },
4318 PositionalToken {
4319 source: uws,
4320 offset: 1188,
4321 length: 2,
4322 token: Token::Word(Word::Word("с".to_string())),
4323 },
4324 PositionalToken {
4325 source: uws,
4326 offset: 1190,
4327 length: 1,
4328 token: Token::Special(Special::Separator(Separator::Space)),
4329 },
4330 PositionalToken {
4331 source: uws,
4332 offset: 1191,
4333 length: 24,
4334 token: Token::Word(Word::Word("предложением".to_string())),
4335 },
4336 PositionalToken {
4337 source: uws,
4338 offset: 1215,
4339 length: 1,
4340 token: Token::Special(Special::Separator(Separator::Space)),
4341 },
4342 PositionalToken {
4343 source: uws,
4344 offset: 1216,
4345 length: 16,
4346 token: Token::Word(Word::Word("учредить".to_string())),
4347 },
4348 PositionalToken {
4349 source: uws,
4350 offset: 1232,
4351 length: 1,
4352 token: Token::Special(Special::Separator(Separator::Space)),
4353 },
4354 PositionalToken {
4355 source: uws,
4356 offset: 1233,
4357 length: 2,
4358 token: Token::Special(Special::Punctuation('«')),
4359 },
4360 PositionalToken {
4361 source: uws,
4362 offset: 1235,
4363 length: 8,
4364 token: Token::Word(Word::Word("День".to_string())),
4365 },
4366 PositionalToken {
4367 source: uws,
4368 offset: 1243,
4369 length: 1,
4370 token: Token::Special(Special::Separator(Separator::Space)),
4371 },
4372 PositionalToken {
4373 source: uws,
4374 offset: 1244,
4375 length: 8,
4376 token: Token::Word(Word::Word("мамы".to_string())),
4377 },
4378 PositionalToken {
4379 source: uws,
4380 offset: 1252,
4381 length: 2,
4382 token: Token::Special(Special::Punctuation('»')),
4383 },
4384 PositionalToken {
4385 source: uws,
4386 offset: 1254,
4387 length: 1,
4388 token: Token::Special(Special::Punctuation(',')),
4389 },
4390 PositionalToken {
4391 source: uws,
4392 offset: 1255,
4393 length: 1,
4394 token: Token::Special(Special::Separator(Separator::Space)),
4395 },
4396 PositionalToken {
4397 source: uws,
4398 offset: 1256,
4399 length: 2,
4400 token: Token::Word(Word::Word("а".to_string())),
4401 },
4402 PositionalToken {
4403 source: uws,
4404 offset: 1258,
4405 length: 1,
4406 token: Token::Special(Special::Separator(Separator::Space)),
4407 },
4408 PositionalToken {
4409 source: uws,
4410 offset: 1259,
4411 length: 6,
4412 token: Token::Word(Word::Word("сам".to_string())),
4413 },
4414 PositionalToken {
4415 source: uws,
4416 offset: 1265,
4417 length: 1,
4418 token: Token::Special(Special::Separator(Separator::Space)),
4419 },
4420 PositionalToken {
4421 source: uws,
4422 offset: 1266,
4423 length: 12,
4424 token: Token::Word(Word::Word("приказ".to_string())),
4425 },
4426 PositionalToken {
4427 source: uws,
4428 offset: 1278,
4429 length: 1,
4430 token: Token::Special(Special::Separator(Separator::Space)),
4431 },
4432 PositionalToken {
4433 source: uws,
4434 offset: 1279,
4435 length: 6,
4436 token: Token::Word(Word::Word("был".to_string())),
4437 },
4438 PositionalToken {
4439 source: uws,
4440 offset: 1285,
4441 length: 1,
4442 token: Token::Special(Special::Separator(Separator::Space)),
4443 },
4444 PositionalToken {
4445 source: uws,
4446 offset: 1286,
4447 length: 16,
4448 token: Token::Word(Word::Word("подписан".to_string())),
4449 },
4450 PositionalToken {
4451 source: uws,
4452 offset: 1302,
4453 length: 1,
4454 token: Token::Special(Special::Separator(Separator::Space)),
4455 },
4456 PositionalToken {
4457 source: uws,
4458 offset: 1303,
4459 length: 6,
4460 token: Token::Word(Word::Word("уже".to_string())),
4461 },
4462 PositionalToken {
4463 source: uws,
4464 offset: 1309,
4465 length: 1,
4466 token: Token::Special(Special::Separator(Separator::Space)),
4467 },
4468 PositionalToken {
4469 source: uws,
4470 offset: 1310,
4471 length: 2,
4472 token: Token::Word(Word::Number(Number::Integer(30))),
4473 },
4474 PositionalToken {
4475 source: uws,
4476 offset: 1312,
4477 length: 1,
4478 token: Token::Special(Special::Separator(Separator::Space)),
4479 },
4480 PositionalToken {
4481 source: uws,
4482 offset: 1313,
4483 length: 12,
4484 token: Token::Word(Word::Word("января".to_string())),
4485 },
4486 PositionalToken {
4487 source: uws,
4488 offset: 1325,
4489 length: 1,
4490 token: Token::Special(Special::Separator(Separator::Space)),
4491 },
4492 PositionalToken {
4493 source: uws,
4494 offset: 1326,
4495 length: 4,
4496 token: Token::Word(Word::Number(Number::Integer(1988))),
4497 },
4498 PositionalToken {
4499 source: uws,
4500 offset: 1330,
4501 length: 1,
4502 token: Token::Special(Special::Separator(Separator::Space)),
4503 },
4504 PositionalToken {
4505 source: uws,
4506 offset: 1331,
4507 length: 8,
4508 token: Token::Word(Word::Word("года".to_string())),
4509 },
4510 PositionalToken {
4511 source: uws,
4512 offset: 1339,
4513 length: 1,
4514 token: Token::Special(Special::Separator(Separator::Space)),
4515 },
4516 PositionalToken {
4517 source: uws,
4518 offset: 1340,
4519 length: 14,
4520 token: Token::Word(Word::Word("Борисом".to_string())),
4521 },
4522 PositionalToken {
4523 source: uws,
4524 offset: 1354,
4525 length: 1,
4526 token: Token::Special(Special::Separator(Separator::Space)),
4527 },
4528 PositionalToken {
4529 source: uws,
4530 offset: 1355,
4531 length: 16,
4532 token: Token::Word(Word::Word("Ельциным".to_string())),
4533 },
4534 PositionalToken {
4535 source: uws,
4536 offset: 1371,
4537 length: 1,
4538 token: Token::Special(Special::Punctuation('.')),
4539 },
4540 PositionalToken {
4541 source: uws,
4542 offset: 1372,
4543 length: 1,
4544 token: Token::Special(Special::Separator(Separator::Space)),
4545 },
4546 PositionalToken {
4547 source: uws,
4548 offset: 1373,
4549 length: 8,
4550 token: Token::Word(Word::Word("Было".to_string())),
4551 },
4552 PositionalToken {
4553 source: uws,
4554 offset: 1381,
4555 length: 1,
4556 token: Token::Special(Special::Separator(Separator::Space)),
4557 },
4558 PositionalToken {
4559 source: uws,
4560 offset: 1382,
4561 length: 12,
4562 token: Token::Word(Word::Word("решено".to_string())),
4563 },
4564 PositionalToken {
4565 source: uws,
4566 offset: 1394,
4567 length: 1,
4568 token: Token::Special(Special::Punctuation(',')),
4569 },
4570 PositionalToken {
4571 source: uws,
4572 offset: 1395,
4573 length: 1,
4574 token: Token::Special(Special::Separator(Separator::Space)),
4575 },
4576 PositionalToken {
4577 source: uws,
4578 offset: 1396,
4579 length: 6,
4580 token: Token::Word(Word::Word("что".to_string())),
4581 },
4582 PositionalToken {
4583 source: uws,
4584 offset: 1402,
4585 length: 1,
4586 token: Token::Special(Special::Separator(Separator::Space)),
4587 },
4588 PositionalToken {
4589 source: uws,
4590 offset: 1403,
4591 length: 16,
4592 token: Token::Word(Word::Word("ежегодно".to_string())),
4593 },
4594 PositionalToken {
4595 source: uws,
4596 offset: 1419,
4597 length: 1,
4598 token: Token::Special(Special::Separator(Separator::Space)),
4599 },
4600 PositionalToken {
4601 source: uws,
4602 offset: 1420,
4603 length: 2,
4604 token: Token::Word(Word::Word("в".to_string())),
4605 },
4606 PositionalToken {
4607 source: uws,
4608 offset: 1422,
4609 length: 1,
4610 token: Token::Special(Special::Separator(Separator::Space)),
4611 },
4612 PositionalToken {
4613 source: uws,
4614 offset: 1423,
4615 length: 12,
4616 token: Token::Word(Word::Word("России".to_string())),
4617 },
4618 PositionalToken {
4619 source: uws,
4620 offset: 1435,
4621 length: 1,
4622 token: Token::Special(Special::Separator(Separator::Space)),
4623 },
4624 PositionalToken {
4625 source: uws,
4626 offset: 1436,
4627 length: 22,
4628 token: Token::Word(Word::Word("празднество".to_string())),
4629 },
4630 PositionalToken {
4631 source: uws,
4632 offset: 1458,
4633 length: 1,
4634 token: Token::Special(Special::Separator(Separator::Space)),
4635 },
4636 PositionalToken {
4637 source: uws,
4638 offset: 1459,
4639 length: 6,
4640 token: Token::Word(Word::Word("дня".to_string())),
4641 },
4642 PositionalToken {
4643 source: uws,
4644 offset: 1465,
4645 length: 1,
4646 token: Token::Special(Special::Separator(Separator::Space)),
4647 },
4648 PositionalToken {
4649 source: uws,
4650 offset: 1466,
4651 length: 8,
4652 token: Token::Word(Word::Word("мамы".to_string())),
4653 },
4654 PositionalToken {
4655 source: uws,
4656 offset: 1474,
4657 length: 1,
4658 token: Token::Special(Special::Separator(Separator::Space)),
4659 },
4660 PositionalToken {
4661 source: uws,
4662 offset: 1475,
4663 length: 10,
4664 token: Token::Word(Word::Word("будет".to_string())),
4665 },
4666 PositionalToken {
4667 source: uws,
4668 offset: 1485,
4669 length: 1,
4670 token: Token::Special(Special::Separator(Separator::Space)),
4671 },
4672 PositionalToken {
4673 source: uws,
4674 offset: 1486,
4675 length: 16,
4676 token: Token::Word(Word::Word("выпадать".to_string())),
4677 },
4678 PositionalToken {
4679 source: uws,
4680 offset: 1502,
4681 length: 1,
4682 token: Token::Special(Special::Separator(Separator::Space)),
4683 },
4684 PositionalToken {
4685 source: uws,
4686 offset: 1503,
4687 length: 4,
4688 token: Token::Word(Word::Word("на".to_string())),
4689 },
4690 PositionalToken {
4691 source: uws,
4692 offset: 1507,
4693 length: 1,
4694 token: Token::Special(Special::Separator(Separator::Space)),
4695 },
4696 PositionalToken {
4697 source: uws,
4698 offset: 1508,
4699 length: 18,
4700 token: Token::Word(Word::Word("последнее".to_string())),
4701 },
4702 PositionalToken {
4703 source: uws,
4704 offset: 1526,
4705 length: 1,
4706 token: Token::Special(Special::Separator(Separator::Space)),
4707 },
4708 PositionalToken {
4709 source: uws,
4710 offset: 1527,
4711 length: 22,
4712 token: Token::Word(Word::Word("воскресенье".to_string())),
4713 },
4714 PositionalToken {
4715 source: uws,
4716 offset: 1549,
4717 length: 1,
4718 token: Token::Special(Special::Separator(Separator::Space)),
4719 },
4720 PositionalToken {
4721 source: uws,
4722 offset: 1550,
4723 length: 12,
4724 token: Token::Word(Word::Word("ноября".to_string())),
4725 },
4726 PositionalToken {
4727 source: uws,
4728 offset: 1562,
4729 length: 1,
4730 token: Token::Special(Special::Punctuation('.')),
4731 },
4732 PositionalToken {
4733 source: uws,
4734 offset: 1563,
4735 length: 1,
4736 token: Token::Special(Special::Separator(Separator::Space)),
4737 },
4738 PositionalToken {
4739 source: uws,
4740 offset: 1664,
4741 length: 1,
4742 token: Token::Special(Special::Separator(Separator::Newline)),
4743 },
4744 PositionalToken {
4745 source: uws,
4746 offset: 1665,
4747 length: 2,
4748 token: Token::Special(Special::Separator(Separator::Space)),
4749 },
4750 PositionalToken {
4751 source: uws,
4752 offset: 1725,
4753 length: 1,
4754 token: Token::Special(Special::Separator(Separator::Newline)),
4755 },
4756 PositionalToken {
4757 source: uws,
4758 offset: 1726,
4759 length: 4,
4760 token: Token::Special(Special::Separator(Separator::Space)),
4761 },
4762 PositionalToken {
4763 source: uws,
4764 offset: 2725,
4765 length: 1,
4766 token: Token::Special(Special::Separator(Separator::Newline)),
4767 },
4768 PositionalToken {
4769 source: uws,
4770 offset: 2726,
4771 length: 2,
4772 token: Token::Special(Special::Separator(Separator::Space)),
4773 },
4774 PositionalToken {
4775 source: uws,
4776 offset: 2888,
4777 length: 1,
4778 token: Token::Special(Special::Separator(Separator::Newline)),
4779 },
4780 PositionalToken {
4781 source: uws,
4782 offset: 2889,
4783 length: 2,
4784 token: Token::Special(Special::Separator(Separator::Space)),
4785 },
4786 PositionalToken {
4787 source: uws,
4788 offset: 2891,
4789 length: 1,
4790 token: Token::Special(Special::Separator(Separator::Newline)),
4791 },
4792 PositionalToken {
4793 source: uws,
4794 offset: 2904,
4795 length: 1,
4796 token: Token::Special(Special::Separator(Separator::Newline)),
4797 },
4798 PositionalToken {
4799 source: uws,
4800 offset: 2905,
4801 length: 4,
4802 token: Token::Special(Special::Separator(Separator::Space)),
4803 },
4804 ];
4805
4806 let text = Text::new({
4807 uws.into_source()
4808 .pipe(tagger::Builder::new().create().into_breaker())
4809 .pipe(entities::Builder::new().create().into_piped())
4810 .into_separator()
4811 })
4812 .unwrap();
4813
4814 let lib_res = text
4815 .into_tokenizer(TokenizerParams::v1())
4816 .filter_map(|tt| tt.into_original_token_1())
4817 .collect::<Vec<_>>();
4818
4819 check_results(&result, &lib_res, uws);
4820 }
4821
4822 #[test]
4873 fn numerical_no_split() {
4874 let uws = "12.02.18 31.28.34 23.11.2018 123.568.365.234.578 127.0.0.1 1st 1кг 123123афываыв 12321фвафыов234выалфо 12_123_343.4234_4234";
4875 let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
4876 let result = vec![
4878 PositionalToken {
4879 source: uws,
4880 offset: 0,
4881 length: 8,
4882 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4883 "12.02.18".to_string(),
4884 ))),
4885 },
4886 PositionalToken {
4887 source: uws,
4888 offset: 8,
4889 length: 1,
4890 token: Token::Special(Special::Separator(Separator::Space)),
4891 },
4892 PositionalToken {
4893 source: uws,
4894 offset: 9,
4895 length: 8,
4896 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4897 "31.28.34".to_string(),
4898 ))),
4899 },
4900 PositionalToken {
4901 source: uws,
4902 offset: 17,
4903 length: 1,
4904 token: Token::Special(Special::Separator(Separator::Space)),
4905 },
4906 PositionalToken {
4907 source: uws,
4908 offset: 18,
4909 length: 10,
4910 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4911 "23.11.2018".to_string(),
4912 ))),
4913 },
4914 PositionalToken {
4915 source: uws,
4916 offset: 28,
4917 length: 1,
4918 token: Token::Special(Special::Separator(Separator::Space)),
4919 },
4920 PositionalToken {
4921 source: uws,
4922 offset: 29,
4923 length: 19,
4924 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4925 "123.568.365.234.578".to_string(),
4926 ))),
4927 },
4928 PositionalToken {
4929 source: uws,
4930 offset: 48,
4931 length: 1,
4932 token: Token::Special(Special::Separator(Separator::Space)),
4933 },
4934 PositionalToken {
4935 source: uws,
4936 offset: 49,
4937 length: 9,
4938 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
4939 "127.0.0.1".to_string(),
4940 ))),
4941 },
4942 PositionalToken {
4943 source: uws,
4944 offset: 58,
4945 length: 1,
4946 token: Token::Special(Special::Separator(Separator::Space)),
4947 },
4948 PositionalToken {
4949 source: uws,
4950 offset: 59,
4951 length: 3,
4952 token: Token::Word(Word::Numerical(Numerical::Measures("1st".to_string()))),
4953 },
4954 PositionalToken {
4955 source: uws,
4956 offset: 62,
4957 length: 1,
4958 token: Token::Special(Special::Separator(Separator::Space)),
4959 },
4960 PositionalToken {
4961 source: uws,
4962 offset: 63,
4963 length: 5,
4964 token: Token::Word(Word::Numerical(Numerical::Measures("1кг".to_string()))),
4965 },
4966 PositionalToken {
4967 source: uws,
4968 offset: 68,
4969 length: 1,
4970 token: Token::Special(Special::Separator(Separator::Space)),
4971 },
4972 PositionalToken {
4973 source: uws,
4974 offset: 69,
4975 length: 20,
4976 token: Token::Word(Word::Numerical(Numerical::Measures(
4977 "123123афываыв".to_string(),
4978 ))),
4979 },
4980 PositionalToken {
4981 source: uws,
4982 offset: 89,
4983 length: 1,
4984 token: Token::Special(Special::Separator(Separator::Space)),
4985 },
4986 PositionalToken {
4987 source: uws,
4988 offset: 90,
4989 length: 34,
4990 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
4991 "12321фвафыов234выалфо".to_string(),
4992 ))),
4993 },
4994 PositionalToken {
4995 source: uws,
4996 offset: 124,
4997 length: 1,
4998 token: Token::Special(Special::Separator(Separator::Space)),
4999 },
5000 PositionalToken {
5001 source: uws,
5002 offset: 125,
5003 length: 20,
5004 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
5005 "12_123_343.4234_4234".to_string(),
5006 ))),
5007 },
5008 ];
5009 check_results(&result, &lib_res, uws);
5010 }
5011
5012 #[test]
5013 fn numerical_default() {
5014 let uws = "12.02.18 31.28.34 23.11.2018 123.568.365.234.578 127.0.0.1 1st 1кг 123123афываыв 12321фвафыов234выалфо 12_123_343.4234_4234";
5015 let lib_res = uws
5016 .into_tokenizer(TokenizerParams::v1())
5017 .collect::<Vec<_>>();
5018 let result = vec![
5020 PositionalToken {
5021 source: uws,
5022 offset: 0,
5023 length: 2,
5024 token: Token::Word(Word::Number(Number::Integer(12))),
5025 },
5026 PositionalToken {
5027 source: uws,
5028 offset: 2,
5029 length: 1,
5030 token: Token::Special(Special::Punctuation('.')),
5031 },
5032 PositionalToken {
5033 source: uws,
5034 offset: 3,
5035 length: 2,
5036 token: Token::Word(Word::Number(Number::ZeroInteger {
5037 i: 2,
5038 s: "02".to_string(),
5039 })),
5040 },
5041 PositionalToken {
5042 source: uws,
5043 offset: 5,
5044 length: 1,
5045 token: Token::Special(Special::Punctuation('.')),
5046 },
5047 PositionalToken {
5048 source: uws,
5049 offset: 6,
5050 length: 2,
5051 token: Token::Word(Word::Number(Number::Integer(18))),
5052 },
5053 PositionalToken {
5054 source: uws,
5055 offset: 8,
5056 length: 1,
5057 token: Token::Special(Special::Separator(Separator::Space)),
5058 },
5059 PositionalToken {
5060 source: uws,
5061 offset: 9,
5062 length: 2,
5063 token: Token::Word(Word::Number(Number::Integer(31))),
5064 },
5065 PositionalToken {
5066 source: uws,
5067 offset: 11,
5068 length: 1,
5069 token: Token::Special(Special::Punctuation('.')),
5070 },
5071 PositionalToken {
5072 source: uws,
5073 offset: 12,
5074 length: 2,
5075 token: Token::Word(Word::Number(Number::Integer(28))),
5076 },
5077 PositionalToken {
5078 source: uws,
5079 offset: 14,
5080 length: 1,
5081 token: Token::Special(Special::Punctuation('.')),
5082 },
5083 PositionalToken {
5084 source: uws,
5085 offset: 15,
5086 length: 2,
5087 token: Token::Word(Word::Number(Number::Integer(34))),
5088 },
5089 PositionalToken {
5090 source: uws,
5091 offset: 17,
5092 length: 1,
5093 token: Token::Special(Special::Separator(Separator::Space)),
5094 },
5095 PositionalToken {
5096 source: uws,
5097 offset: 18,
5098 length: 2,
5099 token: Token::Word(Word::Number(Number::Integer(23))),
5100 },
5101 PositionalToken {
5102 source: uws,
5103 offset: 20,
5104 length: 1,
5105 token: Token::Special(Special::Punctuation('.')),
5106 },
5107 PositionalToken {
5108 source: uws,
5109 offset: 21,
5110 length: 2,
5111 token: Token::Word(Word::Number(Number::Integer(11))),
5112 },
5113 PositionalToken {
5114 source: uws,
5115 offset: 23,
5116 length: 1,
5117 token: Token::Special(Special::Punctuation('.')),
5118 },
5119 PositionalToken {
5120 source: uws,
5121 offset: 24,
5122 length: 4,
5123 token: Token::Word(Word::Number(Number::Integer(2018))),
5124 },
5125 PositionalToken {
5126 source: uws,
5127 offset: 28,
5128 length: 1,
5129 token: Token::Special(Special::Separator(Separator::Space)),
5130 },
5131 PositionalToken {
5132 source: uws,
5133 offset: 29,
5134 length: 3,
5135 token: Token::Word(Word::Number(Number::Integer(123))),
5136 },
5137 PositionalToken {
5138 source: uws,
5139 offset: 32,
5140 length: 1,
5141 token: Token::Special(Special::Punctuation('.')),
5142 },
5143 PositionalToken {
5144 source: uws,
5145 offset: 33,
5146 length: 3,
5147 token: Token::Word(Word::Number(Number::Integer(568))),
5148 },
5149 PositionalToken {
5150 source: uws,
5151 offset: 36,
5152 length: 1,
5153 token: Token::Special(Special::Punctuation('.')),
5154 },
5155 PositionalToken {
5156 source: uws,
5157 offset: 37,
5158 length: 3,
5159 token: Token::Word(Word::Number(Number::Integer(365))),
5160 },
5161 PositionalToken {
5162 source: uws,
5163 offset: 40,
5164 length: 1,
5165 token: Token::Special(Special::Punctuation('.')),
5166 },
5167 PositionalToken {
5168 source: uws,
5169 offset: 41,
5170 length: 3,
5171 token: Token::Word(Word::Number(Number::Integer(234))),
5172 },
5173 PositionalToken {
5174 source: uws,
5175 offset: 44,
5176 length: 1,
5177 token: Token::Special(Special::Punctuation('.')),
5178 },
5179 PositionalToken {
5180 source: uws,
5181 offset: 45,
5182 length: 3,
5183 token: Token::Word(Word::Number(Number::Integer(578))),
5184 },
5185 PositionalToken {
5186 source: uws,
5187 offset: 48,
5188 length: 1,
5189 token: Token::Special(Special::Separator(Separator::Space)),
5190 },
5191 PositionalToken {
5192 source: uws,
5193 offset: 49,
5194 length: 3,
5195 token: Token::Word(Word::Number(Number::Integer(127))),
5196 },
5197 PositionalToken {
5198 source: uws,
5199 offset: 52,
5200 length: 1,
5201 token: Token::Special(Special::Punctuation('.')),
5202 },
5203 PositionalToken {
5204 source: uws,
5205 offset: 53,
5206 length: 1,
5207 token: Token::Word(Word::Number(Number::ZeroInteger {
5208 i: 0,
5209 s: "0".to_string(),
5210 })),
5211 },
5212 PositionalToken {
5213 source: uws,
5214 offset: 54,
5215 length: 1,
5216 token: Token::Special(Special::Punctuation('.')),
5217 },
5218 PositionalToken {
5219 source: uws,
5220 offset: 55,
5221 length: 1,
5222 token: Token::Word(Word::Number(Number::ZeroInteger {
5223 i: 0,
5224 s: "0".to_string(),
5225 })),
5226 },
5227 PositionalToken {
5228 source: uws,
5229 offset: 56,
5230 length: 1,
5231 token: Token::Special(Special::Punctuation('.')),
5232 },
5233 PositionalToken {
5234 source: uws,
5235 offset: 57,
5236 length: 1,
5237 token: Token::Word(Word::Number(Number::Integer(1))),
5238 },
5239 PositionalToken {
5240 source: uws,
5241 offset: 58,
5242 length: 1,
5243 token: Token::Special(Special::Separator(Separator::Space)),
5244 },
5245 PositionalToken {
5246 source: uws,
5247 offset: 59,
5248 length: 3,
5249 token: Token::Word(Word::Numerical(Numerical::Measures("1st".to_string()))),
5250 },
5251 PositionalToken {
5252 source: uws,
5253 offset: 62,
5254 length: 1,
5255 token: Token::Special(Special::Separator(Separator::Space)),
5256 },
5257 PositionalToken {
5258 source: uws,
5259 offset: 63,
5260 length: 5,
5261 token: Token::Word(Word::Numerical(Numerical::Measures("1кг".to_string()))),
5262 },
5263 PositionalToken {
5264 source: uws,
5265 offset: 68,
5266 length: 1,
5267 token: Token::Special(Special::Separator(Separator::Space)),
5268 },
5269 PositionalToken {
5270 source: uws,
5271 offset: 69,
5272 length: 20,
5273 token: Token::Word(Word::Numerical(Numerical::Measures(
5274 "123123афываыв".to_string(),
5275 ))),
5276 },
5277 PositionalToken {
5278 source: uws,
5279 offset: 89,
5280 length: 1,
5281 token: Token::Special(Special::Separator(Separator::Space)),
5282 },
5283 PositionalToken {
5284 source: uws,
5285 offset: 90,
5286 length: 34,
5287 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
5288 "12321фвафыов234выалфо".to_string(),
5289 ))),
5290 },
5291 PositionalToken {
5292 source: uws,
5293 offset: 124,
5294 length: 1,
5295 token: Token::Special(Special::Separator(Separator::Space)),
5296 },
5297 PositionalToken {
5298 source: uws,
5299 offset: 125,
5300 length: 2,
5301 token: Token::Word(Word::Number(Number::Integer(12))),
5302 },
5303 PositionalToken {
5304 source: uws,
5305 offset: 127,
5306 length: 1,
5307 token: Token::Special(Special::Punctuation('_')),
5308 },
5309 PositionalToken {
5310 source: uws,
5311 offset: 128,
5312 length: 3,
5313 token: Token::Word(Word::Number(Number::Integer(123))),
5314 },
5315 PositionalToken {
5316 source: uws,
5317 offset: 131,
5318 length: 1,
5319 token: Token::Special(Special::Punctuation('_')),
5320 },
5321 PositionalToken {
5322 source: uws,
5323 offset: 132,
5324 length: 3,
5325 token: Token::Word(Word::Number(Number::Integer(343))),
5326 },
5327 PositionalToken {
5328 source: uws,
5329 offset: 135,
5330 length: 1,
5331 token: Token::Special(Special::Punctuation('.')),
5332 },
5333 PositionalToken {
5334 source: uws,
5335 offset: 136,
5336 length: 4,
5337 token: Token::Word(Word::Number(Number::Integer(4234))),
5338 },
5339 PositionalToken {
5340 source: uws,
5341 offset: 140,
5342 length: 1,
5343 token: Token::Special(Special::Punctuation('_')),
5344 },
5345 PositionalToken {
5346 source: uws,
5347 offset: 141,
5348 length: 4,
5349 token: Token::Word(Word::Number(Number::Integer(4234))),
5350 },
5351 ];
5352 check_results(&result, &lib_res, uws);
5353 }
5354
    /// Language of a tokenizer regression fixture; selects which text/token
    /// pair `get_lang_test` returns.
    enum Lang {
        Zho, // Chinese
        Jpn, // Japanese
        Kor, // Korean
        Ara, // Arabic-script sample
        Ell, // Greek
    }
5374
5375 #[test]
5376 fn test_lang_zho() {
5377 let (uws, result) = get_lang_test(Lang::Zho);
5378 let lib_res = uws
5379 .into_tokenizer(TokenizerParams::v1())
5380 .collect::<Vec<_>>();
5381 check_results(&result, &lib_res, &uws);
5382 }
5383
5384 #[test]
5385 fn test_lang_jpn() {
5386 let (uws, result) = get_lang_test(Lang::Jpn);
5387 let lib_res = uws
5388 .into_tokenizer(TokenizerParams::v1())
5389 .collect::<Vec<_>>();
5390 check_results(&result, &lib_res, &uws);
5391 }
5392
5393 #[test]
5394 fn test_lang_kor() {
5395 let (uws, result) = get_lang_test(Lang::Kor);
5396 let lib_res = uws
5397 .into_tokenizer(TokenizerParams::v1())
5398 .collect::<Vec<_>>();
5399 check_results(&result, &lib_res, &uws);
5400 }
5401
5402 #[test]
5403 fn test_lang_ara() {
5404 let (uws, result) = get_lang_test(Lang::Ara);
5405 let lib_res = uws
5406 .into_tokenizer(TokenizerParams::v1())
5407 .collect::<Vec<_>>();
5408 check_results(&result, &lib_res, &uws);
5409 }
5410
5411 #[test]
5412 fn test_lang_ell() {
5413 let (uws, result) = get_lang_test(Lang::Ell);
5414 let lib_res = uws
5415 .into_tokenizer(TokenizerParams::v1())
5416 .collect::<Vec<_>>();
5417 check_results(&result, &lib_res, &uws);
5418 }
5419
5420 fn get_lang_test(lng: Lang) -> (String, Vec<PositionalToken>) {
5421 let uws = match lng {
5422 Lang::Zho => {
5423 "美国电视连续剧《超人前传》的第一集《试播集》于2001年10月16日在電視網首播,剧集主创人阿尔弗雷德·高夫和迈尔斯·米勒編劇,大卫·努特尔执导。这一试播首次向观众引荐了克拉克·肯特一角,他是位拥有超能力的外星孤儿,与家人和朋友一起在堪薩斯州虚构小镇斯莫维尔生活。在这一集里,肯特首度得知自己的来历,同时还需要阻止一位学生试图杀死镇上高中多名学生的报复之举。本集节目里引入了多个之后将贯穿全季甚至整部剧集的主题元素,例如几位主要角色之间的三角恋情。电视剧在加拿大溫哥華取景,旨在选用其“美国中产阶级”景观,主创人花了5个月的时间专门用于为主角物色合适的演员。试播集在所有演员选好4天后正式开拍。由于时间上的限制,剧组无法搭建好实体外景,因此只能使用计算机绘图技术将数字化的外景插入到镜头中。节目一经上映就打破了电视网的多项收视纪录,并且获得了评论员的普遍好评和多个奖项提名,并在其中两项上胜出"
5424 }
5425 Lang::Kor => {
5426 "플레이스테이션 은 소니 컴퓨터 엔터테인먼트가 개발한 세 번째 가정용 게임기이다. 마이크로소프트의 엑스박스 360, 닌텐도의 Wii와 경쟁하고 있다. 이전 제품에서 온라인 플레이 기능을 비디오 게임 개발사에 전적으로 의존하던 것과 달리 통합 온라인 게임 서비스인 플레이스테이션 네트워크 서비스를 발매와 함께 시작해 제공하고 있으며, 탄탄한 멀티미디어 재생 기능, 플레이스테이션 포터블과의 연결, 고화질 광학 디스크 포맷인 블루레이 디스크 재생 기능 등의 기능을 갖추고 있다. 2006년 11월 11일에 일본에서 처음으로 출시했으며, 11월 17일에는 북미 지역, 2007년 3월 23일에는 유럽과 오세아니아 지역에서, 대한민국의 경우 6월 5일부터 일주일간 예약판매를 실시해, 매일 준비한 수량이 동이 나는 등 많은 관심을 받았으며 6월 16일에 정식 출시 행사를 열었다"
5427 }
5428 Lang::Jpn => {
5429 "熊野三山本願所は、15世紀末以降における熊野三山(熊野本宮、熊野新宮、熊野那智)の造営・修造のための勧進を担った組織の総称。 熊野三山を含めて、日本における古代から中世前半にかけての寺社の造営は、寺社領経営のような恒常的財源、幕府や朝廷などからの一時的な造営料所の寄進、あるいは公権力からの臨時の保護によって行われていた。しかしながら、熊野三山では、これらの財源はすべて15世紀半ばまでに実効性を失った"
5430 }
5431 Lang::Ara => {
5432 "لشکرکشیهای روسهای وارنگی به دریای خزر مجموعهای از حملات نظامی در بین سالهای ۸۶۴ تا ۱۰۴۱ میلادی به سواحل دریای خزر بودهاست. روسهای وارنگی ابتدا در قرن نهم میلادی به عنوان بازرگانان پوست، عسل و برده در سرزمینهای اسلامی(سرکلند) ظاهر شدند. این بازرگانان در مسیر تجاری ولگا به خرید و فروش میپرداختند. نخستین حملهٔ آنان در فاصله سالهای ۸۶۴ تا ۸۸۴ میلادی در مقیاسی کوچک علیه علویان طبرستان رخ داد. نخستین یورش بزرگ روسها در سال ۹۱۳ رخ داد و آنان با ۵۰۰ فروند درازکشتی شهر گرگان و اطراف آن را غارت کردند. آنها در این حمله مقداری کالا و برده را به تاراج بردند و در راه بازگشتن به سمت شمال، در دلتای ولگا، مورد حملهٔ خزرهای مسلمان قرار گرفتند و بعضی از آنان موفق به فرار شدند، ولی در میانهٔ ولگا به قتل رسیدند. دومین هجوم بزرگ روسها به دریای خزر در سال ۹۴۳ به وقوع پیوست. در این دوره ایگور یکم، حاکم روس کیف، رهبری روسها را در دست داشت. روسها پس از توافق با دولت خزرها برای عبور امن از منطقه، تا رود کورا و اعماق قفقاز پیش رفتند و در سال ۹۴۳ موفق شدند بندر بردعه، پایتخت اران (جمهوری آذربایجان کنونی)، را تصرف کنند. روسها در آنجا به مدت چند ماه ماندند و بسیاری از ساکنان شهر را کشتند و از راه غارتگری اموالی را به تاراج بردند. تنها دلیل بازگشت آنان "
5433 }
5434 Lang::Ell => {
5435 "Το Πρόγραμμα υλοποιείται εξ ολοκλήρου από απόσταση και μπορεί να συμμετέχει κάθε εμπλεκόμενος στη ή/και ενδιαφερόμενος για τη διδασκαλία της Ελληνικής ως δεύτερης/ξένης γλώσσας στην Ελλάδα και στο εξωτερικό, αρκεί να είναι απόφοιτος ελληνικής φιλολογίας, ξένων φιλολογιών, παιδαγωγικών τμημάτων, θεολογικών σχολών ή άλλων πανεπιστημιακών τμημάτων ελληνικών ή ισότιμων ξένων πανεπιστημίων. Υπό όρους γίνονται δεκτοί υποψήφιοι που δεν έχουν ολοκληρώσει σπουδές τριτοβάθμιας εκπαίδευσης."
5436 }
5437 };
5438 let tokens = match lng {
5439 Lang::Zho => vec![
5440 PositionalToken {
5441 source: uws,
5442 offset: 0,
5443 length: 3,
5444 token: Token::Word(Word::Word("美".to_string())),
5445 },
5446 PositionalToken {
5447 source: uws,
5448 offset: 3,
5449 length: 3,
5450 token: Token::Word(Word::Word("国".to_string())),
5451 },
5452 PositionalToken {
5453 source: uws,
5454 offset: 6,
5455 length: 3,
5456 token: Token::Word(Word::Word("电".to_string())),
5457 },
5458 PositionalToken {
5459 source: uws,
5460 offset: 9,
5461 length: 3,
5462 token: Token::Word(Word::Word("视".to_string())),
5463 },
5464 PositionalToken {
5465 source: uws,
5466 offset: 12,
5467 length: 3,
5468 token: Token::Word(Word::Word("连".to_string())),
5469 },
5470 PositionalToken {
5471 source: uws,
5472 offset: 15,
5473 length: 3,
5474 token: Token::Word(Word::Word("续".to_string())),
5475 },
5476 PositionalToken {
5477 source: uws,
5478 offset: 18,
5479 length: 3,
5480 token: Token::Word(Word::Word("剧".to_string())),
5481 },
5482 PositionalToken {
5483 source: uws,
5484 offset: 21,
5485 length: 3,
5486 token: Token::Special(Special::Punctuation('《')),
5487 },
5488 PositionalToken {
5489 source: uws,
5490 offset: 24,
5491 length: 3,
5492 token: Token::Word(Word::Word("超".to_string())),
5493 },
5494 PositionalToken {
5495 source: uws,
5496 offset: 27,
5497 length: 3,
5498 token: Token::Word(Word::Word("人".to_string())),
5499 },
5500 PositionalToken {
5501 source: uws,
5502 offset: 30,
5503 length: 3,
5504 token: Token::Word(Word::Word("前".to_string())),
5505 },
5506 PositionalToken {
5507 source: uws,
5508 offset: 33,
5509 length: 3,
5510 token: Token::Word(Word::Word("传".to_string())),
5511 },
5512 PositionalToken {
5513 source: uws,
5514 offset: 36,
5515 length: 3,
5516 token: Token::Special(Special::Punctuation('》')),
5517 },
5518 PositionalToken {
5519 source: uws,
5520 offset: 39,
5521 length: 3,
5522 token: Token::Word(Word::Word("的".to_string())),
5523 },
5524 PositionalToken {
5525 source: uws,
5526 offset: 42,
5527 length: 3,
5528 token: Token::Word(Word::Word("第".to_string())),
5529 },
5530 PositionalToken {
5531 source: uws,
5532 offset: 45,
5533 length: 3,
5534 token: Token::Word(Word::Word("一".to_string())),
5535 },
5536 PositionalToken {
5537 source: uws,
5538 offset: 48,
5539 length: 3,
5540 token: Token::Word(Word::Word("集".to_string())),
5541 },
5542 PositionalToken {
5543 source: uws,
5544 offset: 51,
5545 length: 3,
5546 token: Token::Special(Special::Punctuation('《')),
5547 },
5548 PositionalToken {
5549 source: uws,
5550 offset: 54,
5551 length: 3,
5552 token: Token::Word(Word::Word("试".to_string())),
5553 },
5554 PositionalToken {
5555 source: uws,
5556 offset: 57,
5557 length: 3,
5558 token: Token::Word(Word::Word("播".to_string())),
5559 },
5560 PositionalToken {
5561 source: uws,
5562 offset: 60,
5563 length: 3,
5564 token: Token::Word(Word::Word("集".to_string())),
5565 },
5566 PositionalToken {
5567 source: uws,
5568 offset: 63,
5569 length: 3,
5570 token: Token::Special(Special::Punctuation('》')),
5571 },
5572 PositionalToken {
5573 source: uws,
5574 offset: 66,
5575 length: 3,
5576 token: Token::Word(Word::Word("于".to_string())),
5577 },
5578 PositionalToken {
5579 source: uws,
5580 offset: 69,
5581 length: 4,
5582 token: Token::Word(Word::Number(Number::Integer(2001))),
5583 },
5584 PositionalToken {
5585 source: uws,
5586 offset: 73,
5587 length: 3,
5588 token: Token::Word(Word::Word("年".to_string())),
5589 },
5590 PositionalToken {
5591 source: uws,
5592 offset: 76,
5593 length: 2,
5594 token: Token::Word(Word::Number(Number::Integer(10))),
5595 },
5596 PositionalToken {
5597 source: uws,
5598 offset: 78,
5599 length: 3,
5600 token: Token::Word(Word::Word("月".to_string())),
5601 },
5602 PositionalToken {
5603 source: uws,
5604 offset: 81,
5605 length: 2,
5606 token: Token::Word(Word::Number(Number::Integer(16))),
5607 },
5608 PositionalToken {
5609 source: uws,
5610 offset: 83,
5611 length: 3,
5612 token: Token::Word(Word::Word("日".to_string())),
5613 },
5614 PositionalToken {
5615 source: uws,
5616 offset: 86,
5617 length: 3,
5618 token: Token::Word(Word::Word("在".to_string())),
5619 },
5620 PositionalToken {
5621 source: uws,
5622 offset: 89,
5623 length: 3,
5624 token: Token::Word(Word::Word("電".to_string())),
5625 },
5626 PositionalToken {
5627 source: uws,
5628 offset: 92,
5629 length: 3,
5630 token: Token::Word(Word::Word("視".to_string())),
5631 },
5632 PositionalToken {
5633 source: uws,
5634 offset: 95,
5635 length: 3,
5636 token: Token::Word(Word::Word("網".to_string())),
5637 },
5638 PositionalToken {
5639 source: uws,
5640 offset: 98,
5641 length: 3,
5642 token: Token::Word(Word::Word("首".to_string())),
5643 },
5644 PositionalToken {
5645 source: uws,
5646 offset: 101,
5647 length: 3,
5648 token: Token::Word(Word::Word("播".to_string())),
5649 },
5650 PositionalToken {
5651 source: uws,
5652 offset: 104,
5653 length: 3,
5654 token: Token::Special(Special::Punctuation(',')),
5655 },
5656 PositionalToken {
5657 source: uws,
5658 offset: 107,
5659 length: 3,
5660 token: Token::Word(Word::Word("剧".to_string())),
5661 },
5662 PositionalToken {
5663 source: uws,
5664 offset: 110,
5665 length: 3,
5666 token: Token::Word(Word::Word("集".to_string())),
5667 },
5668 PositionalToken {
5669 source: uws,
5670 offset: 113,
5671 length: 3,
5672 token: Token::Word(Word::Word("主".to_string())),
5673 },
5674 PositionalToken {
5675 source: uws,
5676 offset: 116,
5677 length: 3,
5678 token: Token::Word(Word::Word("创".to_string())),
5679 },
5680 PositionalToken {
5681 source: uws,
5682 offset: 119,
5683 length: 3,
5684 token: Token::Word(Word::Word("人".to_string())),
5685 },
5686 PositionalToken {
5687 source: uws,
5688 offset: 122,
5689 length: 3,
5690 token: Token::Word(Word::Word("阿".to_string())),
5691 },
5692 PositionalToken {
5693 source: uws,
5694 offset: 125,
5695 length: 3,
5696 token: Token::Word(Word::Word("尔".to_string())),
5697 },
5698 PositionalToken {
5699 source: uws,
5700 offset: 128,
5701 length: 3,
5702 token: Token::Word(Word::Word("弗".to_string())),
5703 },
5704 PositionalToken {
5705 source: uws,
5706 offset: 131,
5707 length: 3,
5708 token: Token::Word(Word::Word("雷".to_string())),
5709 },
5710 PositionalToken {
5711 source: uws,
5712 offset: 134,
5713 length: 3,
5714 token: Token::Word(Word::Word("德".to_string())),
5715 },
5716 PositionalToken {
5717 source: uws,
5718 offset: 137,
5719 length: 2,
5720 token: Token::Special(Special::Punctuation('·')),
5721 },
5722 PositionalToken {
5723 source: uws,
5724 offset: 139,
5725 length: 3,
5726 token: Token::Word(Word::Word("高".to_string())),
5727 },
5728 PositionalToken {
5729 source: uws,
5730 offset: 142,
5731 length: 3,
5732 token: Token::Word(Word::Word("夫".to_string())),
5733 },
5734 PositionalToken {
5735 source: uws,
5736 offset: 145,
5737 length: 3,
5738 token: Token::Word(Word::Word("和".to_string())),
5739 },
5740 PositionalToken {
5741 source: uws,
5742 offset: 148,
5743 length: 3,
5744 token: Token::Word(Word::Word("迈".to_string())),
5745 },
5746 PositionalToken {
5747 source: uws,
5748 offset: 151,
5749 length: 3,
5750 token: Token::Word(Word::Word("尔".to_string())),
5751 },
5752 PositionalToken {
5753 source: uws,
5754 offset: 154,
5755 length: 3,
5756 token: Token::Word(Word::Word("斯".to_string())),
5757 },
5758 PositionalToken {
5759 source: uws,
5760 offset: 157,
5761 length: 2,
5762 token: Token::Special(Special::Punctuation('·')),
5763 },
5764 PositionalToken {
5765 source: uws,
5766 offset: 159,
5767 length: 3,
5768 token: Token::Word(Word::Word("米".to_string())),
5769 },
5770 PositionalToken {
5771 source: uws,
5772 offset: 162,
5773 length: 3,
5774 token: Token::Word(Word::Word("勒".to_string())),
5775 },
5776 PositionalToken {
5777 source: uws,
5778 offset: 165,
5779 length: 3,
5780 token: Token::Word(Word::Word("編".to_string())),
5781 },
5782 PositionalToken {
5783 source: uws,
5784 offset: 168,
5785 length: 3,
5786 token: Token::Word(Word::Word("劇".to_string())),
5787 },
5788 PositionalToken {
5789 source: uws,
5790 offset: 171,
5791 length: 3,
5792 token: Token::Special(Special::Punctuation(',')),
5793 },
5794 PositionalToken {
5795 source: uws,
5796 offset: 174,
5797 length: 3,
5798 token: Token::Word(Word::Word("大".to_string())),
5799 },
5800 PositionalToken {
5801 source: uws,
5802 offset: 177,
5803 length: 3,
5804 token: Token::Word(Word::Word("卫".to_string())),
5805 },
5806 PositionalToken {
5807 source: uws,
5808 offset: 180,
5809 length: 2,
5810 token: Token::Special(Special::Punctuation('·')),
5811 },
5812 PositionalToken {
5813 source: uws,
5814 offset: 182,
5815 length: 3,
5816 token: Token::Word(Word::Word("努".to_string())),
5817 },
5818 PositionalToken {
5819 source: uws,
5820 offset: 185,
5821 length: 3,
5822 token: Token::Word(Word::Word("特".to_string())),
5823 },
5824 PositionalToken {
5825 source: uws,
5826 offset: 188,
5827 length: 3,
5828 token: Token::Word(Word::Word("尔".to_string())),
5829 },
5830 PositionalToken {
5831 source: uws,
5832 offset: 191,
5833 length: 3,
5834 token: Token::Word(Word::Word("执".to_string())),
5835 },
5836 PositionalToken {
5837 source: uws,
5838 offset: 194,
5839 length: 3,
5840 token: Token::Word(Word::Word("导".to_string())),
5841 },
5842 PositionalToken {
5843 source: uws,
5844 offset: 197,
5845 length: 3,
5846 token: Token::Special(Special::Punctuation('。')),
5847 },
5848 PositionalToken {
5849 source: uws,
5850 offset: 200,
5851 length: 3,
5852 token: Token::Word(Word::Word("这".to_string())),
5853 },
5854 PositionalToken {
5855 source: uws,
5856 offset: 203,
5857 length: 3,
5858 token: Token::Word(Word::Word("一".to_string())),
5859 },
5860 PositionalToken {
5861 source: uws,
5862 offset: 206,
5863 length: 3,
5864 token: Token::Word(Word::Word("试".to_string())),
5865 },
5866 PositionalToken {
5867 source: uws,
5868 offset: 209,
5869 length: 3,
5870 token: Token::Word(Word::Word("播".to_string())),
5871 },
5872 PositionalToken {
5873 source: uws,
5874 offset: 212,
5875 length: 3,
5876 token: Token::Word(Word::Word("首".to_string())),
5877 },
5878 PositionalToken {
5879 source: uws,
5880 offset: 215,
5881 length: 3,
5882 token: Token::Word(Word::Word("次".to_string())),
5883 },
5884 PositionalToken {
5885 source: uws,
5886 offset: 218,
5887 length: 3,
5888 token: Token::Word(Word::Word("向".to_string())),
5889 },
5890 PositionalToken {
5891 source: uws,
5892 offset: 221,
5893 length: 3,
5894 token: Token::Word(Word::Word("观".to_string())),
5895 },
5896 PositionalToken {
5897 source: uws,
5898 offset: 224,
5899 length: 3,
5900 token: Token::Word(Word::Word("众".to_string())),
5901 },
5902 PositionalToken {
5903 source: uws,
5904 offset: 227,
5905 length: 3,
5906 token: Token::Word(Word::Word("引".to_string())),
5907 },
5908 PositionalToken {
5909 source: uws,
5910 offset: 230,
5911 length: 3,
5912 token: Token::Word(Word::Word("荐".to_string())),
5913 },
5914 PositionalToken {
5915 source: uws,
5916 offset: 233,
5917 length: 3,
5918 token: Token::Word(Word::Word("了".to_string())),
5919 },
5920 PositionalToken {
5921 source: uws,
5922 offset: 236,
5923 length: 3,
5924 token: Token::Word(Word::Word("克".to_string())),
5925 },
5926 PositionalToken {
5927 source: uws,
5928 offset: 239,
5929 length: 3,
5930 token: Token::Word(Word::Word("拉".to_string())),
5931 },
5932 PositionalToken {
5933 source: uws,
5934 offset: 242,
5935 length: 3,
5936 token: Token::Word(Word::Word("克".to_string())),
5937 },
5938 PositionalToken {
5939 source: uws,
5940 offset: 245,
5941 length: 2,
5942 token: Token::Special(Special::Punctuation('·')),
5943 },
5944 PositionalToken {
5945 source: uws,
5946 offset: 247,
5947 length: 3,
5948 token: Token::Word(Word::Word("肯".to_string())),
5949 },
5950 PositionalToken {
5951 source: uws,
5952 offset: 250,
5953 length: 3,
5954 token: Token::Word(Word::Word("特".to_string())),
5955 },
5956 PositionalToken {
5957 source: uws,
5958 offset: 253,
5959 length: 3,
5960 token: Token::Word(Word::Word("一".to_string())),
5961 },
5962 PositionalToken {
5963 source: uws,
5964 offset: 256,
5965 length: 3,
5966 token: Token::Word(Word::Word("角".to_string())),
5967 },
5968 PositionalToken {
5969 source: uws,
5970 offset: 259,
5971 length: 3,
5972 token: Token::Special(Special::Punctuation(',')),
5973 },
5974 PositionalToken {
5975 source: uws,
5976 offset: 262,
5977 length: 3,
5978 token: Token::Word(Word::Word("他".to_string())),
5979 },
5980 PositionalToken {
5981 source: uws,
5982 offset: 265,
5983 length: 3,
5984 token: Token::Word(Word::Word("是".to_string())),
5985 },
5986 PositionalToken {
5987 source: uws,
5988 offset: 268,
5989 length: 3,
5990 token: Token::Word(Word::Word("位".to_string())),
5991 },
5992 PositionalToken {
5993 source: uws,
5994 offset: 271,
5995 length: 3,
5996 token: Token::Word(Word::Word("拥".to_string())),
5997 },
5998 PositionalToken {
5999 source: uws,
6000 offset: 274,
6001 length: 3,
6002 token: Token::Word(Word::Word("有".to_string())),
6003 },
6004 PositionalToken {
6005 source: uws,
6006 offset: 277,
6007 length: 3,
6008 token: Token::Word(Word::Word("超".to_string())),
6009 },
6010 ],
6011 Lang::Jpn => vec![
6012 PositionalToken {
6013 source: uws,
6014 offset: 0,
6015 length: 3,
6016 token: Token::Word(Word::Word("熊".to_string())),
6017 },
6018 PositionalToken {
6019 source: uws,
6020 offset: 3,
6021 length: 3,
6022 token: Token::Word(Word::Word("野".to_string())),
6023 },
6024 PositionalToken {
6025 source: uws,
6026 offset: 6,
6027 length: 3,
6028 token: Token::Word(Word::Word("三".to_string())),
6029 },
6030 PositionalToken {
6031 source: uws,
6032 offset: 9,
6033 length: 3,
6034 token: Token::Word(Word::Word("山".to_string())),
6035 },
6036 PositionalToken {
6037 source: uws,
6038 offset: 12,
6039 length: 3,
6040 token: Token::Word(Word::Word("本".to_string())),
6041 },
6042 PositionalToken {
6043 source: uws,
6044 offset: 15,
6045 length: 3,
6046 token: Token::Word(Word::Word("願".to_string())),
6047 },
6048 PositionalToken {
6049 source: uws,
6050 offset: 18,
6051 length: 3,
6052 token: Token::Word(Word::Word("所".to_string())),
6053 },
6054 PositionalToken {
6055 source: uws,
6056 offset: 21,
6057 length: 3,
6058 token: Token::Word(Word::Word("は".to_string())),
6059 },
6060 PositionalToken {
6061 source: uws,
6062 offset: 24,
6063 length: 3,
6064 token: Token::Special(Special::Punctuation('、')),
6065 },
6066 PositionalToken {
6067 source: uws,
6068 offset: 27,
6069 length: 2,
6070 token: Token::Word(Word::Number(Number::Integer(15))),
6071 },
6072 PositionalToken {
6073 source: uws,
6074 offset: 29,
6075 length: 3,
6076 token: Token::Word(Word::Word("世".to_string())),
6077 },
6078 PositionalToken {
6079 source: uws,
6080 offset: 32,
6081 length: 3,
6082 token: Token::Word(Word::Word("紀".to_string())),
6083 },
6084 PositionalToken {
6085 source: uws,
6086 offset: 35,
6087 length: 3,
6088 token: Token::Word(Word::Word("末".to_string())),
6089 },
6090 PositionalToken {
6091 source: uws,
6092 offset: 38,
6093 length: 3,
6094 token: Token::Word(Word::Word("以".to_string())),
6095 },
6096 PositionalToken {
6097 source: uws,
6098 offset: 41,
6099 length: 3,
6100 token: Token::Word(Word::Word("降".to_string())),
6101 },
6102 PositionalToken {
6103 source: uws,
6104 offset: 44,
6105 length: 3,
6106 token: Token::Word(Word::Word("に".to_string())),
6107 },
6108 PositionalToken {
6109 source: uws,
6110 offset: 47,
6111 length: 3,
6112 token: Token::Word(Word::Word("お".to_string())),
6113 },
6114 PositionalToken {
6115 source: uws,
6116 offset: 50,
6117 length: 3,
6118 token: Token::Word(Word::Word("け".to_string())),
6119 },
6120 PositionalToken {
6121 source: uws,
6122 offset: 53,
6123 length: 3,
6124 token: Token::Word(Word::Word("る".to_string())),
6125 },
6126 PositionalToken {
6127 source: uws,
6128 offset: 56,
6129 length: 3,
6130 token: Token::Word(Word::Word("熊".to_string())),
6131 },
6132 PositionalToken {
6133 source: uws,
6134 offset: 59,
6135 length: 3,
6136 token: Token::Word(Word::Word("野".to_string())),
6137 },
6138 PositionalToken {
6139 source: uws,
6140 offset: 62,
6141 length: 3,
6142 token: Token::Word(Word::Word("三".to_string())),
6143 },
6144 PositionalToken {
6145 source: uws,
6146 offset: 65,
6147 length: 3,
6148 token: Token::Word(Word::Word("山".to_string())),
6149 },
6150 PositionalToken {
6151 source: uws,
6152 offset: 68,
6153 length: 3,
6154 token: Token::Special(Special::Punctuation('(')),
6155 },
6156 PositionalToken {
6157 source: uws,
6158 offset: 71,
6159 length: 3,
6160 token: Token::Word(Word::Word("熊".to_string())),
6161 },
6162 PositionalToken {
6163 source: uws,
6164 offset: 74,
6165 length: 3,
6166 token: Token::Word(Word::Word("野".to_string())),
6167 },
6168 PositionalToken {
6169 source: uws,
6170 offset: 77,
6171 length: 3,
6172 token: Token::Word(Word::Word("本".to_string())),
6173 },
6174 PositionalToken {
6175 source: uws,
6176 offset: 80,
6177 length: 3,
6178 token: Token::Word(Word::Word("宮".to_string())),
6179 },
6180 PositionalToken {
6181 source: uws,
6182 offset: 83,
6183 length: 3,
6184 token: Token::Special(Special::Punctuation('、')),
6185 },
6186 PositionalToken {
6187 source: uws,
6188 offset: 86,
6189 length: 3,
6190 token: Token::Word(Word::Word("熊".to_string())),
6191 },
6192 PositionalToken {
6193 source: uws,
6194 offset: 89,
6195 length: 3,
6196 token: Token::Word(Word::Word("野".to_string())),
6197 },
6198 PositionalToken {
6199 source: uws,
6200 offset: 92,
6201 length: 3,
6202 token: Token::Word(Word::Word("新".to_string())),
6203 },
6204 PositionalToken {
6205 source: uws,
6206 offset: 95,
6207 length: 3,
6208 token: Token::Word(Word::Word("宮".to_string())),
6209 },
6210 PositionalToken {
6211 source: uws,
6212 offset: 98,
6213 length: 3,
6214 token: Token::Special(Special::Punctuation('、')),
6215 },
6216 PositionalToken {
6217 source: uws,
6218 offset: 101,
6219 length: 3,
6220 token: Token::Word(Word::Word("熊".to_string())),
6221 },
6222 PositionalToken {
6223 source: uws,
6224 offset: 104,
6225 length: 3,
6226 token: Token::Word(Word::Word("野".to_string())),
6227 },
6228 PositionalToken {
6229 source: uws,
6230 offset: 107,
6231 length: 3,
6232 token: Token::Word(Word::Word("那".to_string())),
6233 },
6234 PositionalToken {
6235 source: uws,
6236 offset: 110,
6237 length: 3,
6238 token: Token::Word(Word::Word("智".to_string())),
6239 },
6240 PositionalToken {
6241 source: uws,
6242 offset: 113,
6243 length: 3,
6244 token: Token::Special(Special::Punctuation(')')),
6245 },
6246 PositionalToken {
6247 source: uws,
6248 offset: 116,
6249 length: 3,
6250 token: Token::Word(Word::Word("の".to_string())),
6251 },
6252 PositionalToken {
6253 source: uws,
6254 offset: 119,
6255 length: 3,
6256 token: Token::Word(Word::Word("造".to_string())),
6257 },
6258 PositionalToken {
6259 source: uws,
6260 offset: 122,
6261 length: 3,
6262 token: Token::Word(Word::Word("営".to_string())),
6263 },
6264 PositionalToken {
6265 source: uws,
6266 offset: 125,
6267 length: 3,
6268 token: Token::Special(Special::Punctuation('・')),
6269 },
6270 PositionalToken {
6271 source: uws,
6272 offset: 128,
6273 length: 3,
6274 token: Token::Word(Word::Word("修".to_string())),
6275 },
6276 PositionalToken {
6277 source: uws,
6278 offset: 131,
6279 length: 3,
6280 token: Token::Word(Word::Word("造".to_string())),
6281 },
6282 PositionalToken {
6283 source: uws,
6284 offset: 134,
6285 length: 3,
6286 token: Token::Word(Word::Word("の".to_string())),
6287 },
6288 PositionalToken {
6289 source: uws,
6290 offset: 137,
6291 length: 3,
6292 token: Token::Word(Word::Word("た".to_string())),
6293 },
6294 PositionalToken {
6295 source: uws,
6296 offset: 140,
6297 length: 3,
6298 token: Token::Word(Word::Word("め".to_string())),
6299 },
6300 PositionalToken {
6301 source: uws,
6302 offset: 143,
6303 length: 3,
6304 token: Token::Word(Word::Word("の".to_string())),
6305 },
6306 PositionalToken {
6307 source: uws,
6308 offset: 146,
6309 length: 3,
6310 token: Token::Word(Word::Word("勧".to_string())),
6311 },
6312 PositionalToken {
6313 source: uws,
6314 offset: 149,
6315 length: 3,
6316 token: Token::Word(Word::Word("進".to_string())),
6317 },
6318 PositionalToken {
6319 source: uws,
6320 offset: 152,
6321 length: 3,
6322 token: Token::Word(Word::Word("を".to_string())),
6323 },
6324 PositionalToken {
6325 source: uws,
6326 offset: 155,
6327 length: 3,
6328 token: Token::Word(Word::Word("担".to_string())),
6329 },
6330 PositionalToken {
6331 source: uws,
6332 offset: 158,
6333 length: 3,
6334 token: Token::Word(Word::Word("っ".to_string())),
6335 },
6336 PositionalToken {
6337 source: uws,
6338 offset: 161,
6339 length: 3,
6340 token: Token::Word(Word::Word("た".to_string())),
6341 },
6342 PositionalToken {
6343 source: uws,
6344 offset: 164,
6345 length: 3,
6346 token: Token::Word(Word::Word("組".to_string())),
6347 },
6348 PositionalToken {
6349 source: uws,
6350 offset: 167,
6351 length: 3,
6352 token: Token::Word(Word::Word("織".to_string())),
6353 },
6354 PositionalToken {
6355 source: uws,
6356 offset: 170,
6357 length: 3,
6358 token: Token::Word(Word::Word("の".to_string())),
6359 },
6360 PositionalToken {
6361 source: uws,
6362 offset: 173,
6363 length: 3,
6364 token: Token::Word(Word::Word("総".to_string())),
6365 },
6366 PositionalToken {
6367 source: uws,
6368 offset: 176,
6369 length: 3,
6370 token: Token::Word(Word::Word("称".to_string())),
6371 },
6372 PositionalToken {
6373 source: uws,
6374 offset: 179,
6375 length: 3,
6376 token: Token::Special(Special::Punctuation('。')),
6377 },
6378 PositionalToken {
6379 source: uws,
6380 offset: 182,
6381 length: 1,
6382 token: Token::Special(Special::Separator(Separator::Space)),
6383 },
6384 PositionalToken {
6385 source: uws,
6386 offset: 183,
6387 length: 3,
6388 token: Token::Word(Word::Word("熊".to_string())),
6389 },
6390 PositionalToken {
6391 source: uws,
6392 offset: 186,
6393 length: 3,
6394 token: Token::Word(Word::Word("野".to_string())),
6395 },
6396 PositionalToken {
6397 source: uws,
6398 offset: 189,
6399 length: 3,
6400 token: Token::Word(Word::Word("三".to_string())),
6401 },
6402 PositionalToken {
6403 source: uws,
6404 offset: 192,
6405 length: 3,
6406 token: Token::Word(Word::Word("山".to_string())),
6407 },
6408 PositionalToken {
6409 source: uws,
6410 offset: 195,
6411 length: 3,
6412 token: Token::Word(Word::Word("を".to_string())),
6413 },
6414 PositionalToken {
6415 source: uws,
6416 offset: 198,
6417 length: 3,
6418 token: Token::Word(Word::Word("含".to_string())),
6419 },
6420 PositionalToken {
6421 source: uws,
6422 offset: 201,
6423 length: 3,
6424 token: Token::Word(Word::Word("め".to_string())),
6425 },
6426 PositionalToken {
6427 source: uws,
6428 offset: 204,
6429 length: 3,
6430 token: Token::Word(Word::Word("て".to_string())),
6431 },
6432 PositionalToken {
6433 source: uws,
6434 offset: 207,
6435 length: 3,
6436 token: Token::Special(Special::Punctuation('、')),
6437 },
6438 PositionalToken {
6439 source: uws,
6440 offset: 210,
6441 length: 3,
6442 token: Token::Word(Word::Word("日".to_string())),
6443 },
6444 PositionalToken {
6445 source: uws,
6446 offset: 213,
6447 length: 3,
6448 token: Token::Word(Word::Word("本".to_string())),
6449 },
6450 PositionalToken {
6451 source: uws,
6452 offset: 216,
6453 length: 3,
6454 token: Token::Word(Word::Word("に".to_string())),
6455 },
6456 PositionalToken {
6457 source: uws,
6458 offset: 219,
6459 length: 3,
6460 token: Token::Word(Word::Word("お".to_string())),
6461 },
6462 PositionalToken {
6463 source: uws,
6464 offset: 222,
6465 length: 3,
6466 token: Token::Word(Word::Word("け".to_string())),
6467 },
6468 PositionalToken {
6469 source: uws,
6470 offset: 225,
6471 length: 3,
6472 token: Token::Word(Word::Word("る".to_string())),
6473 },
6474 PositionalToken {
6475 source: uws,
6476 offset: 228,
6477 length: 3,
6478 token: Token::Word(Word::Word("古".to_string())),
6479 },
6480 PositionalToken {
6481 source: uws,
6482 offset: 231,
6483 length: 3,
6484 token: Token::Word(Word::Word("代".to_string())),
6485 },
6486 PositionalToken {
6487 source: uws,
6488 offset: 234,
6489 length: 3,
6490 token: Token::Word(Word::Word("か".to_string())),
6491 },
6492 PositionalToken {
6493 source: uws,
6494 offset: 237,
6495 length: 3,
6496 token: Token::Word(Word::Word("ら".to_string())),
6497 },
6498 PositionalToken {
6499 source: uws,
6500 offset: 240,
6501 length: 3,
6502 token: Token::Word(Word::Word("中".to_string())),
6503 },
6504 PositionalToken {
6505 source: uws,
6506 offset: 243,
6507 length: 3,
6508 token: Token::Word(Word::Word("世".to_string())),
6509 },
6510 PositionalToken {
6511 source: uws,
6512 offset: 246,
6513 length: 3,
6514 token: Token::Word(Word::Word("前".to_string())),
6515 },
6516 PositionalToken {
6517 source: uws,
6518 offset: 249,
6519 length: 3,
6520 token: Token::Word(Word::Word("半".to_string())),
6521 },
6522 PositionalToken {
6523 source: uws,
6524 offset: 252,
6525 length: 3,
6526 token: Token::Word(Word::Word("に".to_string())),
6527 },
6528 PositionalToken {
6529 source: uws,
6530 offset: 255,
6531 length: 3,
6532 token: Token::Word(Word::Word("か".to_string())),
6533 },
6534 PositionalToken {
6535 source: uws,
6536 offset: 258,
6537 length: 3,
6538 token: Token::Word(Word::Word("け".to_string())),
6539 },
6540 PositionalToken {
6541 source: uws,
6542 offset: 261,
6543 length: 3,
6544 token: Token::Word(Word::Word("て".to_string())),
6545 },
6546 PositionalToken {
6547 source: uws,
6548 offset: 264,
6549 length: 3,
6550 token: Token::Word(Word::Word("の".to_string())),
6551 },
6552 PositionalToken {
6553 source: uws,
6554 offset: 267,
6555 length: 3,
6556 token: Token::Word(Word::Word("寺".to_string())),
6557 },
6558 PositionalToken {
6559 source: uws,
6560 offset: 270,
6561 length: 3,
6562 token: Token::Word(Word::Word("社".to_string())),
6563 },
6564 PositionalToken {
6565 source: uws,
6566 offset: 273,
6567 length: 3,
6568 token: Token::Word(Word::Word("の".to_string())),
6569 },
6570 PositionalToken {
6571 source: uws,
6572 offset: 276,
6573 length: 3,
6574 token: Token::Word(Word::Word("造".to_string())),
6575 },
6576 PositionalToken {
6577 source: uws,
6578 offset: 279,
6579 length: 3,
6580 token: Token::Word(Word::Word("営".to_string())),
6581 },
6582 PositionalToken {
6583 source: uws,
6584 offset: 282,
6585 length: 3,
6586 token: Token::Word(Word::Word("は".to_string())),
6587 },
6588 PositionalToken {
6589 source: uws,
6590 offset: 285,
6591 length: 3,
6592 token: Token::Special(Special::Punctuation('、')),
6593 },
6594 PositionalToken {
6595 source: uws,
6596 offset: 288,
6597 length: 3,
6598 token: Token::Word(Word::Word("寺".to_string())),
6599 },
6600 PositionalToken {
6601 source: uws,
6602 offset: 291,
6603 length: 3,
6604 token: Token::Word(Word::Word("社".to_string())),
6605 },
6606 ],
6607 Lang::Kor => vec![
6608 PositionalToken {
6609 source: uws,
6610 offset: 0,
6611 length: 21,
6612 token: Token::Word(Word::Word("플레이스테이션".to_string())),
6613 },
6614 PositionalToken {
6615 source: uws,
6616 offset: 21,
6617 length: 1,
6618 token: Token::Special(Special::Separator(Separator::Space)),
6619 },
6620 PositionalToken {
6621 source: uws,
6622 offset: 22,
6623 length: 3,
6624 token: Token::Word(Word::Word("은".to_string())),
6625 },
6626 PositionalToken {
6627 source: uws,
6628 offset: 25,
6629 length: 1,
6630 token: Token::Special(Special::Separator(Separator::Space)),
6631 },
6632 PositionalToken {
6633 source: uws,
6634 offset: 26,
6635 length: 6,
6636 token: Token::Word(Word::Word("소니".to_string())),
6637 },
6638 PositionalToken {
6639 source: uws,
6640 offset: 32,
6641 length: 1,
6642 token: Token::Special(Special::Separator(Separator::Space)),
6643 },
6644 PositionalToken {
6645 source: uws,
6646 offset: 33,
6647 length: 9,
6648 token: Token::Word(Word::Word("컴퓨터".to_string())),
6649 },
6650 PositionalToken {
6651 source: uws,
6652 offset: 42,
6653 length: 1,
6654 token: Token::Special(Special::Separator(Separator::Space)),
6655 },
6656 PositionalToken {
6657 source: uws,
6658 offset: 43,
6659 length: 21,
6660 token: Token::Word(Word::Word("엔터테인먼트가".to_string())),
6661 },
6662 PositionalToken {
6663 source: uws,
6664 offset: 64,
6665 length: 1,
6666 token: Token::Special(Special::Separator(Separator::Space)),
6667 },
6668 PositionalToken {
6669 source: uws,
6670 offset: 65,
6671 length: 9,
6672 token: Token::Word(Word::Word("개발한".to_string())),
6673 },
6674 PositionalToken {
6675 source: uws,
6676 offset: 74,
6677 length: 1,
6678 token: Token::Special(Special::Separator(Separator::Space)),
6679 },
6680 PositionalToken {
6681 source: uws,
6682 offset: 75,
6683 length: 3,
6684 token: Token::Word(Word::Word("세".to_string())),
6685 },
6686 PositionalToken {
6687 source: uws,
6688 offset: 78,
6689 length: 1,
6690 token: Token::Special(Special::Separator(Separator::Space)),
6691 },
6692 PositionalToken {
6693 source: uws,
6694 offset: 79,
6695 length: 6,
6696 token: Token::Word(Word::Word("번째".to_string())),
6697 },
6698 PositionalToken {
6699 source: uws,
6700 offset: 85,
6701 length: 1,
6702 token: Token::Special(Special::Separator(Separator::Space)),
6703 },
6704 PositionalToken {
6705 source: uws,
6706 offset: 86,
6707 length: 9,
6708 token: Token::Word(Word::Word("가정용".to_string())),
6709 },
6710 PositionalToken {
6711 source: uws,
6712 offset: 95,
6713 length: 1,
6714 token: Token::Special(Special::Separator(Separator::Space)),
6715 },
6716 PositionalToken {
6717 source: uws,
6718 offset: 96,
6719 length: 15,
6720 token: Token::Word(Word::Word("게임기이다".to_string())),
6721 },
6722 PositionalToken {
6723 source: uws,
6724 offset: 111,
6725 length: 1,
6726 token: Token::Special(Special::Punctuation('.')),
6727 },
6728 PositionalToken {
6729 source: uws,
6730 offset: 112,
6731 length: 1,
6732 token: Token::Special(Special::Separator(Separator::Space)),
6733 },
6734 PositionalToken {
6735 source: uws,
6736 offset: 113,
6737 length: 24,
6738 token: Token::Word(Word::Word("마이크로소프트의".to_string())),
6739 },
6740 PositionalToken {
6741 source: uws,
6742 offset: 137,
6743 length: 1,
6744 token: Token::Special(Special::Separator(Separator::Space)),
6745 },
6746 PositionalToken {
6747 source: uws,
6748 offset: 138,
6749 length: 12,
6750 token: Token::Word(Word::Word("엑스박스".to_string())),
6751 },
6752 PositionalToken {
6753 source: uws,
6754 offset: 150,
6755 length: 1,
6756 token: Token::Special(Special::Separator(Separator::Space)),
6757 },
6758 PositionalToken {
6759 source: uws,
6760 offset: 151,
6761 length: 3,
6762 token: Token::Word(Word::Number(Number::Integer(360))),
6763 },
6764 PositionalToken {
6765 source: uws,
6766 offset: 154,
6767 length: 1,
6768 token: Token::Special(Special::Punctuation(',')),
6769 },
6770 PositionalToken {
6771 source: uws,
6772 offset: 155,
6773 length: 1,
6774 token: Token::Special(Special::Separator(Separator::Space)),
6775 },
6776 PositionalToken {
6777 source: uws,
6778 offset: 156,
6779 length: 12,
6780 token: Token::Word(Word::Word("닌텐도의".to_string())),
6781 },
6782 PositionalToken {
6783 source: uws,
6784 offset: 168,
6785 length: 1,
6786 token: Token::Special(Special::Separator(Separator::Space)),
6787 },
6788 PositionalToken {
6789 source: uws,
6790 offset: 169,
6791 length: 6,
6792 token: Token::Word(Word::Word("Wii와".to_string())),
6793 },
6794 PositionalToken {
6795 source: uws,
6796 offset: 175,
6797 length: 1,
6798 token: Token::Special(Special::Separator(Separator::Space)),
6799 },
6800 PositionalToken {
6801 source: uws,
6802 offset: 176,
6803 length: 12,
6804 token: Token::Word(Word::Word("경쟁하고".to_string())),
6805 },
6806 PositionalToken {
6807 source: uws,
6808 offset: 188,
6809 length: 1,
6810 token: Token::Special(Special::Separator(Separator::Space)),
6811 },
6812 PositionalToken {
6813 source: uws,
6814 offset: 189,
6815 length: 6,
6816 token: Token::Word(Word::Word("있다".to_string())),
6817 },
6818 PositionalToken {
6819 source: uws,
6820 offset: 195,
6821 length: 1,
6822 token: Token::Special(Special::Punctuation('.')),
6823 },
6824 PositionalToken {
6825 source: uws,
6826 offset: 196,
6827 length: 1,
6828 token: Token::Special(Special::Separator(Separator::Space)),
6829 },
6830 PositionalToken {
6831 source: uws,
6832 offset: 197,
6833 length: 6,
6834 token: Token::Word(Word::Word("이전".to_string())),
6835 },
6836 PositionalToken {
6837 source: uws,
6838 offset: 203,
6839 length: 1,
6840 token: Token::Special(Special::Separator(Separator::Space)),
6841 },
6842 PositionalToken {
6843 source: uws,
6844 offset: 204,
6845 length: 12,
6846 token: Token::Word(Word::Word("제품에서".to_string())),
6847 },
6848 PositionalToken {
6849 source: uws,
6850 offset: 216,
6851 length: 1,
6852 token: Token::Special(Special::Separator(Separator::Space)),
6853 },
6854 PositionalToken {
6855 source: uws,
6856 offset: 217,
6857 length: 9,
6858 token: Token::Word(Word::Word("온라인".to_string())),
6859 },
6860 PositionalToken {
6861 source: uws,
6862 offset: 226,
6863 length: 1,
6864 token: Token::Special(Special::Separator(Separator::Space)),
6865 },
6866 PositionalToken {
6867 source: uws,
6868 offset: 227,
6869 length: 9,
6870 token: Token::Word(Word::Word("플레이".to_string())),
6871 },
6872 PositionalToken {
6873 source: uws,
6874 offset: 236,
6875 length: 1,
6876 token: Token::Special(Special::Separator(Separator::Space)),
6877 },
6878 PositionalToken {
6879 source: uws,
6880 offset: 237,
6881 length: 3,
6882 token: Token::Word(Word::Word("기".to_string())),
6883 },
6884 ],
6885 Lang::Ara => vec![
6886 PositionalToken {
6887 source: uws,
6888 offset: 0,
6889 length: 14,
6890 token: Token::Word(Word::Word("لشکرکشی".to_string())),
6891 },
6892 PositionalToken {
6893 source: uws,
6894 offset: 14,
6895 length: 3,
6896 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6897 },
6898 PositionalToken {
6899 source: uws,
6900 offset: 17,
6901 length: 6,
6902 token: Token::Word(Word::Word("های".to_string())),
6903 },
6904 PositionalToken {
6905 source: uws,
6906 offset: 23,
6907 length: 1,
6908 token: Token::Special(Special::Separator(Separator::Space)),
6909 },
6910 PositionalToken {
6911 source: uws,
6912 offset: 24,
6913 length: 6,
6914 token: Token::Word(Word::Word("روس".to_string())),
6915 },
6916 PositionalToken {
6917 source: uws,
6918 offset: 30,
6919 length: 3,
6920 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6921 },
6922 PositionalToken {
6923 source: uws,
6924 offset: 33,
6925 length: 6,
6926 token: Token::Word(Word::Word("های".to_string())),
6927 },
6928 PositionalToken {
6929 source: uws,
6930 offset: 39,
6931 length: 1,
6932 token: Token::Special(Special::Separator(Separator::Space)),
6933 },
6934 PositionalToken {
6935 source: uws,
6936 offset: 40,
6937 length: 12,
6938 token: Token::Word(Word::Word("وارنگی".to_string())),
6939 },
6940 PositionalToken {
6941 source: uws,
6942 offset: 52,
6943 length: 1,
6944 token: Token::Special(Special::Separator(Separator::Space)),
6945 },
6946 PositionalToken {
6947 source: uws,
6948 offset: 53,
6949 length: 4,
6950 token: Token::Word(Word::Word("به".to_string())),
6951 },
6952 PositionalToken {
6953 source: uws,
6954 offset: 57,
6955 length: 1,
6956 token: Token::Special(Special::Separator(Separator::Space)),
6957 },
6958 PositionalToken {
6959 source: uws,
6960 offset: 58,
6961 length: 10,
6962 token: Token::Word(Word::Word("دریای".to_string())),
6963 },
6964 PositionalToken {
6965 source: uws,
6966 offset: 68,
6967 length: 1,
6968 token: Token::Special(Special::Separator(Separator::Space)),
6969 },
6970 PositionalToken {
6971 source: uws,
6972 offset: 69,
6973 length: 6,
6974 token: Token::Word(Word::Word("خزر".to_string())),
6975 },
6976 PositionalToken {
6977 source: uws,
6978 offset: 75,
6979 length: 1,
6980 token: Token::Special(Special::Separator(Separator::Space)),
6981 },
6982 PositionalToken {
6983 source: uws,
6984 offset: 76,
6985 length: 12,
6986 token: Token::Word(Word::Word("مجموعه".to_string())),
6987 },
6988 PositionalToken {
6989 source: uws,
6990 offset: 88,
6991 length: 3,
6992 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6993 },
6994 PositionalToken {
6995 source: uws,
6996 offset: 91,
6997 length: 4,
6998 token: Token::Word(Word::Word("ای".to_string())),
6999 },
7000 PositionalToken {
7001 source: uws,
7002 offset: 95,
7003 length: 1,
7004 token: Token::Special(Special::Separator(Separator::Space)),
7005 },
7006 PositionalToken {
7007 source: uws,
7008 offset: 96,
7009 length: 4,
7010 token: Token::Word(Word::Word("از".to_string())),
7011 },
7012 PositionalToken {
7013 source: uws,
7014 offset: 100,
7015 length: 1,
7016 token: Token::Special(Special::Separator(Separator::Space)),
7017 },
7018 PositionalToken {
7019 source: uws,
7020 offset: 101,
7021 length: 10,
7022 token: Token::Word(Word::Word("حملات".to_string())),
7023 },
7024 PositionalToken {
7025 source: uws,
7026 offset: 111,
7027 length: 1,
7028 token: Token::Special(Special::Separator(Separator::Space)),
7029 },
7030 PositionalToken {
7031 source: uws,
7032 offset: 112,
7033 length: 10,
7034 token: Token::Word(Word::Word("نظامی".to_string())),
7035 },
7036 PositionalToken {
7037 source: uws,
7038 offset: 122,
7039 length: 1,
7040 token: Token::Special(Special::Separator(Separator::Space)),
7041 },
7042 PositionalToken {
7043 source: uws,
7044 offset: 123,
7045 length: 4,
7046 token: Token::Word(Word::Word("در".to_string())),
7047 },
7048 PositionalToken {
7049 source: uws,
7050 offset: 127,
7051 length: 1,
7052 token: Token::Special(Special::Separator(Separator::Space)),
7053 },
7054 PositionalToken {
7055 source: uws,
7056 offset: 128,
7057 length: 6,
7058 token: Token::Word(Word::Word("بین".to_string())),
7059 },
7060 PositionalToken {
7061 source: uws,
7062 offset: 134,
7063 length: 1,
7064 token: Token::Special(Special::Separator(Separator::Space)),
7065 },
7066 PositionalToken {
7067 source: uws,
7068 offset: 135,
7069 length: 6,
7070 token: Token::Word(Word::Word("سال".to_string())),
7071 },
7072 PositionalToken {
7073 source: uws,
7074 offset: 141,
7075 length: 3,
7076 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
7077 },
7078 PositionalToken {
7079 source: uws,
7080 offset: 144,
7081 length: 6,
7082 token: Token::Word(Word::Word("های".to_string())),
7083 },
7084 PositionalToken {
7085 source: uws,
7086 offset: 150,
7087 length: 1,
7088 token: Token::Special(Special::Separator(Separator::Space)),
7089 },
7090 PositionalToken {
7091 source: uws,
7092 offset: 151,
7093 length: 6,
7094 token: Token::Word(Word::StrangeWord("۸۶۴".to_string())),
7095 },
7096 PositionalToken {
7097 source: uws,
7098 offset: 157,
7099 length: 1,
7100 token: Token::Special(Special::Separator(Separator::Space)),
7101 },
7102 PositionalToken {
7103 source: uws,
7104 offset: 158,
7105 length: 4,
7106 token: Token::Word(Word::Word("تا".to_string())),
7107 },
7108 PositionalToken {
7109 source: uws,
7110 offset: 162,
7111 length: 1,
7112 token: Token::Special(Special::Separator(Separator::Space)),
7113 },
7114 PositionalToken {
7115 source: uws,
7116 offset: 163,
7117 length: 8,
7118 token: Token::Word(Word::StrangeWord("۱۰۴۱".to_string())),
7119 },
7120 PositionalToken {
7121 source: uws,
7122 offset: 171,
7123 length: 1,
7124 token: Token::Special(Special::Separator(Separator::Space)),
7125 },
7126 PositionalToken {
7127 source: uws,
7128 offset: 172,
7129 length: 12,
7130 token: Token::Word(Word::Word("میلادی".to_string())),
7131 },
7132 PositionalToken {
7133 source: uws,
7134 offset: 184,
7135 length: 1,
7136 token: Token::Special(Special::Separator(Separator::Space)),
7137 },
7138 PositionalToken {
7139 source: uws,
7140 offset: 185,
7141 length: 2,
7142 token: Token::Word(Word::Word("ب".to_string())),
7143 },
7144 ],
7145 Lang::Ell => vec![
7146 PositionalToken {
7147 source: uws,
7148 offset: 0,
7149 length: 4,
7150 token: Token::Word(Word::Word("Το".to_string())),
7151 },
7152 PositionalToken {
7153 source: uws,
7154 offset: 4,
7155 length: 1,
7156 token: Token::Special(Special::Separator(Separator::Space)),
7157 },
7158 PositionalToken {
7159 source: uws,
7160 offset: 5,
7161 length: 18,
7162 token: Token::Word(Word::Word("Πρόγραμμα".to_string())),
7163 },
7164 PositionalToken {
7165 source: uws,
7166 offset: 23,
7167 length: 1,
7168 token: Token::Special(Special::Separator(Separator::Space)),
7169 },
7170 PositionalToken {
7171 source: uws,
7172 offset: 24,
7173 length: 22,
7174 token: Token::Word(Word::Word("υλοποιείται".to_string())),
7175 },
7176 PositionalToken {
7177 source: uws,
7178 offset: 46,
7179 length: 1,
7180 token: Token::Special(Special::Separator(Separator::Space)),
7181 },
7182 PositionalToken {
7183 source: uws,
7184 offset: 47,
7185 length: 4,
7186 token: Token::Word(Word::Word("εξ".to_string())),
7187 },
7188 PositionalToken {
7189 source: uws,
7190 offset: 51,
7191 length: 1,
7192 token: Token::Special(Special::Separator(Separator::Space)),
7193 },
7194 PositionalToken {
7195 source: uws,
7196 offset: 52,
7197 length: 18,
7198 token: Token::Word(Word::Word("ολοκλήρου".to_string())),
7199 },
7200 PositionalToken {
7201 source: uws,
7202 offset: 70,
7203 length: 1,
7204 token: Token::Special(Special::Separator(Separator::Space)),
7205 },
7206 PositionalToken {
7207 source: uws,
7208 offset: 71,
7209 length: 6,
7210 token: Token::Word(Word::Word("από".to_string())),
7211 },
7212 PositionalToken {
7213 source: uws,
7214 offset: 77,
7215 length: 1,
7216 token: Token::Special(Special::Separator(Separator::Space)),
7217 },
7218 PositionalToken {
7219 source: uws,
7220 offset: 78,
7221 length: 16,
7222 token: Token::Word(Word::Word("απόσταση".to_string())),
7223 },
7224 PositionalToken {
7225 source: uws,
7226 offset: 94,
7227 length: 1,
7228 token: Token::Special(Special::Separator(Separator::Space)),
7229 },
7230 PositionalToken {
7231 source: uws,
7232 offset: 95,
7233 length: 6,
7234 token: Token::Word(Word::Word("και".to_string())),
7235 },
7236 PositionalToken {
7237 source: uws,
7238 offset: 101,
7239 length: 1,
7240 token: Token::Special(Special::Separator(Separator::Space)),
7241 },
7242 PositionalToken {
7243 source: uws,
7244 offset: 102,
7245 length: 12,
7246 token: Token::Word(Word::Word("μπορεί".to_string())),
7247 },
7248 PositionalToken {
7249 source: uws,
7250 offset: 114,
7251 length: 1,
7252 token: Token::Special(Special::Separator(Separator::Space)),
7253 },
7254 PositionalToken {
7255 source: uws,
7256 offset: 115,
7257 length: 4,
7258 token: Token::Word(Word::Word("να".to_string())),
7259 },
7260 PositionalToken {
7261 source: uws,
7262 offset: 119,
7263 length: 1,
7264 token: Token::Special(Special::Separator(Separator::Space)),
7265 },
7266 PositionalToken {
7267 source: uws,
7268 offset: 120,
7269 length: 20,
7270 token: Token::Word(Word::Word("συμμετέχει".to_string())),
7271 },
7272 PositionalToken {
7273 source: uws,
7274 offset: 140,
7275 length: 1,
7276 token: Token::Special(Special::Separator(Separator::Space)),
7277 },
7278 PositionalToken {
7279 source: uws,
7280 offset: 141,
7281 length: 8,
7282 token: Token::Word(Word::Word("κάθε".to_string())),
7283 },
7284 PositionalToken {
7285 source: uws,
7286 offset: 149,
7287 length: 1,
7288 token: Token::Special(Special::Separator(Separator::Space)),
7289 },
7290 PositionalToken {
7291 source: uws,
7292 offset: 150,
7293 length: 24,
7294 token: Token::Word(Word::Word("εμπλεκόμενος".to_string())),
7295 },
7296 PositionalToken {
7297 source: uws,
7298 offset: 174,
7299 length: 1,
7300 token: Token::Special(Special::Separator(Separator::Space)),
7301 },
7302 PositionalToken {
7303 source: uws,
7304 offset: 175,
7305 length: 6,
7306 token: Token::Word(Word::Word("στη".to_string())),
7307 },
7308 PositionalToken {
7309 source: uws,
7310 offset: 181,
7311 length: 1,
7312 token: Token::Special(Special::Separator(Separator::Space)),
7313 },
7314 PositionalToken {
7315 source: uws,
7316 offset: 182,
7317 length: 2,
7318 token: Token::Word(Word::Word("ή".to_string())),
7319 },
7320 PositionalToken {
7321 source: uws,
7322 offset: 184,
7323 length: 1,
7324 token: Token::Special(Special::Punctuation('/')),
7325 },
7326 ],
7327 };
7328 (
7329 uws.chars()
7330 .take(100)
7331 .fold(String::new(), |acc, c| acc + &format!("{}", c)),
7332 tokens,
7333 )
7334 }
7335}