1use std::sync::Arc;
2use text_parsing::{Breaker, IntoSource, Local, Localize, Snip, Source, SourceEvent};
3
4mod emoji;
5pub use emoji::EMOJIMAP;
6
7mod breakers;
8pub use breakers::{SentenceBreaker, UnicodeSentenceBreaker};
9
10mod wordbreaker;
11
12mod options;
13pub use options::{IntoTokenizer, TokenizerOptions, TokenizerParams};
14
15mod tokens;
16pub use tokens::Tokens;
17
18mod text_tokens;
19use text_tokens::InnerBound;
20pub use text_tokens::TextTokens;
21
22#[derive(Debug)]
23pub enum Error {
24 TextParser(text_parsing::Error),
25}
26
/// Tolerance used by `Number`'s `Ord` impl when comparing values as `f64`.
const EPS: f64 = 1e-8;
28
/// Numeric token value. In the "strings" build, zero-prefixed integers keep
/// their original digit string alongside the parsed value.
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd)]
pub enum Number {
    Integer(i64),
    Float(f64),
    // `s` preserves the raw digits (e.g. a leading zero) that `i` alone loses.
    ZeroInteger { i: i64, s: String },
}

/// Numeric token value (payload-free build: no owned strings, so it is `Copy`).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
pub enum Number {
    Integer(i64),
    Float(f64),
    ZeroInteger { i: i64 },
}
45
46impl Number {
47 pub fn as_f64(&self) -> f64 {
48 match self {
49 Number::Integer(i) => *i as f64,
50 Number::Float(f) => *f,
51 Number::ZeroInteger { i, .. } => *i as f64,
52 }
53 }
54}
55impl Ord for Number {
56 fn cmp(&self, other: &Number) -> std::cmp::Ordering {
57 let s = self.as_f64();
58 let o = other.as_f64();
59 let d = s - o;
60 match d.abs() < EPS {
61 true => std::cmp::Ordering::Equal,
62 false => {
63 if d > 0.0 {
64 return std::cmp::Ordering::Greater;
65 }
66 if d < 0.0 {
67 return std::cmp::Ordering::Less;
68 }
69 std::cmp::Ordering::Equal
70 }
71 }
72 }
73}
74impl Eq for Number {}
75
/// Whitespace-like separator tokens.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
pub enum Separator {
    Space,
    Tab,
    Newline,
    // Any other separator character.
    Char(char),
}

/// Invisible formatting characters (e.g. joiners).
#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
pub enum Formatter {
    Char(char),
    Joiner,
}

/// Single-character special tokens.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Ord, PartialOrd)]
pub enum Special {
    Currency(char),
    Punctuation(char),
    Symbol(char),
    Separator(Separator),
}
97
/// Word-like tokens ("strings" build: variants carry the matched text).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Word {
    Word(String),
    // Words with unusual or mixed characters (e.g. "L’Oreal" — see tests).
    StrangeWord(String),
    Numerical(Numerical),
    Number(Number),
    // Static emoji identifier — presumably resolved via `EMOJIMAP`; confirm.
    Emoji(&'static str),
}

/// Digit-bearing tokens that are not plain numbers.
#[cfg(feature = "strings")]
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub enum Numerical {
    // Dot-joined digit groups, e.g. "U.S.A"-like or version-like sequences.
    DotSeparated(String),
    // Digits plus a trailing unit-like tail, e.g. "4pda" (see `char_tokens` test).
    Measures(String),
    // Mixed letters and digits.
    Alphanumeric(String),
}

/// Structured tokens: hashtags and mentions (stored without the leading
/// '#'/'@' — see the `hashtags`/`mention2` tests).
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Struct {
    Hashtag(String),
    Mention(String),
}

/// Unclassified unicode runs and formatting characters.
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Unicode {
    String(String),
    Formatter(Formatter),
}
133
/// Word-like tokens (payload-free build: no owned text, so the enum is `Copy`).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Word {
    Word,
    StrangeWord,
    Numerical(Numerical),
    Number(Number),
    // Static emoji identifier — presumably resolved via `EMOJIMAP`; confirm.
    Emoji(&'static str),
}

/// Digit-bearing token classes (payload-free build).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Numerical {
    DotSeparated,
    Measures,
    Alphanumeric,
}

/// Structured token classes (payload-free build).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Struct {
    Hashtag,
    Mention,
}

/// Unicode token classes (payload-free build).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Unicode {
    String,
    Formatter(Formatter),
}
169
/// Top-level token produced by the tokenizer.
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),
}

/// Top-level token (payload-free build; `Copy`).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),
}
187
/// Tokenizable view over a borrowed string: the caller's `&str` serves as the
/// buffer, with shared locality/breaker tables computed by `inner_new`.
#[derive(Debug)]
pub struct TextStr<'s> {
    buffer: &'s str,
    localities: Arc<Vec<TextLocality>>,
    breakers: Arc<Vec<InnerBound>>,
}
207impl<'s> TextStr<'s> {
208 pub fn new<'a>(s: &'a str) -> Result<TextStr<'a>, Error> {
209 let text = inner_new(s.into_source(), false)?;
210 Ok(TextStr {
211 buffer: s,
212 localities: text.localities,
213 breakers: text.breakers,
214 })
215 }
216}
217
/// Core constructor shared by `Text`, `TextStr`, and the `TryFrom` impls.
///
/// Walks `source` char by char, building:
/// * `localities` — one `TextLocality` per emitted char, mapping the
///   normalized buffer position back to the original input;
/// * `breakers` — an `InnerBound` for every word/sentence/paragraph/section
///   breaker event, addressed in both byte and char coordinates;
/// * `buffer` — the normalized text itself, only materialized when
///   `with_buffer` is true (callers that reuse an existing string skip it).
fn inner_new<S: Source>(mut source: S, with_buffer: bool) -> Result<Text, Error> {
    let mut buffer = String::new();
    let mut localities = Vec::new();
    let mut breakers = Vec::new();
    // Byte length of the normalized buffer so far (tracked even when the
    // buffer itself is not materialized).
    let mut buffer_len = 0;

    while let Some(local_se) = source.next_char().map_err(Error::TextParser)? {
        let (local, se) = local_se.into_inner();
        let c = match se {
            SourceEvent::Char(c) => match c {
                // Normalize backtick (U+0060) to apostrophe (U+0027).
                '\u{0060}' => '\u{0027}',
                _ => c,
            },
            SourceEvent::Breaker(b) => {
                // Each breaker becomes a placeholder char in the buffer; only
                // word- and sentence-level breakers are also recorded as bounds.
                let (c, opt_b) = match b {
                    Breaker::None => continue,
                    Breaker::Space => (' ', None),
                    Breaker::Line => ('\n', None),
                    // Zero-width space stands in for a word breaker.
                    Breaker::Word => ('\u{200B}', Some(b)),
                    Breaker::Sentence | Breaker::Paragraph | Breaker::Section => ('\n', Some(b)),
                };
                if let Some(b) = opt_b {
                    let br = InnerBound {
                        bytes: Snip {
                            offset: buffer_len,
                            length: c.len_utf8(),
                        },
                        chars: Snip {
                            offset: localities.len(),
                            length: 1,
                        },
                        breaker: b,
                        original: Some(local),
                    };
                    breakers.push(br);
                }
                c
            }
        };

        // Locality of `c` inside the normalized buffer: char snip first,
        // then byte snip.
        let buf_local = ().localize(
            Snip {
                offset: localities.len(),
                length: 1,
            },
            Snip {
                offset: buffer_len,
                length: c.len_utf8(),
            },
        );
        if with_buffer {
            buffer.push(c);
        }
        buffer_len += c.len_utf8();
        localities.push(TextLocality {
            buffer: buf_local,
            original: local,
        });
    }
    Ok(Text {
        buffer: Arc::new(buffer),
        localities: Arc::new(localities),
        breakers: Arc::new(breakers),
    })
}
286
/// Per-character mapping between the normalized buffer and the original input.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub struct TextLocality {
    // Position in the normalized buffer.
    pub buffer: Local<()>,
    // Position in the original source.
    pub original: Local<()>,
}
292
/// Owned, normalized text with its locality and breaker tables.
///
/// All fields are `Arc`s so `shared_text` can clone the view cheaply.
#[derive(Debug)]
pub struct Text {
    buffer: Arc<String>,
    localities: Arc<Vec<TextLocality>>,
    breakers: Arc<Vec<InnerBound>>,
}
299impl Text {
300 pub fn new<S: Source>(source: S) -> Result<Text, Error> {
301 inner_new(source, true)
302 }
303 pub fn token_text<'s>(&'s self, token: &TextToken) -> &'s str {
304 let Snip {
305 offset: begin,
306 length: len,
307 } = token.locality.bytes();
308 let end = begin + len;
309 &self.buffer[begin..end]
310 }
311 pub fn text(&self) -> &str {
312 self.buffer.as_ref()
313 }
314 pub fn original_locality(&self, idx: usize) -> Option<Local<()>> {
315 self.localities.get(idx).map(|tl| tl.original)
316 }
317 pub fn localities(&self) -> &Vec<TextLocality> {
318 self.localities.as_ref()
319 }
320 pub fn shared_text(&self) -> Text {
321 Text {
322 buffer: self.buffer.clone(),
323 localities: self.localities.clone(),
324 breakers: self.breakers.clone(),
325 }
326 }
327}
328
impl TryFrom<String> for Text {
    type Error = Error;

    fn try_from(s: String) -> Result<Text, Error> {
        // Parse with `with_buffer = false`, then reuse the input string as the
        // buffer to avoid collecting a second copy.
        // NOTE(review): this assumes normalization did not alter the char
        // stream — confirm for inputs containing backticks or breaker events.
        let mut text = inner_new((&s).into_source(), false)?;
        text.buffer = Arc::new(s);
        Ok(text)
    }
}

impl TryFrom<&str> for Text {
    type Error = Error;

    fn try_from(s: &str) -> Result<Text, Error> {
        Text::new(s.into_source())
    }
}
346
/// Structural boundary kinds surfaced to callers as `Token2::Bound`.
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Bound {
    Sentence,
    Paragraph,
    Section,
}
353
/// A token paired with its buffer locality and, when it maps back to the
/// original input, its original locality.
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub struct TextToken {
    // Where the token sits in the normalized buffer.
    locality: Local<()>,
    // Where it came from in the original input; `None` when there is no mapping.
    original: Option<Local<()>>,
    pub token: Token2,
}

/// Same as above for the payload-free build (`Copy`).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub struct TextToken {
    locality: Local<()>,
    original: Option<Local<()>>,
    pub token: Token2,
}
369
#[cfg(test)]
impl TextToken {
    /// Test helper: the original locality paired with the plain token, when
    /// both exist.
    fn into_original_token_1(self) -> Option<Local<Token>> {
        let original = self.original?;
        self.token.into_token().map(|t| original.local(t))
    }
}
379
380impl TextToken {
381 pub fn local(&self) -> Local<()> {
382 self.locality
383 }
384 pub fn original(&self) -> Option<Local<()>> {
385 self.original
386 }
387 pub fn into_position(mut self) -> TextToken {
388 self.locality = self.locality.into_position();
389 self.original = self.original.map(|or| or.into_position());
390 self
391 }
392 pub fn try_as_token(&self) -> Result<Token, Bound> {
393 self.token.try_as_token()
394 }
395 pub fn as_original_token(&self) -> Option<Local<&Token2>> {
396 self.original.map(|original| original.local(&self.token))
397 }
398 pub fn into_original_token(self) -> Option<Local<Token2>> {
399 self.original.map(|original| original.local(self.token))
400 }
401 pub fn original_str<'s>(&self, original: &'s str) -> Result<&'s str, OriginalError> {
402 match self.original {
403 Some(local) => {
404 let Snip {
405 offset: begin,
406 length: len,
407 } = local.bytes();
408 let end = begin + len;
409 match original.get(begin..end) {
410 Some(s) => Ok(s),
411 None => Err(OriginalError::InvalidSnip),
412 }
413 }
414 None => Err(OriginalError::NoOriginal),
415 }
416 }
417
418 pub fn test_token(lt: Local<Token2>) -> TextToken {
419 let (local, token) = lt.into_inner();
420 TextToken {
421 locality: local,
422 original: Some(local.local(())),
423 token,
424 }
425 }
426 pub fn test_new(token: Token2, local: Local<()>, original: Option<Local<()>>) -> TextToken {
427 TextToken {
428 locality: local,
429 original,
430 token,
431 }
432 }
433}
434
/// Errors from `TextToken::original_str`.
#[derive(Debug)]
pub enum OriginalError {
    /// The token has no mapping back to the original input.
    NoOriginal,
    /// The recorded byte snip does not address a valid range of the provided
    /// original string (wrong string, or offsets not on char boundaries).
    InvalidSnip,
}

impl std::fmt::Display for OriginalError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            OriginalError::NoOriginal => write!(f, "token has no original locality"),
            OriginalError::InvalidSnip => {
                write!(f, "original snip is out of range or not on a char boundary")
            }
        }
    }
}

// Standard error-trait impl so the type composes with `?` and `dyn Error`.
impl std::error::Error for OriginalError {}
466
/// Tokenizer stream item: a plain `Token` or a structural `Bound`.
#[cfg(feature = "strings")]
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token2 {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),

    Bound(Bound),
}

/// Tokenizer stream item (`Copy` in the payload-free build).
#[cfg(not(feature = "strings"))]
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
pub enum Token2 {
    Word(Word),
    Struct(Struct),
    Special(Special),
    Unicode(Unicode),

    Bound(Bound),
}
impl From<Token> for Token2 {
    /// Embeds a plain token; `Bound` values are only produced by the
    /// tokenizer itself, never by this conversion.
    fn from(t: Token) -> Token2 {
        match t {
            Token::Word(w) => Token2::Word(w),
            Token::Struct(s) => Token2::Struct(s),
            Token::Special(s) => Token2::Special(s),
            Token::Unicode(u) => Token2::Unicode(u),
        }
    }
}
impl Token2 {
    // Without "strings" the type is `Copy`, so a by-value copy is free.
    #[cfg(not(feature = "strings"))]
    fn try_as_token(&self) -> Result<Token, Bound> {
        (*self).try_into_token()
    }

    // With "strings" the variants own `String`s, so a clone is required.
    #[cfg(feature = "strings")]
    fn try_as_token(&self) -> Result<Token, Bound> {
        self.clone().try_into_token()
    }

    /// Splits the stream item into a plain token, or the bound it represents.
    fn try_into_token(self) -> Result<Token, Bound> {
        match self {
            Token2::Word(w) => Ok(Token::Word(w)),
            Token2::Struct(s) => Ok(Token::Struct(s)),
            Token2::Special(s) => Ok(Token::Special(s)),
            Token2::Unicode(u) => Ok(Token::Unicode(u)),
            Token2::Bound(b) => Err(b),
        }
    }
}
#[cfg(test)]
impl Token2 {
    /// Test helper: the plain `Token`, or `None` for a `Bound`.
    ///
    /// Delegates to `try_into_token` instead of duplicating its match, so the
    /// two conversions can never drift apart.
    fn into_token(self) -> Option<Token> {
        self.try_into_token().ok()
    }
}
537
#[cfg(test)]
#[cfg(not(feature = "strings"))]
mod test {
    use super::*;
    use text_parsing::{
        IntoPipeParser, IntoSource, Localize, ParserExt, SourceExt, entities, tagger,
    };

    // Element-wise comparison of expected vs produced tokens.
    // NOTE(review): currently unused — `symbols` below only prints.
    fn check_results(result: &Vec<Local<Token>>, lib_res: &Vec<Local<Token>>, _uws: &str) {
        assert_eq!(result.len(), lib_res.len());
        for i in 0..result.len() {
            let res: Local<Token> = result[i].clone().into();
            assert_eq!(res, lib_res[i]);
        }
    }

    // NOTE(review): not annotated with `#[test]`, so this never runs; the
    // println dump plus the trailing `panic!()` look like leftover debugging
    // scaffolding — either finish the expectations or remove the function.
    fn symbols() {
        let uws = "Сибирь Арене 17 30 от 2560₽ 😀";
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        for t in lib_res {
            println!("{:?}", t);
        }
        panic!()
    }
}
569
#[cfg(test)]
mod test_v0_5 {
    use super::*;
    use text_parsing::{IntoPipeParser, IntoSource, ParserExt, SourceExt, entities, tagger};

    // NOTE(review): not annotated with `#[test]`, so this never runs; the
    // println dump and trailing `panic!()` look like debugging scaffolding.
    fn basic() {
        let uws = "<p>Oxana Putan shared the quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc.</p><p> qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово</p>";
        // Pipe the raw HTML-ish input through the tag and entity parsers
        // before collecting it into a `Text`.
        let text = Text::new({
            uws.into_source()
                .pipe(tagger::Builder::new().create().into_breaker())
                .pipe(entities::Builder::new().create().into_piped())
                .into_separator()
        })
        .unwrap();
        let lib_res = text
            .into_tokenizer({
                TokenizerParams::default()
                    .add_option(TokenizerOptions::SplitDot)
                    .add_option(TokenizerOptions::SplitUnderscore)
                    .add_option(TokenizerOptions::SplitColon)
                    .with_default_sentences()
            })
            .collect::<Vec<_>>();

        for tok in lib_res {
            println!(
                "C{:?}, B{:?}, {:?} -> {:?}",
                tok.original.map(|loc| loc.chars()),
                tok.original.map(|loc| loc.bytes()),
                tok.token,
                tok.original_str(uws)
            );
        }

        panic!()
    }
}
622
623#[cfg(test)]
624#[cfg(feature = "strings")]
625mod test {
626 use super::*;
627 use text_parsing::{
628 IntoPipeParser, IntoSource, Localize, ParserExt, SourceExt, entities, tagger,
629 };
630
    /// Debug helper: dumps each localized token on its own line.
    #[allow(dead_code)]
    fn print_result(lib_res: &Vec<Local<Token>>) {
        for lt in lib_res {
            println!("{:?}", lt);
        }
    }
659 #[derive(Debug, Clone)]
688 struct CharToken {
689 byte_offset: usize,
690 byte_length: usize,
691 char_offset: usize,
692 char_length: usize,
693 token: Token,
694 }
695 impl Into<Local<Token>> for CharToken {
696 fn into(self) -> Local<Token> {
697 self.token.localize(
698 Snip {
699 offset: self.char_offset,
700 length: self.char_length,
701 },
702 Snip {
703 offset: self.byte_offset,
704 length: self.byte_length,
705 },
706 )
707 }
708 }
709
710 #[derive(Debug, Clone)]
711 struct PositionalToken {
712 source: &'static str,
713 offset: usize,
714 length: usize,
715 token: Token,
716 }
717 impl Into<Local<Token>> for PositionalToken {
718 fn into(self) -> Local<Token> {
719 self.token.localize(
720 Snip {
721 offset: self.source[..self.offset].chars().count(),
722 length: self.source[self.offset..self.offset + self.length]
723 .chars()
724 .count(),
725 },
726 Snip {
727 offset: self.offset,
728 length: self.length,
729 },
730 )
731 }
732 }
733
734 fn check_results(result: &Vec<PositionalToken>, lib_res: &Vec<Local<Token>>, _uws: &str) {
735 assert_eq!(result.len(), lib_res.len());
736 for i in 0..result.len() {
737 let res: Local<Token> = result[i].clone().into();
738 assert_eq!(res, lib_res[i]);
739 }
740 }
741
742 fn check_cresults(result: &Vec<CharToken>, lib_res: &Vec<Local<Token>>, _uws: &str) {
743 assert_eq!(result.len(), lib_res.len());
744 for i in 0..result.len() {
745 let res: Local<Token> = result[i].clone().into();
746 assert_eq!(res, lib_res[i]);
747 }
748 }
749
750 fn check<T: Clone + std::fmt::Debug + Into<Local<Token>>>(
751 res: &Vec<T>,
752 lib: &Vec<Local<Token>>,
753 _uws: &str,
754 ) {
755 let mut lib = lib.iter();
756 let mut res = res.iter().map(|r| {
757 let res: Local<Token> = r.clone().into();
758 res
759 });
760 let mut diff = Vec::new();
761 loop {
762 match (lib.next(), res.next()) {
763 (Some(lw), Some(rw)) => {
764 if *lw != rw {
765 diff.push(format!("LIB: {:?}", lw));
766 diff.push(format!("TEST: {:?}", rw));
767 diff.push("".to_string())
768 }
769 }
770 (Some(lw), None) => {
771 diff.push(format!("LIB: {:?}", lw));
772 diff.push("TEST: ----".to_string());
773 diff.push("".to_string())
774 }
775 (None, Some(rw)) => {
776 diff.push("LIB: ----".to_string());
777 diff.push(format!("TEST: {:?}", rw));
778 diff.push("".to_string())
779 }
780 (None, None) => break,
781 }
782 }
783 if diff.len() > 0 {
784 for ln in &diff {
785 println!("{}", ln);
786 }
787 panic!("Diff count: {}", diff.len() / 3);
788 }
789 }
790
    // Currency symbols of different UTF-8 widths keep correct byte offsets.
    #[test]
    #[rustfmt::skip]
    fn currency() {
        let uws = "$ ₽ € ¥";
        let result = vec![
            PositionalToken { source: uws, offset: 0, length: 1, token: Token::Special(Special::Currency('$')) },
            PositionalToken { source: uws, offset: 1, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
            PositionalToken { source: uws, offset: 2, length: 3, token: Token::Special(Special::Currency('₽')) },
            PositionalToken { source: uws, offset: 5, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
            PositionalToken { source: uws, offset: 6, length: 3, token: Token::Special(Special::Currency('€')) },
            PositionalToken { source: uws, offset: 9, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
            PositionalToken { source: uws, offset: 10, length: 2, token: Token::Special(Special::Currency('¥')) },
        ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }

    // Runs of consecutive spaces become a single separator token covering the
    // whole run (lengths 4/4/3/3/3 below match the literal's space runs).
    #[test]
    fn spaces() {
        let uws = "    spaces    too   many   apces   ";
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 4,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 4,
                length: 6,
                token: Token::Word(Word::Word("spaces".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 10,
                length: 4,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 14,
                length: 3,
                token: Token::Word(Word::Word("too".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 17,
                length: 3,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 20,
                length: 4,
                token: Token::Word(Word::Word("many".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 24,
                length: 3,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 27,
                length: 5,
                token: Token::Word(Word::Word("apces".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 32,
                length: 3,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
        ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
877
    // Negative integers, punctuation, and `MergeWhites` (the two spaces at
    // offset 9 collapse into one separator token of length 2).
    #[test]
    fn numbers() {
        let uws = "(() -2\n()  -2";
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 1,
                token: Token::Special(Special::Punctuation('(')),
            },
            PositionalToken {
                source: uws,
                offset: 1,
                length: 1,
                token: Token::Special(Special::Punctuation('(')),
            },
            PositionalToken {
                source: uws,
                offset: 2,
                length: 1,
                token: Token::Special(Special::Punctuation(')')),
            },
            PositionalToken {
                source: uws,
                offset: 3,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 4,
                length: 2,
                token: Token::Word(Word::Number(Number::Integer(-2))),
            },
            PositionalToken {
                source: uws,
                offset: 6,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Newline)),
            },
            PositionalToken {
                source: uws,
                offset: 7,
                length: 1,
                token: Token::Special(Special::Punctuation('(')),
            },
            PositionalToken {
                source: uws,
                offset: 8,
                length: 1,
                token: Token::Special(Special::Punctuation(')')),
            },
            PositionalToken {
                source: uws,
                offset: 9,
                length: 2,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 11,
                length: 2,
                token: Token::Word(Word::Number(Number::Integer(-2))),
            },
        ];
        let lib_res = uws
            .into_tokenizer({
                TokenizerParams::default()
                    .add_option(TokenizerOptions::SplitDot)
                    .add_option(TokenizerOptions::SplitUnderscore)
                    .add_option(TokenizerOptions::SplitColon)
                    .add_option(TokenizerOptions::MergeWhites)
            })
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
954
    // NOTE(review): the byte lengths here (14 and 28) exceed the visible
    // Cyrillic text ("Опросы" = 12 bytes, "показывают" = 20 bytes), so the
    // literals presumably contain invisible inner characters — per the test
    // name, likely soft hyphens (U+00AD). Verify this literal against version
    // control before editing; losing the invisible characters breaks the test.
    #[test]
    fn word_with_inner_hyphens() {
        let uws = "Опросы показывают";
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 14,
                token: Token::Word(Word::StrangeWord("Опросы".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 14,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 15,
                length: 28,
                token: Token::Word(Word::StrangeWord("показывают".to_string())),
            },
        ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }

    // A word containing a typographic apostrophe (U+2019, 3 bytes) is kept as
    // one `StrangeWord` rather than being split.
    #[test]
    fn mixed_but_word() {
        let uws = "L’Oreal";
        let result = vec![PositionalToken {
            source: uws,
            offset: 0,
            length: 9,
            token: Token::Word(Word::StrangeWord("L’Oreal".to_string())),
        }];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
998
    // Adjacent hashtags split at each '#'; the payload excludes the '#' itself
    // while the token span includes it.
    #[test]
    fn hashtags() {
        let uws = "#hashtag#hashtag2";
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 8,
                token: Token::Struct(Struct::Hashtag("hashtag".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 8,
                length: 9,
                token: Token::Struct(Struct::Hashtag("hashtag2".to_string())),
            },
        ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }

    // With `StructTokens`, an underscore stays inside the hashtag payload.
    #[test]
    fn hashtags2() {
        let uws = "#hashtag#hashtag2 #hash_tag";
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 8,
                token: Token::Struct(Struct::Hashtag("hashtag".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 8,
                length: 9,
                token: Token::Struct(Struct::Hashtag("hashtag2".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 17,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 18,
                length: 9,
                token: Token::Struct(Struct::Hashtag("hash_tag".to_string())),
            },
        ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1().add_option(TokenizerOptions::StructTokens))
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }

    // Mirror of `hashtags2` for '@' mentions.
    #[test]
    fn mention2() {
        let uws = "@hashtag@hashtag2 @hash_tag";
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 8,
                token: Token::Struct(Struct::Mention("hashtag".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 8,
                length: 9,
                token: Token::Struct(Struct::Mention("hashtag2".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 17,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 18,
                length: 9,
                token: Token::Struct(Struct::Mention("hash_tag".to_string())),
            },
        ];
        let lib_res = uws
            .into_tokenizer(TokenizerParams::v1().add_option(TokenizerOptions::StructTokens))
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
1117
    // `inner_new` normalizes backtick (U+0060) to apostrophe (U+0027), so both
    // spellings tokenize to the same `Word("l'oreal")`. Tokens are mapped back
    // through `into_original_token_1` to verify the original localities.
    #[test]
    fn apostrophe() {
        let uws = "l'oreal; l\u{0060}oreal";
        let result = vec![
            PositionalToken {
                source: uws,
                offset: 0,
                length: 7,
                token: Token::Word(Word::Word("l'oreal".to_string())),
            },
            PositionalToken {
                source: uws,
                offset: 7,
                length: 1,
                token: Token::Special(Special::Punctuation(';')),
            },
            PositionalToken {
                source: uws,
                offset: 8,
                length: 1,
                token: Token::Special(Special::Separator(Separator::Space)),
            },
            PositionalToken {
                source: uws,
                offset: 9,
                length: 7,
                token: Token::Word(Word::Word("l'oreal".to_string())),
            },
        ];
        let text = Text::new(uws.into_source()).unwrap();
        let lib_res = text
            .into_tokenizer(TokenizerParams::v1())
            .filter_map(|tt| tt.into_original_token_1())
            .collect::<Vec<_>>();
        check_results(&result, &lib_res, uws);
    }
1154
1155 #[test]
1156 fn char_tokens() {
1157 let uws = "[Oxana Putan|1712640565] shared the quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово";
1158 let result = vec![
1159 CharToken {
1160 byte_offset: 0,
1161 byte_length: 1,
1162 char_offset: 0,
1163 char_length: 1,
1164 token: Token::Special(Special::Punctuation('[')),
1165 },
1166 CharToken {
1167 byte_offset: 1,
1168 byte_length: 5,
1169 char_offset: 1,
1170 char_length: 5,
1171 token: Token::Word(Word::Word("Oxana".to_string())),
1172 },
1173 CharToken {
1174 byte_offset: 6,
1175 byte_length: 1,
1176 char_offset: 6,
1177 char_length: 1,
1178 token: Token::Special(Special::Separator(Separator::Space)),
1179 },
1180 CharToken {
1181 byte_offset: 7,
1182 byte_length: 5,
1183 char_offset: 7,
1184 char_length: 5,
1185 token: Token::Word(Word::Word("Putan".to_string())),
1186 },
1187 CharToken {
1188 byte_offset: 12,
1189 byte_length: 1,
1190 char_offset: 12,
1191 char_length: 1,
1192 token: Token::Special(Special::Punctuation('|')),
1193 },
1194 CharToken {
1195 byte_offset: 13,
1196 byte_length: 10,
1197 char_offset: 13,
1198 char_length: 10,
1199 token: Token::Word(Word::Number(Number::Integer(1712640565))),
1200 },
1201 CharToken {
1202 byte_offset: 23,
1203 byte_length: 1,
1204 char_offset: 23,
1205 char_length: 1,
1206 token: Token::Special(Special::Punctuation(']')),
1207 },
1208 CharToken {
1216 byte_offset: 24,
1217 byte_length: 1,
1218 char_offset: 24,
1219 char_length: 1,
1220 token: Token::Special(Special::Separator(Separator::Space)),
1221 },
1222 CharToken {
1223 byte_offset: 25,
1224 byte_length: 6,
1225 char_offset: 25,
1226 char_length: 6,
1227 token: Token::Word(Word::Word("shared".to_string())),
1228 },
1229 CharToken {
1230 byte_offset: 31,
1231 byte_length: 1,
1232 char_offset: 31,
1233 char_length: 1,
1234 token: Token::Special(Special::Separator(Separator::Space)),
1235 },
1236 CharToken {
1237 byte_offset: 32,
1238 byte_length: 3,
1239 char_offset: 32,
1240 char_length: 3,
1241 token: Token::Word(Word::Word("the".to_string())),
1242 },
1243 CharToken {
1244 byte_offset: 35,
1245 byte_length: 1,
1246 char_offset: 35,
1247 char_length: 1,
1248 token: Token::Special(Special::Separator(Separator::Space)),
1249 },
1250 CharToken {
1251 byte_offset: 36,
1252 byte_length: 5,
1253 char_offset: 36,
1254 char_length: 5,
1255 token: Token::Word(Word::Word("quick".to_string())),
1256 },
1257 CharToken {
1258 byte_offset: 41,
1259 byte_length: 1,
1260 char_offset: 41,
1261 char_length: 1,
1262 token: Token::Special(Special::Separator(Separator::Space)),
1263 },
1264 CharToken {
1265 byte_offset: 42,
1266 byte_length: 1,
1267 char_offset: 42,
1268 char_length: 1,
1269 token: Token::Special(Special::Punctuation('(')),
1270 },
1271 CharToken {
1272 byte_offset: 43,
1273 byte_length: 1,
1274 char_offset: 43,
1275 char_length: 1,
1276 token: Token::Special(Special::Punctuation('"')),
1277 },
1278 CharToken {
1279 byte_offset: 44,
1280 byte_length: 5,
1281 char_offset: 44,
1282 char_length: 5,
1283 token: Token::Word(Word::Word("brown".to_string())),
1284 },
1285 CharToken {
1286 byte_offset: 49,
1287 byte_length: 1,
1288 char_offset: 49,
1289 char_length: 1,
1290 token: Token::Special(Special::Punctuation('"')),
1291 },
1292 CharToken {
1293 byte_offset: 50,
1294 byte_length: 1,
1295 char_offset: 50,
1296 char_length: 1,
1297 token: Token::Special(Special::Punctuation(')')),
1298 },
1299 CharToken {
1300 byte_offset: 51,
1301 byte_length: 1,
1302 char_offset: 51,
1303 char_length: 1,
1304 token: Token::Special(Special::Separator(Separator::Space)),
1305 },
1306 CharToken {
1307 byte_offset: 52,
1308 byte_length: 3,
1309 char_offset: 52,
1310 char_length: 3,
1311 token: Token::Word(Word::Word("fox".to_string())),
1312 },
1313 CharToken {
1314 byte_offset: 55,
1315 byte_length: 1,
1316 char_offset: 55,
1317 char_length: 1,
1318 token: Token::Special(Special::Separator(Separator::Space)),
1319 },
1320 CharToken {
1321 byte_offset: 56,
1322 byte_length: 5,
1323 char_offset: 56,
1324 char_length: 5,
1325 token: Token::Word(Word::Word("can\'t".to_string())),
1326 },
1327 CharToken {
1328 byte_offset: 61,
1329 byte_length: 1,
1330 char_offset: 61,
1331 char_length: 1,
1332 token: Token::Special(Special::Separator(Separator::Space)),
1333 },
1334 CharToken {
1335 byte_offset: 62,
1336 byte_length: 4,
1337 char_offset: 62,
1338 char_length: 4,
1339 token: Token::Word(Word::Word("jump".to_string())),
1340 },
1341 CharToken {
1342 byte_offset: 66,
1343 byte_length: 1,
1344 char_offset: 66,
1345 char_length: 1,
1346 token: Token::Special(Special::Separator(Separator::Space)),
1347 },
1348 CharToken {
1349 byte_offset: 67,
1350 byte_length: 4,
1351 char_offset: 67,
1352 char_length: 4,
1353 token: Token::Word(Word::Number(Number::Float(32.3))),
1354 },
1355 CharToken {
1356 byte_offset: 71,
1357 byte_length: 1,
1358 char_offset: 71,
1359 char_length: 1,
1360 token: Token::Special(Special::Separator(Separator::Space)),
1361 },
1362 CharToken {
1363 byte_offset: 72,
1364 byte_length: 4,
1365 char_offset: 72,
1366 char_length: 4,
1367 token: Token::Word(Word::Word("feet".to_string())),
1368 },
1369 CharToken {
1370 byte_offset: 76,
1371 byte_length: 1,
1372 char_offset: 76,
1373 char_length: 1,
1374 token: Token::Special(Special::Punctuation(',')),
1375 },
1376 CharToken {
1377 byte_offset: 77,
1378 byte_length: 1,
1379 char_offset: 77,
1380 char_length: 1,
1381 token: Token::Special(Special::Separator(Separator::Space)),
1382 },
1383 CharToken {
1384 byte_offset: 78,
1385 byte_length: 5,
1386 char_offset: 78,
1387 char_length: 5,
1388 token: Token::Word(Word::Word("right".to_string())),
1389 },
1390 CharToken {
1391 byte_offset: 83,
1392 byte_length: 1,
1393 char_offset: 83,
1394 char_length: 1,
1395 token: Token::Special(Special::Punctuation('?')),
1396 },
1397 CharToken {
1398 byte_offset: 84,
1399 byte_length: 1,
1400 char_offset: 84,
1401 char_length: 1,
1402 token: Token::Special(Special::Separator(Separator::Space)),
1403 },
1404 CharToken {
1405 byte_offset: 85,
1406 byte_length: 4,
1407 char_offset: 85,
1408 char_length: 4,
1409 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
1410 },
1411 CharToken {
1412 byte_offset: 89,
1413 byte_length: 1,
1414 char_offset: 89,
1415 char_length: 1,
1416 token: Token::Special(Special::Separator(Separator::Space)),
1417 },
1418 CharToken {
1419 byte_offset: 90,
1420 byte_length: 3,
1421 char_offset: 90,
1422 char_length: 3,
1423 token: Token::Word(Word::Word("etc".to_string())),
1424 },
1425 CharToken {
1426 byte_offset: 93,
1427 byte_length: 1,
1428 char_offset: 93,
1429 char_length: 1,
1430 token: Token::Special(Special::Punctuation('.')),
1431 },
1432 CharToken {
1433 byte_offset: 94,
1434 byte_length: 1,
1435 char_offset: 94,
1436 char_length: 1,
1437 token: Token::Special(Special::Separator(Separator::Space)),
1438 },
1439 CharToken {
1440 byte_offset: 95,
1441 byte_length: 3,
1442 char_offset: 95,
1443 char_length: 3,
1444 token: Token::Word(Word::Word("qeq".to_string())),
1445 },
1446 CharToken {
1447 byte_offset: 98,
1448 byte_length: 1,
1449 char_offset: 98,
1450 char_length: 1,
1451 token: Token::Special(Special::Separator(Separator::Space)),
1452 },
1453 CharToken {
1454 byte_offset: 99,
1455 byte_length: 5,
1456 char_offset: 99,
1457 char_length: 5,
1458 token: Token::Word(Word::Word("U.S.A".to_string())),
1459 },
1460 CharToken {
1461 byte_offset: 104,
1462 byte_length: 2,
1463 char_offset: 104,
1464 char_length: 2,
1465 token: Token::Special(Special::Separator(Separator::Space)),
1466 },
1467 CharToken {
1468 byte_offset: 106,
1469 byte_length: 3,
1470 char_offset: 106,
1471 char_length: 3,
1472 token: Token::Word(Word::Word("asd".to_string())),
1473 },
1474 CharToken {
1475 byte_offset: 109,
1476 byte_length: 3,
1477 char_offset: 109,
1478 char_length: 3,
1479 token: Token::Special(Special::Separator(Separator::Newline)),
1480 },
1481 CharToken {
1482 byte_offset: 112,
1483 byte_length: 3,
1484 char_offset: 112,
1485 char_length: 3,
1486 token: Token::Word(Word::Word("Brr".to_string())),
1487 },
1488 CharToken {
1489 byte_offset: 115,
1490 byte_length: 1,
1491 char_offset: 115,
1492 char_length: 1,
1493 token: Token::Special(Special::Punctuation(',')),
1494 },
1495 CharToken {
1496 byte_offset: 116,
1497 byte_length: 1,
1498 char_offset: 116,
1499 char_length: 1,
1500 token: Token::Special(Special::Separator(Separator::Space)),
1501 },
1502 CharToken {
1503 byte_offset: 117,
1504 byte_length: 4,
1505 char_offset: 117,
1506 char_length: 4,
1507 token: Token::Word(Word::Word("it\'s".to_string())),
1508 },
1509 CharToken {
1510 byte_offset: 121,
1511 byte_length: 1,
1512 char_offset: 121,
1513 char_length: 1,
1514 token: Token::Special(Special::Separator(Separator::Space)),
1515 },
1516 CharToken {
1517 byte_offset: 122,
1518 byte_length: 4,
1519 char_offset: 122,
1520 char_length: 4,
1521 token: Token::Word(Word::Number(Number::Float(29.3))),
1522 },
1523 CharToken {
1524 byte_offset: 126,
1525 byte_length: 2,
1526 char_offset: 126,
1527 char_length: 1,
1528 token: Token::Special(Special::Symbol('°')),
1529 },
1530 CharToken {
1531 byte_offset: 128,
1532 byte_length: 1,
1533 char_offset: 127,
1534 char_length: 1,
1535 token: Token::Word(Word::Word("F".to_string())),
1536 },
1537 CharToken {
1538 byte_offset: 129,
1539 byte_length: 1,
1540 char_offset: 128,
1541 char_length: 1,
1542 token: Token::Special(Special::Punctuation('!')),
1543 },
1544 CharToken {
1545 byte_offset: 130,
1546 byte_length: 1,
1547 char_offset: 129,
1548 char_length: 1,
1549 token: Token::Special(Special::Separator(Separator::Newline)),
1550 },
1551 CharToken {
1552 byte_offset: 131,
1553 byte_length: 1,
1554 char_offset: 130,
1555 char_length: 1,
1556 token: Token::Special(Special::Separator(Separator::Space)),
1557 },
1558 CharToken {
1559 byte_offset: 132,
1560 byte_length: 14,
1561 char_offset: 131,
1562 char_length: 7,
1563 token: Token::Word(Word::Word("Русское".to_string())),
1564 },
1565 CharToken {
1566 byte_offset: 146,
1567 byte_length: 1,
1568 char_offset: 138,
1569 char_length: 1,
1570 token: Token::Special(Special::Separator(Separator::Space)),
1571 },
1572 CharToken {
1573 byte_offset: 147,
1574 byte_length: 22,
1575 char_offset: 139,
1576 char_length: 11,
1577 token: Token::Word(Word::Word("предложение".to_string())),
1578 },
1579 CharToken {
1580 byte_offset: 169,
1581 byte_length: 1,
1582 char_offset: 150,
1583 char_length: 1,
1584 token: Token::Special(Special::Separator(Separator::Space)),
1585 },
1586 CharToken {
1587 byte_offset: 170,
1588 byte_length: 5,
1589 char_offset: 151,
1590 char_length: 5,
1591 token: Token::Struct(Struct::Hashtag("36.6".to_string())),
1592 },
1593 CharToken {
1594 byte_offset: 175,
1595 byte_length: 1,
1596 char_offset: 156,
1597 char_length: 1,
1598 token: Token::Special(Special::Separator(Separator::Space)),
1599 },
1600 CharToken {
1601 byte_offset: 176,
1602 byte_length: 6,
1603 char_offset: 157,
1604 char_length: 3,
1605 token: Token::Word(Word::Word("для".to_string())),
1606 },
1607 CharToken {
1608 byte_offset: 182,
1609 byte_length: 1,
1610 char_offset: 160,
1611 char_length: 1,
1612 token: Token::Special(Special::Separator(Separator::Space)),
1613 },
1614 CharToken {
1615 byte_offset: 183,
1616 byte_length: 24,
1617 char_offset: 161,
1618 char_length: 12,
1619 token: Token::Word(Word::Word("тестирования".to_string())),
1620 },
1621 CharToken {
1622 byte_offset: 207,
1623 byte_length: 1,
1624 char_offset: 173,
1625 char_length: 1,
1626 token: Token::Special(Special::Separator(Separator::Space)),
1627 },
1628 CharToken {
1629 byte_offset: 208,
1630 byte_length: 14,
1631 char_offset: 174,
1632 char_length: 7,
1633 token: Token::Word(Word::Word("деления".to_string())),
1634 },
1635 CharToken {
1636 byte_offset: 222,
1637 byte_length: 1,
1638 char_offset: 181,
1639 char_length: 1,
1640 token: Token::Special(Special::Separator(Separator::Space)),
1641 },
1642 CharToken {
1643 byte_offset: 223,
1644 byte_length: 4,
1645 char_offset: 182,
1646 char_length: 2,
1647 token: Token::Word(Word::Word("по".to_string())),
1648 },
1649 CharToken {
1650 byte_offset: 227,
1651 byte_length: 1,
1652 char_offset: 184,
1653 char_length: 1,
1654 token: Token::Special(Special::Separator(Separator::Space)),
1655 },
1656 CharToken {
1657 byte_offset: 228,
1658 byte_length: 12,
1659 char_offset: 185,
1660 char_length: 6,
1661 token: Token::Word(Word::Word("юникод".to_string())),
1662 },
1663 CharToken {
1664 byte_offset: 240,
1665 byte_length: 1,
1666 char_offset: 191,
1667 char_length: 1,
1668 token: Token::Special(Special::Punctuation('-')),
1669 },
1670 CharToken {
1671 byte_offset: 241,
1672 byte_length: 12,
1673 char_offset: 192,
1674 char_length: 6,
1675 token: Token::Word(Word::Word("словам".to_string())),
1676 },
1677 CharToken {
1678 byte_offset: 253,
1679 byte_length: 3,
1680 char_offset: 198,
1681 char_length: 3,
1682 token: Token::Special(Special::Punctuation('.')),
1683 },
1684 CharToken {
1685 byte_offset: 256,
1686 byte_length: 1,
1687 char_offset: 201,
1688 char_length: 1,
1689 token: Token::Special(Special::Separator(Separator::Newline)),
1690 },
1691 CharToken {
1692 byte_offset: 257,
1693 byte_length: 8,
1694 char_offset: 202,
1695 char_length: 2,
1696 token: Token::Word(Word::Emoji("russia")),
1697 },
1698 CharToken {
1699 byte_offset: 265,
1700 byte_length: 1,
1701 char_offset: 204,
1702 char_length: 1,
1703 token: Token::Special(Special::Separator(Separator::Space)),
1704 },
1705 CharToken {
1706 byte_offset: 266,
1707 byte_length: 8,
1708 char_offset: 205,
1709 char_length: 2,
1710 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
1711 },
1712 CharToken {
1713 byte_offset: 274,
1714 byte_length: 1,
1715 char_offset: 207,
1716 char_length: 1,
1717 token: Token::Special(Special::Separator(Separator::Newline)),
1718 },
1719 CharToken {
1720 byte_offset: 275,
1721 byte_length: 8,
1722 char_offset: 208,
1723 char_length: 2,
1724 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
1725 },
1726 CharToken {
1727 byte_offset: 283,
1728 byte_length: 8,
1729 char_offset: 210,
1730 char_length: 2,
1731 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
1732 },
1733 CharToken {
1734 byte_offset: 291,
1735 byte_length: 8,
1736 char_offset: 212,
1737 char_length: 2,
1738 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
1739 },
1740 CharToken {
1741 byte_offset: 299,
1742 byte_length: 1,
1743 char_offset: 214,
1744 char_length: 1,
1745 token: Token::Special(Special::Separator(Separator::Newline)),
1746 },
1747 CharToken {
1748 byte_offset: 300,
1749 byte_length: 1,
1750 char_offset: 215,
1751 char_length: 1,
1752 token: Token::Special(Special::Punctuation('+')),
1753 },
1754 CharToken {
1755 byte_offset: 301,
1756 byte_length: 4,
1757 char_offset: 216,
1758 char_length: 4,
1759 token: Token::Word(Word::Word("Done".to_string())),
1760 },
1761 CharToken {
1762 byte_offset: 305,
1763 byte_length: 1,
1764 char_offset: 220,
1765 char_length: 1,
1766 token: Token::Special(Special::Punctuation('!')),
1767 },
1768 CharToken {
1769 byte_offset: 306,
1770 byte_length: 1,
1771 char_offset: 221,
1772 char_length: 1,
1773 token: Token::Special(Special::Separator(Separator::Space)),
1774 },
1775 CharToken {
1776 byte_offset: 307,
1777 byte_length: 12,
1778 char_offset: 222,
1779 char_length: 6,
1780 token: Token::Word(Word::Word("Готово".to_string())),
1781 },
1782 ];
1783
1784 let lib_res = uws
1785 .into_tokenizer(TokenizerParams::complex())
1786 .collect::<Vec<_>>();
1787
1788 check_cresults(&result, &lib_res, uws);
1790 }
1791
1792 #[test]
1793 fn general_default() {
1794 let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
1795 let result = vec![
1796 PositionalToken {
1797 source: uws,
1798 offset: 0,
1799 length: 3,
1800 token: Token::Word(Word::Word("The".to_string())),
1801 },
1802 PositionalToken {
1803 source: uws,
1804 offset: 3,
1805 length: 1,
1806 token: Token::Special(Special::Separator(Separator::Space)),
1807 },
1808 PositionalToken {
1809 source: uws,
1810 offset: 4,
1811 length: 5,
1812 token: Token::Word(Word::Word("quick".to_string())),
1813 },
1814 PositionalToken {
1815 source: uws,
1816 offset: 9,
1817 length: 1,
1818 token: Token::Special(Special::Separator(Separator::Space)),
1819 },
1820 PositionalToken {
1821 source: uws,
1822 offset: 10,
1823 length: 1,
1824 token: Token::Special(Special::Punctuation('(')),
1825 },
1826 PositionalToken {
1827 source: uws,
1828 offset: 11,
1829 length: 1,
1830 token: Token::Special(Special::Punctuation('"')),
1831 },
1832 PositionalToken {
1833 source: uws,
1834 offset: 12,
1835 length: 5,
1836 token: Token::Word(Word::Word("brown".to_string())),
1837 },
1838 PositionalToken {
1839 source: uws,
1840 offset: 17,
1841 length: 1,
1842 token: Token::Special(Special::Punctuation('"')),
1843 },
1844 PositionalToken {
1845 source: uws,
1846 offset: 18,
1847 length: 1,
1848 token: Token::Special(Special::Punctuation(')')),
1849 },
1850 PositionalToken {
1851 source: uws,
1852 offset: 19,
1853 length: 1,
1854 token: Token::Special(Special::Separator(Separator::Space)),
1855 },
1856 PositionalToken {
1857 source: uws,
1858 offset: 20,
1859 length: 3,
1860 token: Token::Word(Word::Word("fox".to_string())),
1861 },
1862 PositionalToken {
1863 source: uws,
1864 offset: 23,
1865 length: 1,
1866 token: Token::Special(Special::Separator(Separator::Space)),
1867 },
1868 PositionalToken {
1869 source: uws,
1870 offset: 24,
1871 length: 5,
1872 token: Token::Word(Word::Word("can\'t".to_string())),
1873 },
1874 PositionalToken {
1875 source: uws,
1876 offset: 29,
1877 length: 1,
1878 token: Token::Special(Special::Separator(Separator::Space)),
1879 },
1880 PositionalToken {
1881 source: uws,
1882 offset: 30,
1883 length: 4,
1884 token: Token::Word(Word::Word("jump".to_string())),
1885 },
1886 PositionalToken {
1887 source: uws,
1888 offset: 34,
1889 length: 1,
1890 token: Token::Special(Special::Separator(Separator::Space)),
1891 },
1892 PositionalToken {
1893 source: uws,
1894 offset: 35,
1895 length: 4,
1896 token: Token::Word(Word::Number(Number::Float(32.3))),
1897 },
1898 PositionalToken {
1899 source: uws,
1900 offset: 39,
1901 length: 1,
1902 token: Token::Special(Special::Separator(Separator::Space)),
1903 },
1904 PositionalToken {
1905 source: uws,
1906 offset: 40,
1907 length: 4,
1908 token: Token::Word(Word::Word("feet".to_string())),
1909 },
1910 PositionalToken {
1911 source: uws,
1912 offset: 44,
1913 length: 1,
1914 token: Token::Special(Special::Punctuation(',')),
1915 },
1916 PositionalToken {
1917 source: uws,
1918 offset: 45,
1919 length: 1,
1920 token: Token::Special(Special::Separator(Separator::Space)),
1921 },
1922 PositionalToken {
1923 source: uws,
1924 offset: 46,
1925 length: 5,
1926 token: Token::Word(Word::Word("right".to_string())),
1927 },
1928 PositionalToken {
1929 source: uws,
1930 offset: 51,
1931 length: 1,
1932 token: Token::Special(Special::Punctuation('?')),
1933 },
1934 PositionalToken {
1935 source: uws,
1936 offset: 52,
1937 length: 1,
1938 token: Token::Special(Special::Separator(Separator::Space)),
1939 },
1940 PositionalToken {
1941 source: uws,
1942 offset: 53,
1943 length: 4,
1944 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
1945 }, PositionalToken {
1947 source: uws,
1948 offset: 57,
1949 length: 1,
1950 token: Token::Special(Special::Separator(Separator::Space)),
1951 },
1952 PositionalToken {
1953 source: uws,
1954 offset: 58,
1955 length: 3,
1956 token: Token::Word(Word::Word("etc".to_string())),
1957 },
1958 PositionalToken {
1959 source: uws,
1960 offset: 61,
1961 length: 1,
1962 token: Token::Special(Special::Punctuation('.')),
1963 },
1964 PositionalToken {
1965 source: uws,
1966 offset: 62,
1967 length: 1,
1968 token: Token::Special(Special::Separator(Separator::Space)),
1969 },
1970 PositionalToken {
1971 source: uws,
1972 offset: 63,
1973 length: 3,
1974 token: Token::Word(Word::Word("qeq".to_string())),
1975 },
1976 PositionalToken {
1977 source: uws,
1978 offset: 66,
1979 length: 1,
1980 token: Token::Special(Special::Separator(Separator::Space)),
1981 },
1982 PositionalToken {
1983 source: uws,
1984 offset: 67,
1985 length: 1,
1986 token: Token::Word(Word::Word("U".to_string())),
1987 },
1988 PositionalToken {
1989 source: uws,
1990 offset: 68,
1991 length: 1,
1992 token: Token::Special(Special::Punctuation('.')),
1993 },
1994 PositionalToken {
1995 source: uws,
1996 offset: 69,
1997 length: 1,
1998 token: Token::Word(Word::Word("S".to_string())),
1999 },
2000 PositionalToken {
2001 source: uws,
2002 offset: 70,
2003 length: 1,
2004 token: Token::Special(Special::Punctuation('.')),
2005 },
2006 PositionalToken {
2007 source: uws,
2008 offset: 71,
2009 length: 1,
2010 token: Token::Word(Word::Word("A".to_string())),
2011 },
2012 PositionalToken {
2013 source: uws,
2014 offset: 72,
2015 length: 2,
2016 token: Token::Special(Special::Separator(Separator::Space)),
2017 },
2018 PositionalToken {
2019 source: uws,
2020 offset: 74,
2021 length: 3,
2022 token: Token::Word(Word::Word("asd".to_string())),
2023 },
2024 PositionalToken {
2025 source: uws,
2026 offset: 77,
2027 length: 3,
2028 token: Token::Special(Special::Separator(Separator::Newline)),
2029 },
2030 PositionalToken {
2031 source: uws,
2032 offset: 80,
2033 length: 3,
2034 token: Token::Word(Word::Word("Brr".to_string())),
2035 },
2036 PositionalToken {
2037 source: uws,
2038 offset: 83,
2039 length: 1,
2040 token: Token::Special(Special::Punctuation(',')),
2041 },
2042 PositionalToken {
2043 source: uws,
2044 offset: 84,
2045 length: 1,
2046 token: Token::Special(Special::Separator(Separator::Space)),
2047 },
2048 PositionalToken {
2049 source: uws,
2050 offset: 85,
2051 length: 4,
2052 token: Token::Word(Word::Word("it\'s".to_string())),
2053 },
2054 PositionalToken {
2055 source: uws,
2056 offset: 89,
2057 length: 1,
2058 token: Token::Special(Special::Separator(Separator::Space)),
2059 },
2060 PositionalToken {
2061 source: uws,
2062 offset: 90,
2063 length: 4,
2064 token: Token::Word(Word::Number(Number::Float(29.3))),
2065 },
2066 PositionalToken {
2067 source: uws,
2068 offset: 94,
2069 length: 2,
2070 token: Token::Special(Special::Symbol('°')),
2071 },
2072 PositionalToken {
2073 source: uws,
2074 offset: 96,
2075 length: 1,
2076 token: Token::Word(Word::Word("F".to_string())),
2077 },
2078 PositionalToken {
2079 source: uws,
2080 offset: 97,
2081 length: 1,
2082 token: Token::Special(Special::Punctuation('!')),
2083 },
2084 PositionalToken {
2085 source: uws,
2086 offset: 98,
2087 length: 1,
2088 token: Token::Special(Special::Separator(Separator::Newline)),
2089 },
2090 PositionalToken {
2091 source: uws,
2092 offset: 99,
2093 length: 1,
2094 token: Token::Special(Special::Separator(Separator::Space)),
2095 },
2096 PositionalToken {
2097 source: uws,
2098 offset: 100,
2099 length: 14,
2100 token: Token::Word(Word::Word("Русское".to_string())),
2101 },
2102 PositionalToken {
2103 source: uws,
2104 offset: 114,
2105 length: 1,
2106 token: Token::Special(Special::Separator(Separator::Space)),
2107 },
2108 PositionalToken {
2109 source: uws,
2110 offset: 115,
2111 length: 22,
2112 token: Token::Word(Word::Word("предложение".to_string())),
2113 },
2114 PositionalToken {
2115 source: uws,
2116 offset: 137,
2117 length: 1,
2118 token: Token::Special(Special::Separator(Separator::Space)),
2119 },
2120 PositionalToken {
2121 source: uws,
2122 offset: 138,
2123 length: 5,
2124 token: Token::Struct(Struct::Hashtag("36.6".to_string())),
2125 },
2126 PositionalToken {
2139 source: uws,
2140 offset: 143,
2141 length: 1,
2142 token: Token::Special(Special::Separator(Separator::Space)),
2143 },
2144 PositionalToken {
2145 source: uws,
2146 offset: 144,
2147 length: 6,
2148 token: Token::Word(Word::Word("для".to_string())),
2149 },
2150 PositionalToken {
2151 source: uws,
2152 offset: 150,
2153 length: 1,
2154 token: Token::Special(Special::Separator(Separator::Space)),
2155 },
2156 PositionalToken {
2157 source: uws,
2158 offset: 151,
2159 length: 24,
2160 token: Token::Word(Word::Word("тестирования".to_string())),
2161 },
2162 PositionalToken {
2163 source: uws,
2164 offset: 175,
2165 length: 1,
2166 token: Token::Special(Special::Separator(Separator::Space)),
2167 },
2168 PositionalToken {
2169 source: uws,
2170 offset: 176,
2171 length: 14,
2172 token: Token::Word(Word::Word("деления".to_string())),
2173 },
2174 PositionalToken {
2175 source: uws,
2176 offset: 190,
2177 length: 1,
2178 token: Token::Special(Special::Separator(Separator::Space)),
2179 },
2180 PositionalToken {
2181 source: uws,
2182 offset: 191,
2183 length: 4,
2184 token: Token::Word(Word::Word("по".to_string())),
2185 },
2186 PositionalToken {
2187 source: uws,
2188 offset: 195,
2189 length: 1,
2190 token: Token::Special(Special::Separator(Separator::Space)),
2191 },
2192 PositionalToken {
2193 source: uws,
2194 offset: 196,
2195 length: 12,
2196 token: Token::Word(Word::Word("юникод".to_string())),
2197 },
2198 PositionalToken {
2199 source: uws,
2200 offset: 208,
2201 length: 1,
2202 token: Token::Special(Special::Punctuation('-')),
2203 },
2204 PositionalToken {
2205 source: uws,
2206 offset: 209,
2207 length: 12,
2208 token: Token::Word(Word::Word("словам".to_string())),
2209 },
2210 PositionalToken {
2211 source: uws,
2212 offset: 221,
2213 length: 3,
2214 token: Token::Special(Special::Punctuation('.')),
2215 },
2216 PositionalToken {
2217 source: uws,
2218 offset: 224,
2219 length: 1,
2220 token: Token::Special(Special::Separator(Separator::Newline)),
2221 },
2222 ];
2223 let lib_res = uws
2224 .into_tokenizer(TokenizerParams::v1())
2225 .collect::<Vec<_>>();
2226 check_results(&result, &lib_res, uws);
2227 }
2228
2229 #[test]
2230 fn general_no_split() {
2231 let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
2232 let result = vec![
2233 PositionalToken {
2234 source: uws,
2235 offset: 0,
2236 length: 3,
2237 token: Token::Word(Word::Word("The".to_string())),
2238 },
2239 PositionalToken {
2240 source: uws,
2241 offset: 3,
2242 length: 1,
2243 token: Token::Special(Special::Separator(Separator::Space)),
2244 },
2245 PositionalToken {
2246 source: uws,
2247 offset: 4,
2248 length: 5,
2249 token: Token::Word(Word::Word("quick".to_string())),
2250 },
2251 PositionalToken {
2252 source: uws,
2253 offset: 9,
2254 length: 1,
2255 token: Token::Special(Special::Separator(Separator::Space)),
2256 },
2257 PositionalToken {
2258 source: uws,
2259 offset: 10,
2260 length: 1,
2261 token: Token::Special(Special::Punctuation('(')),
2262 },
2263 PositionalToken {
2264 source: uws,
2265 offset: 11,
2266 length: 1,
2267 token: Token::Special(Special::Punctuation('"')),
2268 },
2269 PositionalToken {
2270 source: uws,
2271 offset: 12,
2272 length: 5,
2273 token: Token::Word(Word::Word("brown".to_string())),
2274 },
2275 PositionalToken {
2276 source: uws,
2277 offset: 17,
2278 length: 1,
2279 token: Token::Special(Special::Punctuation('"')),
2280 },
2281 PositionalToken {
2282 source: uws,
2283 offset: 18,
2284 length: 1,
2285 token: Token::Special(Special::Punctuation(')')),
2286 },
2287 PositionalToken {
2288 source: uws,
2289 offset: 19,
2290 length: 1,
2291 token: Token::Special(Special::Separator(Separator::Space)),
2292 },
2293 PositionalToken {
2294 source: uws,
2295 offset: 20,
2296 length: 3,
2297 token: Token::Word(Word::Word("fox".to_string())),
2298 },
2299 PositionalToken {
2300 source: uws,
2301 offset: 23,
2302 length: 1,
2303 token: Token::Special(Special::Separator(Separator::Space)),
2304 },
2305 PositionalToken {
2306 source: uws,
2307 offset: 24,
2308 length: 5,
2309 token: Token::Word(Word::Word("can\'t".to_string())),
2310 },
2311 PositionalToken {
2312 source: uws,
2313 offset: 29,
2314 length: 1,
2315 token: Token::Special(Special::Separator(Separator::Space)),
2316 },
2317 PositionalToken {
2318 source: uws,
2319 offset: 30,
2320 length: 4,
2321 token: Token::Word(Word::Word("jump".to_string())),
2322 },
2323 PositionalToken {
2324 source: uws,
2325 offset: 34,
2326 length: 1,
2327 token: Token::Special(Special::Separator(Separator::Space)),
2328 },
2329 PositionalToken {
2330 source: uws,
2331 offset: 35,
2332 length: 4,
2333 token: Token::Word(Word::Number(Number::Float(32.3))),
2334 },
2335 PositionalToken {
2336 source: uws,
2337 offset: 39,
2338 length: 1,
2339 token: Token::Special(Special::Separator(Separator::Space)),
2340 },
2341 PositionalToken {
2342 source: uws,
2343 offset: 40,
2344 length: 4,
2345 token: Token::Word(Word::Word("feet".to_string())),
2346 },
2347 PositionalToken {
2348 source: uws,
2349 offset: 44,
2350 length: 1,
2351 token: Token::Special(Special::Punctuation(',')),
2352 },
2353 PositionalToken {
2354 source: uws,
2355 offset: 45,
2356 length: 1,
2357 token: Token::Special(Special::Separator(Separator::Space)),
2358 },
2359 PositionalToken {
2360 source: uws,
2361 offset: 46,
2362 length: 5,
2363 token: Token::Word(Word::Word("right".to_string())),
2364 },
2365 PositionalToken {
2366 source: uws,
2367 offset: 51,
2368 length: 1,
2369 token: Token::Special(Special::Punctuation('?')),
2370 },
2371 PositionalToken {
2372 source: uws,
2373 offset: 52,
2374 length: 1,
2375 token: Token::Special(Special::Separator(Separator::Space)),
2376 },
2377 PositionalToken {
2378 source: uws,
2379 offset: 53,
2380 length: 4,
2381 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
2382 }, PositionalToken {
2384 source: uws,
2385 offset: 57,
2386 length: 1,
2387 token: Token::Special(Special::Separator(Separator::Space)),
2388 },
2389 PositionalToken {
2390 source: uws,
2391 offset: 58,
2392 length: 3,
2393 token: Token::Word(Word::Word("etc".to_string())),
2394 },
2395 PositionalToken {
2396 source: uws,
2397 offset: 61,
2398 length: 1,
2399 token: Token::Special(Special::Punctuation('.')),
2400 },
2401 PositionalToken {
2402 source: uws,
2403 offset: 62,
2404 length: 1,
2405 token: Token::Special(Special::Separator(Separator::Space)),
2406 },
2407 PositionalToken {
2408 source: uws,
2409 offset: 63,
2410 length: 3,
2411 token: Token::Word(Word::Word("qeq".to_string())),
2412 },
2413 PositionalToken {
2414 source: uws,
2415 offset: 66,
2416 length: 1,
2417 token: Token::Special(Special::Separator(Separator::Space)),
2418 },
2419 PositionalToken {
2420 source: uws,
2421 offset: 67,
2422 length: 5,
2423 token: Token::Word(Word::Word("U.S.A".to_string())),
2424 },
2425 PositionalToken {
2426 source: uws,
2427 offset: 72,
2428 length: 1,
2429 token: Token::Special(Special::Separator(Separator::Space)),
2430 },
2431 PositionalToken {
2432 source: uws,
2433 offset: 73,
2434 length: 1,
2435 token: Token::Special(Special::Separator(Separator::Space)),
2436 },
2437 PositionalToken {
2438 source: uws,
2439 offset: 74,
2440 length: 3,
2441 token: Token::Word(Word::Word("asd".to_string())),
2442 },
2443 PositionalToken {
2444 source: uws,
2445 offset: 77,
2446 length: 1,
2447 token: Token::Special(Special::Separator(Separator::Newline)),
2448 },
2449 PositionalToken {
2450 source: uws,
2451 offset: 78,
2452 length: 1,
2453 token: Token::Special(Special::Separator(Separator::Newline)),
2454 },
2455 PositionalToken {
2456 source: uws,
2457 offset: 79,
2458 length: 1,
2459 token: Token::Special(Special::Separator(Separator::Newline)),
2460 },
2461 PositionalToken {
2462 source: uws,
2463 offset: 80,
2464 length: 3,
2465 token: Token::Word(Word::Word("Brr".to_string())),
2466 },
2467 PositionalToken {
2468 source: uws,
2469 offset: 83,
2470 length: 1,
2471 token: Token::Special(Special::Punctuation(',')),
2472 },
2473 PositionalToken {
2474 source: uws,
2475 offset: 84,
2476 length: 1,
2477 token: Token::Special(Special::Separator(Separator::Space)),
2478 },
2479 PositionalToken {
2480 source: uws,
2481 offset: 85,
2482 length: 4,
2483 token: Token::Word(Word::Word("it\'s".to_string())),
2484 },
2485 PositionalToken {
2486 source: uws,
2487 offset: 89,
2488 length: 1,
2489 token: Token::Special(Special::Separator(Separator::Space)),
2490 },
2491 PositionalToken {
2492 source: uws,
2493 offset: 90,
2494 length: 4,
2495 token: Token::Word(Word::Number(Number::Float(29.3))),
2496 },
2497 PositionalToken {
2498 source: uws,
2499 offset: 94,
2500 length: 2,
2501 token: Token::Special(Special::Symbol('°')),
2502 },
2503 PositionalToken {
2504 source: uws,
2505 offset: 96,
2506 length: 1,
2507 token: Token::Word(Word::Word("F".to_string())),
2508 },
2509 PositionalToken {
2510 source: uws,
2511 offset: 97,
2512 length: 1,
2513 token: Token::Special(Special::Punctuation('!')),
2514 },
2515 PositionalToken {
2516 source: uws,
2517 offset: 98,
2518 length: 1,
2519 token: Token::Special(Special::Separator(Separator::Newline)),
2520 },
2521 PositionalToken {
2522 source: uws,
2523 offset: 99,
2524 length: 1,
2525 token: Token::Special(Special::Separator(Separator::Space)),
2526 },
2527 PositionalToken {
2528 source: uws,
2529 offset: 100,
2530 length: 14,
2531 token: Token::Word(Word::Word("Русское".to_string())),
2532 },
2533 PositionalToken {
2534 source: uws,
2535 offset: 114,
2536 length: 1,
2537 token: Token::Special(Special::Separator(Separator::Space)),
2538 },
2539 PositionalToken {
2540 source: uws,
2541 offset: 115,
2542 length: 22,
2543 token: Token::Word(Word::Word("предложение".to_string())),
2544 },
2545 PositionalToken {
2546 source: uws,
2547 offset: 137,
2548 length: 1,
2549 token: Token::Special(Special::Separator(Separator::Space)),
2550 },
2551 PositionalToken {
2552 source: uws,
2553 offset: 138,
2554 length: 1,
2555 token: Token::Special(Special::Punctuation('#')),
2556 },
2557 PositionalToken {
2558 source: uws,
2559 offset: 139,
2560 length: 4,
2561 token: Token::Word(Word::Number(Number::Float(36.6))),
2562 },
2563 PositionalToken {
2564 source: uws,
2565 offset: 143,
2566 length: 1,
2567 token: Token::Special(Special::Separator(Separator::Space)),
2568 },
2569 PositionalToken {
2570 source: uws,
2571 offset: 144,
2572 length: 6,
2573 token: Token::Word(Word::Word("для".to_string())),
2574 },
2575 PositionalToken {
2576 source: uws,
2577 offset: 150,
2578 length: 1,
2579 token: Token::Special(Special::Separator(Separator::Space)),
2580 },
2581 PositionalToken {
2582 source: uws,
2583 offset: 151,
2584 length: 24,
2585 token: Token::Word(Word::Word("тестирования".to_string())),
2586 },
2587 PositionalToken {
2588 source: uws,
2589 offset: 175,
2590 length: 1,
2591 token: Token::Special(Special::Separator(Separator::Space)),
2592 },
2593 PositionalToken {
2594 source: uws,
2595 offset: 176,
2596 length: 14,
2597 token: Token::Word(Word::Word("деления".to_string())),
2598 },
2599 PositionalToken {
2600 source: uws,
2601 offset: 190,
2602 length: 1,
2603 token: Token::Special(Special::Separator(Separator::Space)),
2604 },
2605 PositionalToken {
2606 source: uws,
2607 offset: 191,
2608 length: 4,
2609 token: Token::Word(Word::Word("по".to_string())),
2610 },
2611 PositionalToken {
2612 source: uws,
2613 offset: 195,
2614 length: 1,
2615 token: Token::Special(Special::Separator(Separator::Space)),
2616 },
2617 PositionalToken {
2618 source: uws,
2619 offset: 196,
2620 length: 12,
2621 token: Token::Word(Word::Word("юникод".to_string())),
2622 },
2623 PositionalToken {
2624 source: uws,
2625 offset: 208,
2626 length: 1,
2627 token: Token::Special(Special::Punctuation('-')),
2628 },
2629 PositionalToken {
2630 source: uws,
2631 offset: 209,
2632 length: 12,
2633 token: Token::Word(Word::Word("словам".to_string())),
2634 },
2635 PositionalToken {
2636 source: uws,
2637 offset: 221,
2638 length: 1,
2639 token: Token::Special(Special::Punctuation('.')),
2640 },
2641 PositionalToken {
2642 source: uws,
2643 offset: 222,
2644 length: 1,
2645 token: Token::Special(Special::Punctuation('.')),
2646 },
2647 PositionalToken {
2648 source: uws,
2649 offset: 223,
2650 length: 1,
2651 token: Token::Special(Special::Punctuation('.')),
2652 },
2653 PositionalToken {
2654 source: uws,
2655 offset: 224,
2656 length: 1,
2657 token: Token::Special(Special::Separator(Separator::Newline)),
2658 },
2659 ];
2660 let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
2661 check_results(&result, &lib_res, uws);
2662 }
2663
2664 #[test]
2665 fn general_complex() {
2666 let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
2667 let result = vec![
2668 PositionalToken {
2669 source: uws,
2670 offset: 0,
2671 length: 3,
2672 token: Token::Word(Word::Word("The".to_string())),
2673 },
2674 PositionalToken {
2675 source: uws,
2676 offset: 3,
2677 length: 1,
2678 token: Token::Special(Special::Separator(Separator::Space)),
2679 },
2680 PositionalToken {
2681 source: uws,
2682 offset: 4,
2683 length: 5,
2684 token: Token::Word(Word::Word("quick".to_string())),
2685 },
2686 PositionalToken {
2687 source: uws,
2688 offset: 9,
2689 length: 1,
2690 token: Token::Special(Special::Separator(Separator::Space)),
2691 },
2692 PositionalToken {
2693 source: uws,
2694 offset: 10,
2695 length: 1,
2696 token: Token::Special(Special::Punctuation('(')),
2697 },
2698 PositionalToken {
2699 source: uws,
2700 offset: 11,
2701 length: 1,
2702 token: Token::Special(Special::Punctuation('"')),
2703 },
2704 PositionalToken {
2705 source: uws,
2706 offset: 12,
2707 length: 5,
2708 token: Token::Word(Word::Word("brown".to_string())),
2709 },
2710 PositionalToken {
2711 source: uws,
2712 offset: 17,
2713 length: 1,
2714 token: Token::Special(Special::Punctuation('"')),
2715 },
2716 PositionalToken {
2717 source: uws,
2718 offset: 18,
2719 length: 1,
2720 token: Token::Special(Special::Punctuation(')')),
2721 },
2722 PositionalToken {
2723 source: uws,
2724 offset: 19,
2725 length: 1,
2726 token: Token::Special(Special::Separator(Separator::Space)),
2727 },
2728 PositionalToken {
2729 source: uws,
2730 offset: 20,
2731 length: 3,
2732 token: Token::Word(Word::Word("fox".to_string())),
2733 },
2734 PositionalToken {
2735 source: uws,
2736 offset: 23,
2737 length: 1,
2738 token: Token::Special(Special::Separator(Separator::Space)),
2739 },
2740 PositionalToken {
2741 source: uws,
2742 offset: 24,
2743 length: 5,
2744 token: Token::Word(Word::Word("can\'t".to_string())),
2745 },
2746 PositionalToken {
2747 source: uws,
2748 offset: 29,
2749 length: 1,
2750 token: Token::Special(Special::Separator(Separator::Space)),
2751 },
2752 PositionalToken {
2753 source: uws,
2754 offset: 30,
2755 length: 4,
2756 token: Token::Word(Word::Word("jump".to_string())),
2757 },
2758 PositionalToken {
2759 source: uws,
2760 offset: 34,
2761 length: 1,
2762 token: Token::Special(Special::Separator(Separator::Space)),
2763 },
2764 PositionalToken {
2765 source: uws,
2766 offset: 35,
2767 length: 4,
2768 token: Token::Word(Word::Number(Number::Float(32.3))),
2769 },
2770 PositionalToken {
2771 source: uws,
2772 offset: 39,
2773 length: 1,
2774 token: Token::Special(Special::Separator(Separator::Space)),
2775 },
2776 PositionalToken {
2777 source: uws,
2778 offset: 40,
2779 length: 4,
2780 token: Token::Word(Word::Word("feet".to_string())),
2781 },
2782 PositionalToken {
2783 source: uws,
2784 offset: 44,
2785 length: 1,
2786 token: Token::Special(Special::Punctuation(',')),
2787 },
2788 PositionalToken {
2789 source: uws,
2790 offset: 45,
2791 length: 1,
2792 token: Token::Special(Special::Separator(Separator::Space)),
2793 },
2794 PositionalToken {
2795 source: uws,
2796 offset: 46,
2797 length: 5,
2798 token: Token::Word(Word::Word("right".to_string())),
2799 },
2800 PositionalToken {
2801 source: uws,
2802 offset: 51,
2803 length: 1,
2804 token: Token::Special(Special::Punctuation('?')),
2805 },
2806 PositionalToken {
2807 source: uws,
2808 offset: 52,
2809 length: 1,
2810 token: Token::Special(Special::Separator(Separator::Space)),
2811 },
2812 PositionalToken {
2813 source: uws,
2814 offset: 53,
2815 length: 4,
2816 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
2817 }, PositionalToken {
2819 source: uws,
2820 offset: 57,
2821 length: 1,
2822 token: Token::Special(Special::Separator(Separator::Space)),
2823 },
2824 PositionalToken {
2825 source: uws,
2826 offset: 58,
2827 length: 3,
2828 token: Token::Word(Word::Word("etc".to_string())),
2829 },
2830 PositionalToken {
2831 source: uws,
2832 offset: 61,
2833 length: 1,
2834 token: Token::Special(Special::Punctuation('.')),
2835 },
2836 PositionalToken {
2837 source: uws,
2838 offset: 62,
2839 length: 1,
2840 token: Token::Special(Special::Separator(Separator::Space)),
2841 },
2842 PositionalToken {
2843 source: uws,
2844 offset: 63,
2845 length: 3,
2846 token: Token::Word(Word::Word("qeq".to_string())),
2847 },
2848 PositionalToken {
2849 source: uws,
2850 offset: 66,
2851 length: 1,
2852 token: Token::Special(Special::Separator(Separator::Space)),
2853 },
2854 PositionalToken {
2855 source: uws,
2856 offset: 67,
2857 length: 5,
2858 token: Token::Word(Word::Word("U.S.A".to_string())),
2859 },
2860 PositionalToken {
2861 source: uws,
2862 offset: 72,
2863 length: 2,
2864 token: Token::Special(Special::Separator(Separator::Space)),
2865 },
2866 PositionalToken {
2867 source: uws,
2868 offset: 74,
2869 length: 3,
2870 token: Token::Word(Word::Word("asd".to_string())),
2871 },
2872 PositionalToken {
2873 source: uws,
2874 offset: 77,
2875 length: 3,
2876 token: Token::Special(Special::Separator(Separator::Newline)),
2877 },
2878 PositionalToken {
2879 source: uws,
2880 offset: 80,
2881 length: 3,
2882 token: Token::Word(Word::Word("Brr".to_string())),
2883 },
2884 PositionalToken {
2885 source: uws,
2886 offset: 83,
2887 length: 1,
2888 token: Token::Special(Special::Punctuation(',')),
2889 },
2890 PositionalToken {
2891 source: uws,
2892 offset: 84,
2893 length: 1,
2894 token: Token::Special(Special::Separator(Separator::Space)),
2895 },
2896 PositionalToken {
2897 source: uws,
2898 offset: 85,
2899 length: 4,
2900 token: Token::Word(Word::Word("it\'s".to_string())),
2901 },
2902 PositionalToken {
2903 source: uws,
2904 offset: 89,
2905 length: 1,
2906 token: Token::Special(Special::Separator(Separator::Space)),
2907 },
2908 PositionalToken {
2909 source: uws,
2910 offset: 90,
2911 length: 4,
2912 token: Token::Word(Word::Number(Number::Float(29.3))),
2913 },
2914 PositionalToken {
2915 source: uws,
2916 offset: 94,
2917 length: 2,
2918 token: Token::Special(Special::Symbol('°')),
2919 },
2920 PositionalToken {
2921 source: uws,
2922 offset: 96,
2923 length: 1,
2924 token: Token::Word(Word::Word("F".to_string())),
2925 },
2926 PositionalToken {
2927 source: uws,
2928 offset: 97,
2929 length: 1,
2930 token: Token::Special(Special::Punctuation('!')),
2931 },
2932 PositionalToken {
2933 source: uws,
2934 offset: 98,
2935 length: 1,
2936 token: Token::Special(Special::Separator(Separator::Newline)),
2937 },
2938 PositionalToken {
2939 source: uws,
2940 offset: 99,
2941 length: 1,
2942 token: Token::Special(Special::Separator(Separator::Space)),
2943 },
2944 PositionalToken {
2945 source: uws,
2946 offset: 100,
2947 length: 14,
2948 token: Token::Word(Word::Word("Русское".to_string())),
2949 },
2950 PositionalToken {
2951 source: uws,
2952 offset: 114,
2953 length: 1,
2954 token: Token::Special(Special::Separator(Separator::Space)),
2955 },
2956 PositionalToken {
2957 source: uws,
2958 offset: 115,
2959 length: 22,
2960 token: Token::Word(Word::Word("предложение".to_string())),
2961 },
2962 PositionalToken {
2963 source: uws,
2964 offset: 137,
2965 length: 1,
2966 token: Token::Special(Special::Separator(Separator::Space)),
2967 },
2968 PositionalToken {
2969 source: uws,
2970 offset: 138,
2971 length: 5,
2972 token: Token::Struct(Struct::Hashtag("36.6".to_string())),
2973 },
2974 PositionalToken {
2975 source: uws,
2976 offset: 143,
2977 length: 1,
2978 token: Token::Special(Special::Separator(Separator::Space)),
2979 },
2980 PositionalToken {
2981 source: uws,
2982 offset: 144,
2983 length: 6,
2984 token: Token::Word(Word::Word("для".to_string())),
2985 },
2986 PositionalToken {
2987 source: uws,
2988 offset: 150,
2989 length: 1,
2990 token: Token::Special(Special::Separator(Separator::Space)),
2991 },
2992 PositionalToken {
2993 source: uws,
2994 offset: 151,
2995 length: 24,
2996 token: Token::Word(Word::Word("тестирования".to_string())),
2997 },
2998 PositionalToken {
2999 source: uws,
3000 offset: 175,
3001 length: 1,
3002 token: Token::Special(Special::Separator(Separator::Space)),
3003 },
3004 PositionalToken {
3005 source: uws,
3006 offset: 176,
3007 length: 14,
3008 token: Token::Word(Word::Word("деления".to_string())),
3009 },
3010 PositionalToken {
3011 source: uws,
3012 offset: 190,
3013 length: 1,
3014 token: Token::Special(Special::Separator(Separator::Space)),
3015 },
3016 PositionalToken {
3017 source: uws,
3018 offset: 191,
3019 length: 4,
3020 token: Token::Word(Word::Word("по".to_string())),
3021 },
3022 PositionalToken {
3023 source: uws,
3024 offset: 195,
3025 length: 1,
3026 token: Token::Special(Special::Separator(Separator::Space)),
3027 },
3028 PositionalToken {
3029 source: uws,
3030 offset: 196,
3031 length: 12,
3032 token: Token::Word(Word::Word("юникод".to_string())),
3033 },
3034 PositionalToken {
3035 source: uws,
3036 offset: 208,
3037 length: 1,
3038 token: Token::Special(Special::Punctuation('-')),
3039 },
3040 PositionalToken {
3041 source: uws,
3042 offset: 209,
3043 length: 12,
3044 token: Token::Word(Word::Word("словам".to_string())),
3045 },
3046 PositionalToken {
3047 source: uws,
3048 offset: 221,
3049 length: 3,
3050 token: Token::Special(Special::Punctuation('.')),
3051 },
3052 PositionalToken {
3053 source: uws,
3054 offset: 224,
3055 length: 1,
3056 token: Token::Special(Special::Separator(Separator::Newline)),
3057 },
3058 ];
3059 let lib_res = uws
3060 .into_tokenizer(TokenizerParams::complex())
3061 .collect::<Vec<_>>();
3062 check_results(&result, &lib_res, uws);
3063 }
3064
3065 #[test]
3066 fn plus_minus() {
3067 let uws = "+23 -4.5 -34 +25.7 - 2 + 5.6";
3068 let result = vec![
3069 PositionalToken {
3070 source: uws,
3071 offset: 0,
3072 length: 3,
3073 token: Token::Word(Word::Number(Number::Integer(23))),
3074 },
3075 PositionalToken {
3076 source: uws,
3077 offset: 3,
3078 length: 1,
3079 token: Token::Special(Special::Separator(Separator::Space)),
3080 },
3081 PositionalToken {
3082 source: uws,
3083 offset: 4,
3084 length: 4,
3085 token: Token::Word(Word::Number(Number::Float(-4.5))),
3086 },
3087 PositionalToken {
3088 source: uws,
3089 offset: 8,
3090 length: 1,
3091 token: Token::Special(Special::Separator(Separator::Space)),
3092 },
3093 PositionalToken {
3094 source: uws,
3095 offset: 9,
3096 length: 3,
3097 token: Token::Word(Word::Number(Number::Integer(-34))),
3098 },
3099 PositionalToken {
3100 source: uws,
3101 offset: 12,
3102 length: 1,
3103 token: Token::Special(Special::Separator(Separator::Space)),
3104 },
3105 PositionalToken {
3106 source: uws,
3107 offset: 13,
3108 length: 5,
3109 token: Token::Word(Word::Number(Number::Float(25.7))),
3110 },
3111 PositionalToken {
3112 source: uws,
3113 offset: 18,
3114 length: 1,
3115 token: Token::Special(Special::Separator(Separator::Space)),
3116 },
3117 PositionalToken {
3118 source: uws,
3119 offset: 19,
3120 length: 1,
3121 token: Token::Special(Special::Punctuation('-')),
3122 },
3123 PositionalToken {
3124 source: uws,
3125 offset: 20,
3126 length: 1,
3127 token: Token::Special(Special::Separator(Separator::Space)),
3128 },
3129 PositionalToken {
3130 source: uws,
3131 offset: 21,
3132 length: 1,
3133 token: Token::Word(Word::Number(Number::Integer(2))),
3134 },
3135 PositionalToken {
3136 source: uws,
3137 offset: 22,
3138 length: 1,
3139 token: Token::Special(Special::Separator(Separator::Space)),
3140 },
3141 PositionalToken {
3142 source: uws,
3143 offset: 23,
3144 length: 1,
3145 token: Token::Special(Special::Punctuation('+')),
3146 },
3147 PositionalToken {
3148 source: uws,
3149 offset: 24,
3150 length: 1,
3151 token: Token::Special(Special::Separator(Separator::Space)),
3152 },
3153 PositionalToken {
3154 source: uws,
3155 offset: 25,
3156 length: 3,
3157 token: Token::Word(Word::Number(Number::Float(5.6))),
3158 },
3159 ];
3160 let lib_res = uws
3161 .into_tokenizer(TokenizerParams::v1())
3162 .collect::<Vec<_>>();
3163 check(&result, &lib_res, uws);
3164 }
3166
3167 #[test]
3168 #[ignore]
3169 fn woman_bouncing_ball() {
3170 let uws = "\u{26f9}\u{200d}\u{2640}";
3171 let result = vec![PositionalToken {
3172 source: uws,
3173 offset: 0,
3174 length: 9,
3175 token: Token::Word(Word::Emoji("woman_bouncing_ball")),
3176 }];
3177 let lib_res = uws
3178 .into_tokenizer(TokenizerParams::v1())
3179 .collect::<Vec<_>>();
3180 check_results(&result, &lib_res, uws);
3181 }
3183
3184 #[test]
3185 fn emoji_and_rusabbr_default() {
3186 let uws = "🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n👱\nС.С.С.Р.\n👨👩👦👦\n🧠\n";
3187 let result = vec![
3188 PositionalToken {
3189 source: uws,
3190 offset: 0,
3191 length: 8,
3192 token: Token::Word(Word::Emoji("russia")),
3193 },
3194 PositionalToken {
3195 source: uws,
3196 offset: 8,
3197 length: 1,
3198 token: Token::Special(Special::Separator(Separator::Space)),
3199 },
3200 PositionalToken {
3201 source: uws,
3202 offset: 9,
3203 length: 8,
3204 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
3205 },
3206 PositionalToken {
3207 source: uws,
3208 offset: 17,
3209 length: 1,
3210 token: Token::Special(Special::Separator(Separator::Newline)),
3211 },
3212 PositionalToken {
3213 source: uws,
3214 offset: 18,
3215 length: 8,
3216 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
3217 },
3218 PositionalToken {
3219 source: uws,
3220 offset: 26,
3221 length: 8,
3222 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
3223 },
3224 PositionalToken {
3225 source: uws,
3226 offset: 34,
3227 length: 8,
3228 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
3229 },
3230 PositionalToken {
3231 source: uws,
3232 offset: 42,
3233 length: 1,
3234 token: Token::Special(Special::Separator(Separator::Newline)),
3235 },
3236 PositionalToken {
3237 source: uws,
3238 offset: 43,
3239 length: 4,
3240 token: Token::Word(Word::Emoji("blond_haired_person")),
3241 },
3242 PositionalToken {
3243 source: uws,
3244 offset: 47,
3245 length: 1,
3246 token: Token::Special(Special::Separator(Separator::Newline)),
3247 },
3248 PositionalToken {
3249 source: uws,
3250 offset: 48,
3251 length: 2,
3252 token: Token::Word(Word::Word("С".to_string())),
3253 },
3254 PositionalToken {
3255 source: uws,
3256 offset: 50,
3257 length: 1,
3258 token: Token::Special(Special::Punctuation('.')),
3259 },
3260 PositionalToken {
3261 source: uws,
3262 offset: 51,
3263 length: 2,
3264 token: Token::Word(Word::Word("С".to_string())),
3265 },
3266 PositionalToken {
3267 source: uws,
3268 offset: 53,
3269 length: 1,
3270 token: Token::Special(Special::Punctuation('.')),
3271 },
3272 PositionalToken {
3273 source: uws,
3274 offset: 54,
3275 length: 2,
3276 token: Token::Word(Word::Word("С".to_string())),
3277 },
3278 PositionalToken {
3279 source: uws,
3280 offset: 56,
3281 length: 1,
3282 token: Token::Special(Special::Punctuation('.')),
3283 },
3284 PositionalToken {
3285 source: uws,
3286 offset: 57,
3287 length: 2,
3288 token: Token::Word(Word::Word("Р".to_string())),
3289 },
3290 PositionalToken {
3291 source: uws,
3292 offset: 59,
3293 length: 1,
3294 token: Token::Special(Special::Punctuation('.')),
3295 },
3296 PositionalToken {
3297 source: uws,
3298 offset: 60,
3299 length: 1,
3300 token: Token::Special(Special::Separator(Separator::Newline)),
3301 },
3302 PositionalToken {
3303 source: uws,
3304 offset: 61,
3305 length: 25,
3306 token: Token::Word(Word::Emoji("family_man_woman_boy_boy")),
3307 },
3308 PositionalToken {
3309 source: uws,
3310 offset: 86,
3311 length: 1,
3312 token: Token::Special(Special::Separator(Separator::Newline)),
3313 },
3314 PositionalToken {
3315 source: uws,
3316 offset: 87,
3317 length: 4,
3318 token: Token::Word(Word::Emoji("brain")),
3319 },
3320 PositionalToken {
3321 source: uws,
3322 offset: 91,
3323 length: 1,
3324 token: Token::Special(Special::Separator(Separator::Newline)),
3325 },
3326 ];
3327
3328 let lib_res = uws
3329 .into_tokenizer(TokenizerParams::v1())
3330 .collect::<Vec<_>>();
3331 check_results(&result, &lib_res, uws);
3332 }
3334
3335 #[test]
3336 fn emoji_and_rusabbr_no_split() {
3337 let uws = "🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n👱\nС.С.С.Р.\n👨👩👦👦\n🧠\n";
3338 let result = vec![
3339 PositionalToken {
3340 source: uws,
3341 offset: 0,
3342 length: 8,
3343 token: Token::Word(Word::Emoji("russia")),
3344 },
3345 PositionalToken {
3346 source: uws,
3347 offset: 8,
3348 length: 1,
3349 token: Token::Special(Special::Separator(Separator::Space)),
3350 },
3351 PositionalToken {
3352 source: uws,
3353 offset: 9,
3354 length: 8,
3355 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
3356 },
3357 PositionalToken {
3358 source: uws,
3359 offset: 17,
3360 length: 1,
3361 token: Token::Special(Special::Separator(Separator::Newline)),
3362 },
3363 PositionalToken {
3364 source: uws,
3365 offset: 18,
3366 length: 8,
3367 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
3368 },
3369 PositionalToken {
3370 source: uws,
3371 offset: 26,
3372 length: 8,
3373 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
3374 },
3375 PositionalToken {
3376 source: uws,
3377 offset: 34,
3378 length: 8,
3379 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
3380 },
3381 PositionalToken {
3382 source: uws,
3383 offset: 42,
3384 length: 1,
3385 token: Token::Special(Special::Separator(Separator::Newline)),
3386 },
3387 PositionalToken {
3388 source: uws,
3389 offset: 43,
3390 length: 4,
3391 token: Token::Word(Word::Emoji("blond_haired_person")),
3392 },
3393 PositionalToken {
3394 source: uws,
3395 offset: 47,
3396 length: 1,
3397 token: Token::Special(Special::Separator(Separator::Newline)),
3398 },
3399 PositionalToken {
3400 source: uws,
3401 offset: 48,
3402 length: 11,
3403 token: Token::Word(Word::Word("С.С.С.Р".to_string())),
3404 },
3405 PositionalToken {
3406 source: uws,
3407 offset: 59,
3408 length: 1,
3409 token: Token::Special(Special::Punctuation('.')),
3410 },
3411 PositionalToken {
3412 source: uws,
3413 offset: 60,
3414 length: 1,
3415 token: Token::Special(Special::Separator(Separator::Newline)),
3416 },
3417 PositionalToken {
3418 source: uws,
3419 offset: 61,
3420 length: 25,
3421 token: Token::Word(Word::Emoji("family_man_woman_boy_boy")),
3422 },
3423 PositionalToken {
3424 source: uws,
3425 offset: 86,
3426 length: 1,
3427 token: Token::Special(Special::Separator(Separator::Newline)),
3428 },
3429 PositionalToken {
3430 source: uws,
3431 offset: 87,
3432 length: 4,
3433 token: Token::Word(Word::Emoji("brain")),
3434 },
3435 PositionalToken {
3436 source: uws,
3437 offset: 91,
3438 length: 1,
3439 token: Token::Special(Special::Separator(Separator::Newline)),
3440 },
3441 ];
3442
3443 let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
3444 check_results(&result, &lib_res, uws);
3445 }
3447
3448 #[test]
3672 fn html() {
3673 let uws = "<div class=\"article article_view \" id=\"article_view_-113039156_9551\" data-article-url=\"/@chaibuket-o-chem-ne-zabyt-25-noyabrya\" data-audio-context=\"article:-113039156_9551\"><h1 class=\"article_decoration_first article_decoration_last\" >День Мамы </h1><p class=\"article_decoration_first article_decoration_last\" >День, когда поздравляют мам, бабушек, сестер и жён — это всемирный праздник, называемый «День Мамы». В настоящее время его отмечают почти в каждой стране, просто везде разные даты и способы празднования. </p><h3 class=\"article_decoration_first article_decoration_last\" ><span class='article_anchor_title'>\n <span class='article_anchor_button' id='pochemu-my-ego-prazdnuem'></span>\n <span class='article_anchor_fsymbol'>П</span>\n</span>ПОЧЕМУ МЫ ЕГО ПРАЗДНУЕМ</h3><p class=\"article_decoration_first article_decoration_last article_decoration_before\" >В 1987 году комитет госдумы по делам женщин, семьи и молодежи выступил с предложением учредить «День мамы», а сам приказ был подписан уже 30 января 1988 года Борисом Ельциным. Было решено, что ежегодно в России празднество дня мамы будет выпадать на последнее воскресенье ноября. 
</p><figure data-type=\"101\" data-mode=\"\" class=\"article_decoration_first article_decoration_last\" >\n <div class=\"article_figure_content\" style=\"width: 1125px\">\n <div class=\"article_figure_sizer_content\"><div class=\"article_object_sizer_wrap\" data-sizes=\"[{"s":["https://pp.userapi.com/c849128/v849128704/c0ffd/pcNJaBH3NDo.jpg",75,50],"m":["https://pp.userapi.com/c849128/v849128704/c0ffe/ozCLs2kHtRY.jpg",130,87],"x":["https://pp.userapi.com/c849128/v849128704/c0fff/E4KtTNDydzE.jpg",604,403],"y":["https://pp.userapi.com/c849128/v849128704/c1000/1nLxpYKavzU.jpg",807,538],"z":["https://pp.userapi.com/c849128/v849128704/c1001/IgEODe90yEk.jpg",1125,750],"o":["https://pp.userapi.com/c849128/v849128704/c1002/01faNwVZ2_E.jpg",130,87],"p":["https://pp.userapi.com/c849128/v849128704/c1003/baDFzbdRP2s.jpg",200,133],"q":["https://pp.userapi.com/c849128/v849128704/c1004/CY4khI6KJKA.jpg",320,213],"r":["https://pp.userapi.com/c849128/v849128704/c1005/NOvAJ6-VltY.jpg",510,340]}]\">\n <img class=\"article_object_sizer_inner article_object_photo__image_blur\" src=\"https://pp.userapi.com/c849128/v849128704/c0ffd/pcNJaBH3NDo.jpg\" data-baseurl=\"\"/>\n \n</div></div>\n <div class=\"article_figure_sizer\" style=\"padding-bottom: 66.666666666667%\"></div>";
3674 let result = vec![
3675 PositionalToken {
3676 source: uws,
3677 offset: 236,
3678 length: 8,
3679 token: Token::Word(Word::Word("День".to_string())),
3680 },
3681 PositionalToken {
3682 source: uws,
3683 offset: 244,
3684 length: 1,
3685 token: Token::Special(Special::Separator(Separator::Space)),
3686 },
3687 PositionalToken {
3688 source: uws,
3689 offset: 245,
3690 length: 8,
3691 token: Token::Word(Word::Word("Мамы".to_string())),
3692 },
3693 PositionalToken {
3694 source: uws,
3695 offset: 253,
3696 length: 1,
3697 token: Token::Special(Special::Separator(Separator::Space)),
3698 },
3699 PositionalToken {
3700 source: uws,
3701 offset: 321,
3702 length: 8,
3703 token: Token::Word(Word::Word("День".to_string())),
3704 },
3705 PositionalToken {
3706 source: uws,
3707 offset: 329,
3708 length: 1,
3709 token: Token::Special(Special::Punctuation(',')),
3710 },
3711 PositionalToken {
3712 source: uws,
3713 offset: 330,
3714 length: 1,
3715 token: Token::Special(Special::Separator(Separator::Space)),
3716 },
3717 PositionalToken {
3718 source: uws,
3719 offset: 331,
3720 length: 10,
3721 token: Token::Word(Word::Word("когда".to_string())),
3722 },
3723 PositionalToken {
3724 source: uws,
3725 offset: 341,
3726 length: 1,
3727 token: Token::Special(Special::Separator(Separator::Space)),
3728 },
3729 PositionalToken {
3730 source: uws,
3731 offset: 342,
3732 length: 22,
3733 token: Token::Word(Word::Word("поздравляют".to_string())),
3734 },
3735 PositionalToken {
3736 source: uws,
3737 offset: 364,
3738 length: 1,
3739 token: Token::Special(Special::Separator(Separator::Space)),
3740 },
3741 PositionalToken {
3742 source: uws,
3743 offset: 365,
3744 length: 6,
3745 token: Token::Word(Word::Word("мам".to_string())),
3746 },
3747 PositionalToken {
3748 source: uws,
3749 offset: 371,
3750 length: 1,
3751 token: Token::Special(Special::Punctuation(',')),
3752 },
3753 PositionalToken {
3754 source: uws,
3755 offset: 372,
3756 length: 1,
3757 token: Token::Special(Special::Separator(Separator::Space)),
3758 },
3759 PositionalToken {
3760 source: uws,
3761 offset: 373,
3762 length: 14,
3763 token: Token::Word(Word::Word("бабушек".to_string())),
3764 },
3765 PositionalToken {
3766 source: uws,
3767 offset: 387,
3768 length: 1,
3769 token: Token::Special(Special::Punctuation(',')),
3770 },
3771 PositionalToken {
3772 source: uws,
3773 offset: 388,
3774 length: 1,
3775 token: Token::Special(Special::Separator(Separator::Space)),
3776 },
3777 PositionalToken {
3778 source: uws,
3779 offset: 389,
3780 length: 12,
3781 token: Token::Word(Word::Word("сестер".to_string())),
3782 },
3783 PositionalToken {
3784 source: uws,
3785 offset: 401,
3786 length: 1,
3787 token: Token::Special(Special::Separator(Separator::Space)),
3788 },
3789 PositionalToken {
3790 source: uws,
3791 offset: 402,
3792 length: 2,
3793 token: Token::Word(Word::Word("и".to_string())),
3794 },
3795 PositionalToken {
3796 source: uws,
3797 offset: 404,
3798 length: 1,
3799 token: Token::Special(Special::Separator(Separator::Space)),
3800 },
3801 PositionalToken {
3802 source: uws,
3803 offset: 405,
3804 length: 6,
3805 token: Token::Word(Word::Word("жён".to_string())),
3806 },
3807 PositionalToken {
3808 source: uws,
3809 offset: 411,
3810 length: 1,
3811 token: Token::Special(Special::Separator(Separator::Space)),
3812 },
3813 PositionalToken {
3814 source: uws,
3815 offset: 412,
3816 length: 3,
3817 token: Token::Special(Special::Punctuation('—')),
3818 },
3819 PositionalToken {
3820 source: uws,
3821 offset: 415,
3822 length: 1,
3823 token: Token::Special(Special::Separator(Separator::Space)),
3824 },
3825 PositionalToken {
3826 source: uws,
3827 offset: 416,
3828 length: 6,
3829 token: Token::Word(Word::Word("это".to_string())),
3830 },
3831 PositionalToken {
3832 source: uws,
3833 offset: 422,
3834 length: 1,
3835 token: Token::Special(Special::Separator(Separator::Space)),
3836 },
3837 PositionalToken {
3838 source: uws,
3839 offset: 423,
3840 length: 18,
3841 token: Token::Word(Word::Word("всемирный".to_string())),
3842 },
3843 PositionalToken {
3844 source: uws,
3845 offset: 441,
3846 length: 1,
3847 token: Token::Special(Special::Separator(Separator::Space)),
3848 },
3849 PositionalToken {
3850 source: uws,
3851 offset: 442,
3852 length: 16,
3853 token: Token::Word(Word::Word("праздник".to_string())),
3854 },
3855 PositionalToken {
3856 source: uws,
3857 offset: 458,
3858 length: 1,
3859 token: Token::Special(Special::Punctuation(',')),
3860 },
3861 PositionalToken {
3862 source: uws,
3863 offset: 459,
3864 length: 1,
3865 token: Token::Special(Special::Separator(Separator::Space)),
3866 },
3867 PositionalToken {
3868 source: uws,
3869 offset: 460,
3870 length: 20,
3871 token: Token::Word(Word::Word("называемый".to_string())),
3872 },
3873 PositionalToken {
3874 source: uws,
3875 offset: 480,
3876 length: 1,
3877 token: Token::Special(Special::Separator(Separator::Space)),
3878 },
3879 PositionalToken {
3880 source: uws,
3881 offset: 481,
3882 length: 2,
3883 token: Token::Special(Special::Punctuation('«')),
3884 },
3885 PositionalToken {
3886 source: uws,
3887 offset: 483,
3888 length: 8,
3889 token: Token::Word(Word::Word("День".to_string())),
3890 },
3891 PositionalToken {
3892 source: uws,
3893 offset: 491,
3894 length: 1,
3895 token: Token::Special(Special::Separator(Separator::Space)),
3896 },
3897 PositionalToken {
3898 source: uws,
3899 offset: 492,
3900 length: 8,
3901 token: Token::Word(Word::Word("Мамы".to_string())),
3902 },
3903 PositionalToken {
3904 source: uws,
3905 offset: 500,
3906 length: 2,
3907 token: Token::Special(Special::Punctuation('»')),
3908 },
3909 PositionalToken {
3910 source: uws,
3911 offset: 502,
3912 length: 1,
3913 token: Token::Special(Special::Punctuation('.')),
3914 },
3915 PositionalToken {
3916 source: uws,
3917 offset: 503,
3918 length: 1,
3919 token: Token::Special(Special::Separator(Separator::Space)),
3920 },
3921 PositionalToken {
3922 source: uws,
3923 offset: 504,
3924 length: 2,
3925 token: Token::Word(Word::Word("В".to_string())),
3926 },
3927 PositionalToken {
3928 source: uws,
3929 offset: 506,
3930 length: 1,
3931 token: Token::Special(Special::Separator(Separator::Space)),
3932 },
3933 PositionalToken {
3934 source: uws,
3935 offset: 507,
3936 length: 18,
3937 token: Token::Word(Word::Word("настоящее".to_string())),
3938 },
3939 PositionalToken {
3940 source: uws,
3941 offset: 525,
3942 length: 1,
3943 token: Token::Special(Special::Separator(Separator::Space)),
3944 },
3945 PositionalToken {
3946 source: uws,
3947 offset: 526,
3948 length: 10,
3949 token: Token::Word(Word::Word("время".to_string())),
3950 },
3951 PositionalToken {
3952 source: uws,
3953 offset: 536,
3954 length: 1,
3955 token: Token::Special(Special::Separator(Separator::Space)),
3956 },
3957 PositionalToken {
3958 source: uws,
3959 offset: 537,
3960 length: 6,
3961 token: Token::Word(Word::Word("его".to_string())),
3962 },
3963 PositionalToken {
3964 source: uws,
3965 offset: 543,
3966 length: 1,
3967 token: Token::Special(Special::Separator(Separator::Space)),
3968 },
3969 PositionalToken {
3970 source: uws,
3971 offset: 544,
3972 length: 16,
3973 token: Token::Word(Word::Word("отмечают".to_string())),
3974 },
3975 PositionalToken {
3976 source: uws,
3977 offset: 560,
3978 length: 1,
3979 token: Token::Special(Special::Separator(Separator::Space)),
3980 },
3981 PositionalToken {
3982 source: uws,
3983 offset: 561,
3984 length: 10,
3985 token: Token::Word(Word::Word("почти".to_string())),
3986 },
3987 PositionalToken {
3988 source: uws,
3989 offset: 571,
3990 length: 1,
3991 token: Token::Special(Special::Separator(Separator::Space)),
3992 },
3993 PositionalToken {
3994 source: uws,
3995 offset: 572,
3996 length: 2,
3997 token: Token::Word(Word::Word("в".to_string())),
3998 },
3999 PositionalToken {
4000 source: uws,
4001 offset: 574,
4002 length: 1,
4003 token: Token::Special(Special::Separator(Separator::Space)),
4004 },
4005 PositionalToken {
4006 source: uws,
4007 offset: 575,
4008 length: 12,
4009 token: Token::Word(Word::Word("каждой".to_string())),
4010 },
4011 PositionalToken {
4012 source: uws,
4013 offset: 587,
4014 length: 1,
4015 token: Token::Special(Special::Separator(Separator::Space)),
4016 },
4017 PositionalToken {
4018 source: uws,
4019 offset: 588,
4020 length: 12,
4021 token: Token::Word(Word::Word("стране".to_string())),
4022 },
4023 PositionalToken {
4024 source: uws,
4025 offset: 600,
4026 length: 1,
4027 token: Token::Special(Special::Punctuation(',')),
4028 },
4029 PositionalToken {
4030 source: uws,
4031 offset: 601,
4032 length: 1,
4033 token: Token::Special(Special::Separator(Separator::Space)),
4034 },
4035 PositionalToken {
4036 source: uws,
4037 offset: 602,
4038 length: 12,
4039 token: Token::Word(Word::Word("просто".to_string())),
4040 },
4041 PositionalToken {
4042 source: uws,
4043 offset: 614,
4044 length: 1,
4045 token: Token::Special(Special::Separator(Separator::Space)),
4046 },
4047 PositionalToken {
4048 source: uws,
4049 offset: 615,
4050 length: 10,
4051 token: Token::Word(Word::Word("везде".to_string())),
4052 },
4053 PositionalToken {
4054 source: uws,
4055 offset: 625,
4056 length: 1,
4057 token: Token::Special(Special::Separator(Separator::Space)),
4058 },
4059 PositionalToken {
4060 source: uws,
4061 offset: 626,
4062 length: 12,
4063 token: Token::Word(Word::Word("разные".to_string())),
4064 },
4065 PositionalToken {
4066 source: uws,
4067 offset: 638,
4068 length: 1,
4069 token: Token::Special(Special::Separator(Separator::Space)),
4070 },
4071 PositionalToken {
4072 source: uws,
4073 offset: 639,
4074 length: 8,
4075 token: Token::Word(Word::Word("даты".to_string())),
4076 },
4077 PositionalToken {
4078 source: uws,
4079 offset: 647,
4080 length: 1,
4081 token: Token::Special(Special::Separator(Separator::Space)),
4082 },
4083 PositionalToken {
4084 source: uws,
4085 offset: 648,
4086 length: 2,
4087 token: Token::Word(Word::Word("и".to_string())),
4088 },
4089 PositionalToken {
4090 source: uws,
4091 offset: 650,
4092 length: 1,
4093 token: Token::Special(Special::Separator(Separator::Space)),
4094 },
4095 PositionalToken {
4096 source: uws,
4097 offset: 651,
4098 length: 14,
4099 token: Token::Word(Word::Word("способы".to_string())),
4100 },
4101 PositionalToken {
4102 source: uws,
4103 offset: 665,
4104 length: 1,
4105 token: Token::Special(Special::Separator(Separator::Space)),
4106 },
4107 PositionalToken {
4108 source: uws,
4109 offset: 666,
4110 length: 24,
4111 token: Token::Word(Word::Word("празднования".to_string())),
4112 },
4113 PositionalToken {
4114 source: uws,
4115 offset: 690,
4116 length: 1,
4117 token: Token::Special(Special::Punctuation('.')),
4118 },
4119 PositionalToken {
4120 source: uws,
4121 offset: 691,
4122 length: 1,
4123 token: Token::Special(Special::Separator(Separator::Space)),
4124 },
4125 PositionalToken {
4126 source: uws,
4127 offset: 794,
4128 length: 1,
4129 token: Token::Special(Special::Separator(Separator::Newline)),
4130 },
4131 PositionalToken {
4132 source: uws,
4133 offset: 795,
4134 length: 2,
4135 token: Token::Special(Special::Separator(Separator::Space)),
4136 },
4137 PositionalToken {
4138 source: uws,
4139 offset: 870,
4140 length: 1,
4141 token: Token::Special(Special::Separator(Separator::Newline)),
4142 },
4143 PositionalToken {
4144 source: uws,
4145 offset: 871,
4146 length: 2,
4147 token: Token::Special(Special::Separator(Separator::Space)),
4148 },
4149 PositionalToken {
4150 source: uws,
4151 offset: 910,
4152 length: 2,
4153 token: Token::Word(Word::Word("П".to_string())),
4154 },
4155 PositionalToken {
4156 source: uws,
4157 offset: 919,
4158 length: 1,
4159 token: Token::Special(Special::Separator(Separator::Newline)),
4160 },
4161 PositionalToken {
4162 source: uws,
4163 offset: 927,
4164 length: 12,
4165 token: Token::Word(Word::Word("ПОЧЕМУ".to_string())),
4166 },
4167 PositionalToken {
4168 source: uws,
4169 offset: 939,
4170 length: 1,
4171 token: Token::Special(Special::Separator(Separator::Space)),
4172 },
4173 PositionalToken {
4174 source: uws,
4175 offset: 940,
4176 length: 4,
4177 token: Token::Word(Word::Word("МЫ".to_string())),
4178 },
4179 PositionalToken {
4180 source: uws,
4181 offset: 944,
4182 length: 1,
4183 token: Token::Special(Special::Separator(Separator::Space)),
4184 },
4185 PositionalToken {
4186 source: uws,
4187 offset: 945,
4188 length: 6,
4189 token: Token::Word(Word::Word("ЕГО".to_string())),
4190 },
4191 PositionalToken {
4192 source: uws,
4193 offset: 951,
4194 length: 1,
4195 token: Token::Special(Special::Separator(Separator::Space)),
4196 },
4197 PositionalToken {
4198 source: uws,
4199 offset: 952,
4200 length: 18,
4201 token: Token::Word(Word::Word("ПРАЗДНУЕМ".to_string())),
4202 },
4203 PositionalToken {
4204 source: uws,
4205 offset: 1063,
4206 length: 2,
4207 token: Token::Word(Word::Word("В".to_string())),
4208 },
4209 PositionalToken {
4210 source: uws,
4211 offset: 1065,
4212 length: 1,
4213 token: Token::Special(Special::Separator(Separator::Space)),
4214 },
4215 PositionalToken {
4216 source: uws,
4217 offset: 1066,
4218 length: 4,
4219 token: Token::Word(Word::Number(Number::Integer(1987))),
4220 },
4221 PositionalToken {
4222 source: uws,
4223 offset: 1070,
4224 length: 1,
4225 token: Token::Special(Special::Separator(Separator::Space)),
4226 },
4227 PositionalToken {
4228 source: uws,
4229 offset: 1071,
4230 length: 8,
4231 token: Token::Word(Word::Word("году".to_string())),
4232 },
4233 PositionalToken {
4234 source: uws,
4235 offset: 1079,
4236 length: 1,
4237 token: Token::Special(Special::Separator(Separator::Space)),
4238 },
4239 PositionalToken {
4240 source: uws,
4241 offset: 1080,
4242 length: 14,
4243 token: Token::Word(Word::Word("комитет".to_string())),
4244 },
4245 PositionalToken {
4246 source: uws,
4247 offset: 1094,
4248 length: 1,
4249 token: Token::Special(Special::Separator(Separator::Space)),
4250 },
4251 PositionalToken {
4252 source: uws,
4253 offset: 1095,
4254 length: 14,
4255 token: Token::Word(Word::Word("госдумы".to_string())),
4256 },
4257 PositionalToken {
4258 source: uws,
4259 offset: 1109,
4260 length: 1,
4261 token: Token::Special(Special::Separator(Separator::Space)),
4262 },
4263 PositionalToken {
4264 source: uws,
4265 offset: 1110,
4266 length: 4,
4267 token: Token::Word(Word::Word("по".to_string())),
4268 },
4269 PositionalToken {
4270 source: uws,
4271 offset: 1114,
4272 length: 1,
4273 token: Token::Special(Special::Separator(Separator::Space)),
4274 },
4275 PositionalToken {
4276 source: uws,
4277 offset: 1115,
4278 length: 10,
4279 token: Token::Word(Word::Word("делам".to_string())),
4280 },
4281 PositionalToken {
4282 source: uws,
4283 offset: 1125,
4284 length: 1,
4285 token: Token::Special(Special::Separator(Separator::Space)),
4286 },
4287 PositionalToken {
4288 source: uws,
4289 offset: 1126,
4290 length: 12,
4291 token: Token::Word(Word::Word("женщин".to_string())),
4292 },
4293 PositionalToken {
4294 source: uws,
4295 offset: 1138,
4296 length: 1,
4297 token: Token::Special(Special::Punctuation(',')),
4298 },
4299 PositionalToken {
4300 source: uws,
4301 offset: 1139,
4302 length: 1,
4303 token: Token::Special(Special::Separator(Separator::Space)),
4304 },
4305 PositionalToken {
4306 source: uws,
4307 offset: 1140,
4308 length: 10,
4309 token: Token::Word(Word::Word("семьи".to_string())),
4310 },
4311 PositionalToken {
4312 source: uws,
4313 offset: 1150,
4314 length: 1,
4315 token: Token::Special(Special::Separator(Separator::Space)),
4316 },
4317 PositionalToken {
4318 source: uws,
4319 offset: 1151,
4320 length: 2,
4321 token: Token::Word(Word::Word("и".to_string())),
4322 },
4323 PositionalToken {
4324 source: uws,
4325 offset: 1153,
4326 length: 1,
4327 token: Token::Special(Special::Separator(Separator::Space)),
4328 },
4329 PositionalToken {
4330 source: uws,
4331 offset: 1154,
4332 length: 16,
4333 token: Token::Word(Word::Word("молодежи".to_string())),
4334 },
4335 PositionalToken {
4336 source: uws,
4337 offset: 1170,
4338 length: 1,
4339 token: Token::Special(Special::Separator(Separator::Space)),
4340 },
4341 PositionalToken {
4342 source: uws,
4343 offset: 1171,
4344 length: 16,
4345 token: Token::Word(Word::Word("выступил".to_string())),
4346 },
4347 PositionalToken {
4348 source: uws,
4349 offset: 1187,
4350 length: 1,
4351 token: Token::Special(Special::Separator(Separator::Space)),
4352 },
4353 PositionalToken {
4354 source: uws,
4355 offset: 1188,
4356 length: 2,
4357 token: Token::Word(Word::Word("с".to_string())),
4358 },
4359 PositionalToken {
4360 source: uws,
4361 offset: 1190,
4362 length: 1,
4363 token: Token::Special(Special::Separator(Separator::Space)),
4364 },
4365 PositionalToken {
4366 source: uws,
4367 offset: 1191,
4368 length: 24,
4369 token: Token::Word(Word::Word("предложением".to_string())),
4370 },
4371 PositionalToken {
4372 source: uws,
4373 offset: 1215,
4374 length: 1,
4375 token: Token::Special(Special::Separator(Separator::Space)),
4376 },
4377 PositionalToken {
4378 source: uws,
4379 offset: 1216,
4380 length: 16,
4381 token: Token::Word(Word::Word("учредить".to_string())),
4382 },
4383 PositionalToken {
4384 source: uws,
4385 offset: 1232,
4386 length: 1,
4387 token: Token::Special(Special::Separator(Separator::Space)),
4388 },
4389 PositionalToken {
4390 source: uws,
4391 offset: 1233,
4392 length: 2,
4393 token: Token::Special(Special::Punctuation('«')),
4394 },
4395 PositionalToken {
4396 source: uws,
4397 offset: 1235,
4398 length: 8,
4399 token: Token::Word(Word::Word("День".to_string())),
4400 },
4401 PositionalToken {
4402 source: uws,
4403 offset: 1243,
4404 length: 1,
4405 token: Token::Special(Special::Separator(Separator::Space)),
4406 },
4407 PositionalToken {
4408 source: uws,
4409 offset: 1244,
4410 length: 8,
4411 token: Token::Word(Word::Word("мамы".to_string())),
4412 },
4413 PositionalToken {
4414 source: uws,
4415 offset: 1252,
4416 length: 2,
4417 token: Token::Special(Special::Punctuation('»')),
4418 },
4419 PositionalToken {
4420 source: uws,
4421 offset: 1254,
4422 length: 1,
4423 token: Token::Special(Special::Punctuation(',')),
4424 },
4425 PositionalToken {
4426 source: uws,
4427 offset: 1255,
4428 length: 1,
4429 token: Token::Special(Special::Separator(Separator::Space)),
4430 },
4431 PositionalToken {
4432 source: uws,
4433 offset: 1256,
4434 length: 2,
4435 token: Token::Word(Word::Word("а".to_string())),
4436 },
4437 PositionalToken {
4438 source: uws,
4439 offset: 1258,
4440 length: 1,
4441 token: Token::Special(Special::Separator(Separator::Space)),
4442 },
4443 PositionalToken {
4444 source: uws,
4445 offset: 1259,
4446 length: 6,
4447 token: Token::Word(Word::Word("сам".to_string())),
4448 },
4449 PositionalToken {
4450 source: uws,
4451 offset: 1265,
4452 length: 1,
4453 token: Token::Special(Special::Separator(Separator::Space)),
4454 },
4455 PositionalToken {
4456 source: uws,
4457 offset: 1266,
4458 length: 12,
4459 token: Token::Word(Word::Word("приказ".to_string())),
4460 },
4461 PositionalToken {
4462 source: uws,
4463 offset: 1278,
4464 length: 1,
4465 token: Token::Special(Special::Separator(Separator::Space)),
4466 },
4467 PositionalToken {
4468 source: uws,
4469 offset: 1279,
4470 length: 6,
4471 token: Token::Word(Word::Word("был".to_string())),
4472 },
4473 PositionalToken {
4474 source: uws,
4475 offset: 1285,
4476 length: 1,
4477 token: Token::Special(Special::Separator(Separator::Space)),
4478 },
4479 PositionalToken {
4480 source: uws,
4481 offset: 1286,
4482 length: 16,
4483 token: Token::Word(Word::Word("подписан".to_string())),
4484 },
4485 PositionalToken {
4486 source: uws,
4487 offset: 1302,
4488 length: 1,
4489 token: Token::Special(Special::Separator(Separator::Space)),
4490 },
4491 PositionalToken {
4492 source: uws,
4493 offset: 1303,
4494 length: 6,
4495 token: Token::Word(Word::Word("уже".to_string())),
4496 },
4497 PositionalToken {
4498 source: uws,
4499 offset: 1309,
4500 length: 1,
4501 token: Token::Special(Special::Separator(Separator::Space)),
4502 },
4503 PositionalToken {
4504 source: uws,
4505 offset: 1310,
4506 length: 2,
4507 token: Token::Word(Word::Number(Number::Integer(30))),
4508 },
4509 PositionalToken {
4510 source: uws,
4511 offset: 1312,
4512 length: 1,
4513 token: Token::Special(Special::Separator(Separator::Space)),
4514 },
4515 PositionalToken {
4516 source: uws,
4517 offset: 1313,
4518 length: 12,
4519 token: Token::Word(Word::Word("января".to_string())),
4520 },
4521 PositionalToken {
4522 source: uws,
4523 offset: 1325,
4524 length: 1,
4525 token: Token::Special(Special::Separator(Separator::Space)),
4526 },
4527 PositionalToken {
4528 source: uws,
4529 offset: 1326,
4530 length: 4,
4531 token: Token::Word(Word::Number(Number::Integer(1988))),
4532 },
4533 PositionalToken {
4534 source: uws,
4535 offset: 1330,
4536 length: 1,
4537 token: Token::Special(Special::Separator(Separator::Space)),
4538 },
4539 PositionalToken {
4540 source: uws,
4541 offset: 1331,
4542 length: 8,
4543 token: Token::Word(Word::Word("года".to_string())),
4544 },
4545 PositionalToken {
4546 source: uws,
4547 offset: 1339,
4548 length: 1,
4549 token: Token::Special(Special::Separator(Separator::Space)),
4550 },
4551 PositionalToken {
4552 source: uws,
4553 offset: 1340,
4554 length: 14,
4555 token: Token::Word(Word::Word("Борисом".to_string())),
4556 },
4557 PositionalToken {
4558 source: uws,
4559 offset: 1354,
4560 length: 1,
4561 token: Token::Special(Special::Separator(Separator::Space)),
4562 },
4563 PositionalToken {
4564 source: uws,
4565 offset: 1355,
4566 length: 16,
4567 token: Token::Word(Word::Word("Ельциным".to_string())),
4568 },
4569 PositionalToken {
4570 source: uws,
4571 offset: 1371,
4572 length: 1,
4573 token: Token::Special(Special::Punctuation('.')),
4574 },
4575 PositionalToken {
4576 source: uws,
4577 offset: 1372,
4578 length: 1,
4579 token: Token::Special(Special::Separator(Separator::Space)),
4580 },
4581 PositionalToken {
4582 source: uws,
4583 offset: 1373,
4584 length: 8,
4585 token: Token::Word(Word::Word("Было".to_string())),
4586 },
4587 PositionalToken {
4588 source: uws,
4589 offset: 1381,
4590 length: 1,
4591 token: Token::Special(Special::Separator(Separator::Space)),
4592 },
4593 PositionalToken {
4594 source: uws,
4595 offset: 1382,
4596 length: 12,
4597 token: Token::Word(Word::Word("решено".to_string())),
4598 },
4599 PositionalToken {
4600 source: uws,
4601 offset: 1394,
4602 length: 1,
4603 token: Token::Special(Special::Punctuation(',')),
4604 },
4605 PositionalToken {
4606 source: uws,
4607 offset: 1395,
4608 length: 1,
4609 token: Token::Special(Special::Separator(Separator::Space)),
4610 },
4611 PositionalToken {
4612 source: uws,
4613 offset: 1396,
4614 length: 6,
4615 token: Token::Word(Word::Word("что".to_string())),
4616 },
4617 PositionalToken {
4618 source: uws,
4619 offset: 1402,
4620 length: 1,
4621 token: Token::Special(Special::Separator(Separator::Space)),
4622 },
4623 PositionalToken {
4624 source: uws,
4625 offset: 1403,
4626 length: 16,
4627 token: Token::Word(Word::Word("ежегодно".to_string())),
4628 },
4629 PositionalToken {
4630 source: uws,
4631 offset: 1419,
4632 length: 1,
4633 token: Token::Special(Special::Separator(Separator::Space)),
4634 },
4635 PositionalToken {
4636 source: uws,
4637 offset: 1420,
4638 length: 2,
4639 token: Token::Word(Word::Word("в".to_string())),
4640 },
4641 PositionalToken {
4642 source: uws,
4643 offset: 1422,
4644 length: 1,
4645 token: Token::Special(Special::Separator(Separator::Space)),
4646 },
4647 PositionalToken {
4648 source: uws,
4649 offset: 1423,
4650 length: 12,
4651 token: Token::Word(Word::Word("России".to_string())),
4652 },
4653 PositionalToken {
4654 source: uws,
4655 offset: 1435,
4656 length: 1,
4657 token: Token::Special(Special::Separator(Separator::Space)),
4658 },
4659 PositionalToken {
4660 source: uws,
4661 offset: 1436,
4662 length: 22,
4663 token: Token::Word(Word::Word("празднество".to_string())),
4664 },
4665 PositionalToken {
4666 source: uws,
4667 offset: 1458,
4668 length: 1,
4669 token: Token::Special(Special::Separator(Separator::Space)),
4670 },
4671 PositionalToken {
4672 source: uws,
4673 offset: 1459,
4674 length: 6,
4675 token: Token::Word(Word::Word("дня".to_string())),
4676 },
4677 PositionalToken {
4678 source: uws,
4679 offset: 1465,
4680 length: 1,
4681 token: Token::Special(Special::Separator(Separator::Space)),
4682 },
4683 PositionalToken {
4684 source: uws,
4685 offset: 1466,
4686 length: 8,
4687 token: Token::Word(Word::Word("мамы".to_string())),
4688 },
4689 PositionalToken {
4690 source: uws,
4691 offset: 1474,
4692 length: 1,
4693 token: Token::Special(Special::Separator(Separator::Space)),
4694 },
4695 PositionalToken {
4696 source: uws,
4697 offset: 1475,
4698 length: 10,
4699 token: Token::Word(Word::Word("будет".to_string())),
4700 },
4701 PositionalToken {
4702 source: uws,
4703 offset: 1485,
4704 length: 1,
4705 token: Token::Special(Special::Separator(Separator::Space)),
4706 },
4707 PositionalToken {
4708 source: uws,
4709 offset: 1486,
4710 length: 16,
4711 token: Token::Word(Word::Word("выпадать".to_string())),
4712 },
4713 PositionalToken {
4714 source: uws,
4715 offset: 1502,
4716 length: 1,
4717 token: Token::Special(Special::Separator(Separator::Space)),
4718 },
4719 PositionalToken {
4720 source: uws,
4721 offset: 1503,
4722 length: 4,
4723 token: Token::Word(Word::Word("на".to_string())),
4724 },
4725 PositionalToken {
4726 source: uws,
4727 offset: 1507,
4728 length: 1,
4729 token: Token::Special(Special::Separator(Separator::Space)),
4730 },
4731 PositionalToken {
4732 source: uws,
4733 offset: 1508,
4734 length: 18,
4735 token: Token::Word(Word::Word("последнее".to_string())),
4736 },
4737 PositionalToken {
4738 source: uws,
4739 offset: 1526,
4740 length: 1,
4741 token: Token::Special(Special::Separator(Separator::Space)),
4742 },
4743 PositionalToken {
4744 source: uws,
4745 offset: 1527,
4746 length: 22,
4747 token: Token::Word(Word::Word("воскресенье".to_string())),
4748 },
4749 PositionalToken {
4750 source: uws,
4751 offset: 1549,
4752 length: 1,
4753 token: Token::Special(Special::Separator(Separator::Space)),
4754 },
4755 PositionalToken {
4756 source: uws,
4757 offset: 1550,
4758 length: 12,
4759 token: Token::Word(Word::Word("ноября".to_string())),
4760 },
4761 PositionalToken {
4762 source: uws,
4763 offset: 1562,
4764 length: 1,
4765 token: Token::Special(Special::Punctuation('.')),
4766 },
4767 PositionalToken {
4768 source: uws,
4769 offset: 1563,
4770 length: 1,
4771 token: Token::Special(Special::Separator(Separator::Space)),
4772 },
4773 PositionalToken {
4774 source: uws,
4775 offset: 1664,
4776 length: 1,
4777 token: Token::Special(Special::Separator(Separator::Newline)),
4778 },
4779 PositionalToken {
4780 source: uws,
4781 offset: 1665,
4782 length: 2,
4783 token: Token::Special(Special::Separator(Separator::Space)),
4784 },
4785 PositionalToken {
4786 source: uws,
4787 offset: 1725,
4788 length: 1,
4789 token: Token::Special(Special::Separator(Separator::Newline)),
4790 },
4791 PositionalToken {
4792 source: uws,
4793 offset: 1726,
4794 length: 4,
4795 token: Token::Special(Special::Separator(Separator::Space)),
4796 },
4797 PositionalToken {
4798 source: uws,
4799 offset: 2725,
4800 length: 1,
4801 token: Token::Special(Special::Separator(Separator::Newline)),
4802 },
4803 PositionalToken {
4804 source: uws,
4805 offset: 2726,
4806 length: 2,
4807 token: Token::Special(Special::Separator(Separator::Space)),
4808 },
4809 PositionalToken {
4810 source: uws,
4811 offset: 2888,
4812 length: 1,
4813 token: Token::Special(Special::Separator(Separator::Newline)),
4814 },
4815 PositionalToken {
4816 source: uws,
4817 offset: 2889,
4818 length: 2,
4819 token: Token::Special(Special::Separator(Separator::Space)),
4820 },
4821 PositionalToken {
4822 source: uws,
4823 offset: 2891,
4824 length: 1,
4825 token: Token::Special(Special::Separator(Separator::Newline)),
4826 },
4827 PositionalToken {
4828 source: uws,
4829 offset: 2904,
4830 length: 1,
4831 token: Token::Special(Special::Separator(Separator::Newline)),
4832 },
4833 PositionalToken {
4834 source: uws,
4835 offset: 2905,
4836 length: 4,
4837 token: Token::Special(Special::Separator(Separator::Space)),
4838 },
4839 ];
4840
4841 let text = Text::new({
4842 uws.into_source()
4843 .pipe(tagger::Builder::new().create().into_breaker())
4844 .pipe(entities::Builder::new().create().into_piped())
4845 .into_separator()
4846 })
4847 .unwrap();
4848
4849 let lib_res = text
4850 .into_tokenizer(TokenizerParams::v1())
4851 .filter_map(|tt| tt.into_original_token_1())
4852 .collect::<Vec<_>>();
4853
4854 check_results(&result, &lib_res, uws);
4855 }
4856
/// Checks tokenization with `Default` options: compound numericals such as
/// date-like strings ("12.02.18"), IP-like dotted runs ("127.0.0.1"),
/// measures ("1st", "1кг") and mixed digit/letter runs are kept as single
/// `Numerical` tokens rather than being split at punctuation.
///
/// NOTE: `offset`/`length` are byte positions — each Cyrillic character
/// contributes 2 bytes, which is why e.g. "1кг" has length 5.
#[test]
fn numerical_no_split() {
    let uws = "12.02.18 31.28.34 23.11.2018 123.568.365.234.578 127.0.0.1 1st 1кг 123123афываыв 12321фвафыов234выалфо 12_123_343.4234_4234";
    // Tokenize with the default options — the mode that keeps numericals whole.
    let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
    // Expected token stream, in source order.
    let result = vec![
        // Dot-separated digit groups stay as one token each.
        PositionalToken {
            source: uws,
            offset: 0,
            length: 8,
            token: Token::Word(Word::Numerical(Numerical::DotSeparated(
                "12.02.18".to_string(),
            ))),
        },
        PositionalToken {
            source: uws,
            offset: 8,
            length: 1,
            token: Token::Special(Special::Separator(Separator::Space)),
        },
        PositionalToken {
            source: uws,
            offset: 9,
            length: 8,
            token: Token::Word(Word::Numerical(Numerical::DotSeparated(
                "31.28.34".to_string(),
            ))),
        },
        PositionalToken {
            source: uws,
            offset: 17,
            length: 1,
            token: Token::Special(Special::Separator(Separator::Space)),
        },
        PositionalToken {
            source: uws,
            offset: 18,
            length: 10,
            token: Token::Word(Word::Numerical(Numerical::DotSeparated(
                "23.11.2018".to_string(),
            ))),
        },
        PositionalToken {
            source: uws,
            offset: 28,
            length: 1,
            token: Token::Special(Special::Separator(Separator::Space)),
        },
        PositionalToken {
            source: uws,
            offset: 29,
            length: 19,
            token: Token::Word(Word::Numerical(Numerical::DotSeparated(
                "123.568.365.234.578".to_string(),
            ))),
        },
        PositionalToken {
            source: uws,
            offset: 48,
            length: 1,
            token: Token::Special(Special::Separator(Separator::Space)),
        },
        PositionalToken {
            source: uws,
            offset: 49,
            length: 9,
            token: Token::Word(Word::Numerical(Numerical::DotSeparated(
                "127.0.0.1".to_string(),
            ))),
        },
        PositionalToken {
            source: uws,
            offset: 58,
            length: 1,
            token: Token::Special(Special::Separator(Separator::Space)),
        },
        // Digit + letter suffix => a "measures" numerical.
        PositionalToken {
            source: uws,
            offset: 59,
            length: 3,
            token: Token::Word(Word::Numerical(Numerical::Measures("1st".to_string()))),
        },
        PositionalToken {
            source: uws,
            offset: 62,
            length: 1,
            token: Token::Special(Special::Separator(Separator::Space)),
        },
        // length 5 = 1 ASCII digit + 2 Cyrillic letters x 2 bytes.
        PositionalToken {
            source: uws,
            offset: 63,
            length: 5,
            token: Token::Word(Word::Numerical(Numerical::Measures("1кг".to_string()))),
        },
        PositionalToken {
            source: uws,
            offset: 68,
            length: 1,
            token: Token::Special(Special::Separator(Separator::Space)),
        },
        PositionalToken {
            source: uws,
            offset: 69,
            length: 20,
            token: Token::Word(Word::Numerical(Numerical::Measures(
                "123123афываыв".to_string(),
            ))),
        },
        PositionalToken {
            source: uws,
            offset: 89,
            length: 1,
            token: Token::Special(Special::Separator(Separator::Space)),
        },
        // Interleaved digits/letters => alphanumeric (not a measure).
        PositionalToken {
            source: uws,
            offset: 90,
            length: 34,
            token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
                "12321фвафыов234выалфо".to_string(),
            ))),
        },
        PositionalToken {
            source: uws,
            offset: 124,
            length: 1,
            token: Token::Special(Special::Separator(Separator::Space)),
        },
        // Underscore-joined digit groups also remain one alphanumeric token.
        PositionalToken {
            source: uws,
            offset: 125,
            length: 20,
            token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
                "12_123_343.4234_4234".to_string(),
            ))),
        },
    ];
    check_results(&result, &lib_res, uws);
}
5046
/// Checks tokenization with `TokenizerParams::v1()`: dot- and
/// underscore-joined digit runs are split into individual `Number` tokens
/// with the joining characters emitted as `Punctuation`, while digit+letter
/// compounds ("1st", "1кг", mixed runs) still come out as `Numerical`.
///
/// NOTE: components with a leading zero ("02", "0") are expected as
/// `Number::ZeroInteger`; the `s` field that preserves the original text
/// exists only when the "strings" feature is enabled (see the `Number`
/// definition). `offset`/`length` are byte positions (Cyrillic = 2 bytes).
#[test]
fn numerical_default() {
    let uws = "12.02.18 31.28.34 23.11.2018 123.568.365.234.578 127.0.0.1 1st 1кг 123123афываыв 12321фвафыов234выалфо 12_123_343.4234_4234";
    let lib_res = uws
        .into_tokenizer(TokenizerParams::v1())
        .collect::<Vec<_>>();
    // Expected token stream, in source order.
    let result = vec![
        // "12.02.18" splits into 12 / '.' / 02 / '.' / 18.
        PositionalToken {
            source: uws,
            offset: 0,
            length: 2,
            token: Token::Word(Word::Number(Number::Integer(12))),
        },
        PositionalToken {
            source: uws,
            offset: 2,
            length: 1,
            token: Token::Special(Special::Punctuation('.')),
        },
        // Leading zero preserved via ZeroInteger.
        PositionalToken {
            source: uws,
            offset: 3,
            length: 2,
            token: Token::Word(Word::Number(Number::ZeroInteger {
                i: 2,
                s: "02".to_string(),
            })),
        },
        PositionalToken {
            source: uws,
            offset: 5,
            length: 1,
            token: Token::Special(Special::Punctuation('.')),
        },
        PositionalToken {
            source: uws,
            offset: 6,
            length: 2,
            token: Token::Word(Word::Number(Number::Integer(18))),
        },
        PositionalToken {
            source: uws,
            offset: 8,
            length: 1,
            token: Token::Special(Special::Separator(Separator::Space)),
        },
        // "31.28.34"
        PositionalToken {
            source: uws,
            offset: 9,
            length: 2,
            token: Token::Word(Word::Number(Number::Integer(31))),
        },
        PositionalToken {
            source: uws,
            offset: 11,
            length: 1,
            token: Token::Special(Special::Punctuation('.')),
        },
        PositionalToken {
            source: uws,
            offset: 12,
            length: 2,
            token: Token::Word(Word::Number(Number::Integer(28))),
        },
        PositionalToken {
            source: uws,
            offset: 14,
            length: 1,
            token: Token::Special(Special::Punctuation('.')),
        },
        PositionalToken {
            source: uws,
            offset: 15,
            length: 2,
            token: Token::Word(Word::Number(Number::Integer(34))),
        },
        PositionalToken {
            source: uws,
            offset: 17,
            length: 1,
            token: Token::Special(Special::Separator(Separator::Space)),
        },
        // "23.11.2018"
        PositionalToken {
            source: uws,
            offset: 18,
            length: 2,
            token: Token::Word(Word::Number(Number::Integer(23))),
        },
        PositionalToken {
            source: uws,
            offset: 20,
            length: 1,
            token: Token::Special(Special::Punctuation('.')),
        },
        PositionalToken {
            source: uws,
            offset: 21,
            length: 2,
            token: Token::Word(Word::Number(Number::Integer(11))),
        },
        PositionalToken {
            source: uws,
            offset: 23,
            length: 1,
            token: Token::Special(Special::Punctuation('.')),
        },
        PositionalToken {
            source: uws,
            offset: 24,
            length: 4,
            token: Token::Word(Word::Number(Number::Integer(2018))),
        },
        PositionalToken {
            source: uws,
            offset: 28,
            length: 1,
            token: Token::Special(Special::Separator(Separator::Space)),
        },
        // "123.568.365.234.578"
        PositionalToken {
            source: uws,
            offset: 29,
            length: 3,
            token: Token::Word(Word::Number(Number::Integer(123))),
        },
        PositionalToken {
            source: uws,
            offset: 32,
            length: 1,
            token: Token::Special(Special::Punctuation('.')),
        },
        PositionalToken {
            source: uws,
            offset: 33,
            length: 3,
            token: Token::Word(Word::Number(Number::Integer(568))),
        },
        PositionalToken {
            source: uws,
            offset: 36,
            length: 1,
            token: Token::Special(Special::Punctuation('.')),
        },
        PositionalToken {
            source: uws,
            offset: 37,
            length: 3,
            token: Token::Word(Word::Number(Number::Integer(365))),
        },
        PositionalToken {
            source: uws,
            offset: 40,
            length: 1,
            token: Token::Special(Special::Punctuation('.')),
        },
        PositionalToken {
            source: uws,
            offset: 41,
            length: 3,
            token: Token::Word(Word::Number(Number::Integer(234))),
        },
        PositionalToken {
            source: uws,
            offset: 44,
            length: 1,
            token: Token::Special(Special::Punctuation('.')),
        },
        PositionalToken {
            source: uws,
            offset: 45,
            length: 3,
            token: Token::Word(Word::Number(Number::Integer(578))),
        },
        PositionalToken {
            source: uws,
            offset: 48,
            length: 1,
            token: Token::Special(Special::Separator(Separator::Space)),
        },
        // "127.0.0.1" — the lone zeros come out as ZeroInteger.
        PositionalToken {
            source: uws,
            offset: 49,
            length: 3,
            token: Token::Word(Word::Number(Number::Integer(127))),
        },
        PositionalToken {
            source: uws,
            offset: 52,
            length: 1,
            token: Token::Special(Special::Punctuation('.')),
        },
        PositionalToken {
            source: uws,
            offset: 53,
            length: 1,
            token: Token::Word(Word::Number(Number::ZeroInteger {
                i: 0,
                s: "0".to_string(),
            })),
        },
        PositionalToken {
            source: uws,
            offset: 54,
            length: 1,
            token: Token::Special(Special::Punctuation('.')),
        },
        PositionalToken {
            source: uws,
            offset: 55,
            length: 1,
            token: Token::Word(Word::Number(Number::ZeroInteger {
                i: 0,
                s: "0".to_string(),
            })),
        },
        PositionalToken {
            source: uws,
            offset: 56,
            length: 1,
            token: Token::Special(Special::Punctuation('.')),
        },
        PositionalToken {
            source: uws,
            offset: 57,
            length: 1,
            token: Token::Word(Word::Number(Number::Integer(1))),
        },
        PositionalToken {
            source: uws,
            offset: 58,
            length: 1,
            token: Token::Special(Special::Separator(Separator::Space)),
        },
        // Digit+letter compounds are NOT split even under v1 params.
        PositionalToken {
            source: uws,
            offset: 59,
            length: 3,
            token: Token::Word(Word::Numerical(Numerical::Measures("1st".to_string()))),
        },
        PositionalToken {
            source: uws,
            offset: 62,
            length: 1,
            token: Token::Special(Special::Separator(Separator::Space)),
        },
        // length 5 = 1 ASCII digit + 2 Cyrillic letters x 2 bytes.
        PositionalToken {
            source: uws,
            offset: 63,
            length: 5,
            token: Token::Word(Word::Numerical(Numerical::Measures("1кг".to_string()))),
        },
        PositionalToken {
            source: uws,
            offset: 68,
            length: 1,
            token: Token::Special(Special::Separator(Separator::Space)),
        },
        PositionalToken {
            source: uws,
            offset: 69,
            length: 20,
            token: Token::Word(Word::Numerical(Numerical::Measures(
                "123123афываыв".to_string(),
            ))),
        },
        PositionalToken {
            source: uws,
            offset: 89,
            length: 1,
            token: Token::Special(Special::Separator(Separator::Space)),
        },
        PositionalToken {
            source: uws,
            offset: 90,
            length: 34,
            token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
                "12321фвафыов234выалфо".to_string(),
            ))),
        },
        PositionalToken {
            source: uws,
            offset: 124,
            length: 1,
            token: Token::Special(Special::Separator(Separator::Space)),
        },
        // "12_123_343.4234_4234" splits at '_' and '.' under v1 params.
        PositionalToken {
            source: uws,
            offset: 125,
            length: 2,
            token: Token::Word(Word::Number(Number::Integer(12))),
        },
        PositionalToken {
            source: uws,
            offset: 127,
            length: 1,
            token: Token::Special(Special::Punctuation('_')),
        },
        PositionalToken {
            source: uws,
            offset: 128,
            length: 3,
            token: Token::Word(Word::Number(Number::Integer(123))),
        },
        PositionalToken {
            source: uws,
            offset: 131,
            length: 1,
            token: Token::Special(Special::Punctuation('_')),
        },
        PositionalToken {
            source: uws,
            offset: 132,
            length: 3,
            token: Token::Word(Word::Number(Number::Integer(343))),
        },
        PositionalToken {
            source: uws,
            offset: 135,
            length: 1,
            token: Token::Special(Special::Punctuation('.')),
        },
        PositionalToken {
            source: uws,
            offset: 136,
            length: 4,
            token: Token::Word(Word::Number(Number::Integer(4234))),
        },
        PositionalToken {
            source: uws,
            offset: 140,
            length: 1,
            token: Token::Special(Special::Punctuation('_')),
        },
        PositionalToken {
            source: uws,
            offset: 141,
            length: 4,
            token: Token::Word(Word::Number(Number::Integer(4234))),
        },
    ];
    check_results(&result, &lib_res, uws);
}
5389
// Scripts covered by the per-language tokenizer tests below; each variant
// selects a text fixture and its expected token list in `get_lang_test`.
enum Lang {
    Zho, // Chinese
    Jpn, // Japanese
    Kor, // Korean
    Ara, // Arabic/Persian script
    Ell, // Greek
}
5409
/// Tokenizes the Chinese fixture with v1 params and compares the output
/// against the expected token list from `get_lang_test`.
#[test]
fn test_lang_zho() {
    let (text, expected) = get_lang_test(Lang::Zho);
    let produced: Vec<_> = text.into_tokenizer(TokenizerParams::v1()).collect();
    check_results(&expected, &produced, &text);
}
5418
/// Tokenizes the Japanese fixture with v1 params and compares the output
/// against the expected token list from `get_lang_test`.
#[test]
fn test_lang_jpn() {
    let (text, expected) = get_lang_test(Lang::Jpn);
    let produced: Vec<_> = text.into_tokenizer(TokenizerParams::v1()).collect();
    check_results(&expected, &produced, &text);
}
5427
/// Tokenizes the Korean fixture with v1 params and compares the output
/// against the expected token list from `get_lang_test`.
#[test]
fn test_lang_kor() {
    let (text, expected) = get_lang_test(Lang::Kor);
    let produced: Vec<_> = text.into_tokenizer(TokenizerParams::v1()).collect();
    check_results(&expected, &produced, &text);
}
5436
/// Tokenizes the Arabic-script fixture with v1 params and compares the
/// output against the expected token list from `get_lang_test`.
#[test]
fn test_lang_ara() {
    let (text, expected) = get_lang_test(Lang::Ara);
    let produced: Vec<_> = text.into_tokenizer(TokenizerParams::v1()).collect();
    check_results(&expected, &produced, &text);
}
5445
/// Tokenizes the Greek fixture with v1 params and compares the output
/// against the expected token list from `get_lang_test`.
#[test]
fn test_lang_ell() {
    let (text, expected) = get_lang_test(Lang::Ell);
    let produced: Vec<_> = text.into_tokenizer(TokenizerParams::v1()).collect();
    check_results(&expected, &produced, &text);
}
5454
5455 fn get_lang_test(lng: Lang) -> (String, Vec<PositionalToken>) {
5456 let uws = match lng {
5457 Lang::Zho => {
5458 "美国电视连续剧《超人前传》的第一集《试播集》于2001年10月16日在電視網首播,剧集主创人阿尔弗雷德·高夫和迈尔斯·米勒編劇,大卫·努特尔执导。这一试播首次向观众引荐了克拉克·肯特一角,他是位拥有超能力的外星孤儿,与家人和朋友一起在堪薩斯州虚构小镇斯莫维尔生活。在这一集里,肯特首度得知自己的来历,同时还需要阻止一位学生试图杀死镇上高中多名学生的报复之举。本集节目里引入了多个之后将贯穿全季甚至整部剧集的主题元素,例如几位主要角色之间的三角恋情。电视剧在加拿大溫哥華取景,旨在选用其“美国中产阶级”景观,主创人花了5个月的时间专门用于为主角物色合适的演员。试播集在所有演员选好4天后正式开拍。由于时间上的限制,剧组无法搭建好实体外景,因此只能使用计算机绘图技术将数字化的外景插入到镜头中。节目一经上映就打破了电视网的多项收视纪录,并且获得了评论员的普遍好评和多个奖项提名,并在其中两项上胜出"
5459 }
5460 Lang::Kor => {
5461 "플레이스테이션 은 소니 컴퓨터 엔터테인먼트가 개발한 세 번째 가정용 게임기이다. 마이크로소프트의 엑스박스 360, 닌텐도의 Wii와 경쟁하고 있다. 이전 제품에서 온라인 플레이 기능을 비디오 게임 개발사에 전적으로 의존하던 것과 달리 통합 온라인 게임 서비스인 플레이스테이션 네트워크 서비스를 발매와 함께 시작해 제공하고 있으며, 탄탄한 멀티미디어 재생 기능, 플레이스테이션 포터블과의 연결, 고화질 광학 디스크 포맷인 블루레이 디스크 재생 기능 등의 기능을 갖추고 있다. 2006년 11월 11일에 일본에서 처음으로 출시했으며, 11월 17일에는 북미 지역, 2007년 3월 23일에는 유럽과 오세아니아 지역에서, 대한민국의 경우 6월 5일부터 일주일간 예약판매를 실시해, 매일 준비한 수량이 동이 나는 등 많은 관심을 받았으며 6월 16일에 정식 출시 행사를 열었다"
5462 }
5463 Lang::Jpn => {
5464 "熊野三山本願所は、15世紀末以降における熊野三山(熊野本宮、熊野新宮、熊野那智)の造営・修造のための勧進を担った組織の総称。 熊野三山を含めて、日本における古代から中世前半にかけての寺社の造営は、寺社領経営のような恒常的財源、幕府や朝廷などからの一時的な造営料所の寄進、あるいは公権力からの臨時の保護によって行われていた。しかしながら、熊野三山では、これらの財源はすべて15世紀半ばまでに実効性を失った"
5465 }
5466 Lang::Ara => {
5467 "لشکرکشیهای روسهای وارنگی به دریای خزر مجموعهای از حملات نظامی در بین سالهای ۸۶۴ تا ۱۰۴۱ میلادی به سواحل دریای خزر بودهاست. روسهای وارنگی ابتدا در قرن نهم میلادی به عنوان بازرگانان پوست، عسل و برده در سرزمینهای اسلامی(سرکلند) ظاهر شدند. این بازرگانان در مسیر تجاری ولگا به خرید و فروش میپرداختند. نخستین حملهٔ آنان در فاصله سالهای ۸۶۴ تا ۸۸۴ میلادی در مقیاسی کوچک علیه علویان طبرستان رخ داد. نخستین یورش بزرگ روسها در سال ۹۱۳ رخ داد و آنان با ۵۰۰ فروند درازکشتی شهر گرگان و اطراف آن را غارت کردند. آنها در این حمله مقداری کالا و برده را به تاراج بردند و در راه بازگشتن به سمت شمال، در دلتای ولگا، مورد حملهٔ خزرهای مسلمان قرار گرفتند و بعضی از آنان موفق به فرار شدند، ولی در میانهٔ ولگا به قتل رسیدند. دومین هجوم بزرگ روسها به دریای خزر در سال ۹۴۳ به وقوع پیوست. در این دوره ایگور یکم، حاکم روس کیف، رهبری روسها را در دست داشت. روسها پس از توافق با دولت خزرها برای عبور امن از منطقه، تا رود کورا و اعماق قفقاز پیش رفتند و در سال ۹۴۳ موفق شدند بندر بردعه، پایتخت اران (جمهوری آذربایجان کنونی)، را تصرف کنند. روسها در آنجا به مدت چند ماه ماندند و بسیاری از ساکنان شهر را کشتند و از راه غارتگری اموالی را به تاراج بردند. تنها دلیل بازگشت آنان "
5468 }
5469 Lang::Ell => {
5470 "Το Πρόγραμμα υλοποιείται εξ ολοκλήρου από απόσταση και μπορεί να συμμετέχει κάθε εμπλεκόμενος στη ή/και ενδιαφερόμενος για τη διδασκαλία της Ελληνικής ως δεύτερης/ξένης γλώσσας στην Ελλάδα και στο εξωτερικό, αρκεί να είναι απόφοιτος ελληνικής φιλολογίας, ξένων φιλολογιών, παιδαγωγικών τμημάτων, θεολογικών σχολών ή άλλων πανεπιστημιακών τμημάτων ελληνικών ή ισότιμων ξένων πανεπιστημίων. Υπό όρους γίνονται δεκτοί υποψήφιοι που δεν έχουν ολοκληρώσει σπουδές τριτοβάθμιας εκπαίδευσης."
5471 }
5472 };
5473 let tokens = match lng {
5474 Lang::Zho => vec![
5475 PositionalToken {
5476 source: uws,
5477 offset: 0,
5478 length: 3,
5479 token: Token::Word(Word::Word("美".to_string())),
5480 },
5481 PositionalToken {
5482 source: uws,
5483 offset: 3,
5484 length: 3,
5485 token: Token::Word(Word::Word("国".to_string())),
5486 },
5487 PositionalToken {
5488 source: uws,
5489 offset: 6,
5490 length: 3,
5491 token: Token::Word(Word::Word("电".to_string())),
5492 },
5493 PositionalToken {
5494 source: uws,
5495 offset: 9,
5496 length: 3,
5497 token: Token::Word(Word::Word("视".to_string())),
5498 },
5499 PositionalToken {
5500 source: uws,
5501 offset: 12,
5502 length: 3,
5503 token: Token::Word(Word::Word("连".to_string())),
5504 },
5505 PositionalToken {
5506 source: uws,
5507 offset: 15,
5508 length: 3,
5509 token: Token::Word(Word::Word("续".to_string())),
5510 },
5511 PositionalToken {
5512 source: uws,
5513 offset: 18,
5514 length: 3,
5515 token: Token::Word(Word::Word("剧".to_string())),
5516 },
5517 PositionalToken {
5518 source: uws,
5519 offset: 21,
5520 length: 3,
5521 token: Token::Special(Special::Punctuation('《')),
5522 },
5523 PositionalToken {
5524 source: uws,
5525 offset: 24,
5526 length: 3,
5527 token: Token::Word(Word::Word("超".to_string())),
5528 },
5529 PositionalToken {
5530 source: uws,
5531 offset: 27,
5532 length: 3,
5533 token: Token::Word(Word::Word("人".to_string())),
5534 },
5535 PositionalToken {
5536 source: uws,
5537 offset: 30,
5538 length: 3,
5539 token: Token::Word(Word::Word("前".to_string())),
5540 },
5541 PositionalToken {
5542 source: uws,
5543 offset: 33,
5544 length: 3,
5545 token: Token::Word(Word::Word("传".to_string())),
5546 },
5547 PositionalToken {
5548 source: uws,
5549 offset: 36,
5550 length: 3,
5551 token: Token::Special(Special::Punctuation('》')),
5552 },
5553 PositionalToken {
5554 source: uws,
5555 offset: 39,
5556 length: 3,
5557 token: Token::Word(Word::Word("的".to_string())),
5558 },
5559 PositionalToken {
5560 source: uws,
5561 offset: 42,
5562 length: 3,
5563 token: Token::Word(Word::Word("第".to_string())),
5564 },
5565 PositionalToken {
5566 source: uws,
5567 offset: 45,
5568 length: 3,
5569 token: Token::Word(Word::Word("一".to_string())),
5570 },
5571 PositionalToken {
5572 source: uws,
5573 offset: 48,
5574 length: 3,
5575 token: Token::Word(Word::Word("集".to_string())),
5576 },
5577 PositionalToken {
5578 source: uws,
5579 offset: 51,
5580 length: 3,
5581 token: Token::Special(Special::Punctuation('《')),
5582 },
5583 PositionalToken {
5584 source: uws,
5585 offset: 54,
5586 length: 3,
5587 token: Token::Word(Word::Word("试".to_string())),
5588 },
5589 PositionalToken {
5590 source: uws,
5591 offset: 57,
5592 length: 3,
5593 token: Token::Word(Word::Word("播".to_string())),
5594 },
5595 PositionalToken {
5596 source: uws,
5597 offset: 60,
5598 length: 3,
5599 token: Token::Word(Word::Word("集".to_string())),
5600 },
5601 PositionalToken {
5602 source: uws,
5603 offset: 63,
5604 length: 3,
5605 token: Token::Special(Special::Punctuation('》')),
5606 },
5607 PositionalToken {
5608 source: uws,
5609 offset: 66,
5610 length: 3,
5611 token: Token::Word(Word::Word("于".to_string())),
5612 },
5613 PositionalToken {
5614 source: uws,
5615 offset: 69,
5616 length: 4,
5617 token: Token::Word(Word::Number(Number::Integer(2001))),
5618 },
5619 PositionalToken {
5620 source: uws,
5621 offset: 73,
5622 length: 3,
5623 token: Token::Word(Word::Word("年".to_string())),
5624 },
5625 PositionalToken {
5626 source: uws,
5627 offset: 76,
5628 length: 2,
5629 token: Token::Word(Word::Number(Number::Integer(10))),
5630 },
5631 PositionalToken {
5632 source: uws,
5633 offset: 78,
5634 length: 3,
5635 token: Token::Word(Word::Word("月".to_string())),
5636 },
5637 PositionalToken {
5638 source: uws,
5639 offset: 81,
5640 length: 2,
5641 token: Token::Word(Word::Number(Number::Integer(16))),
5642 },
5643 PositionalToken {
5644 source: uws,
5645 offset: 83,
5646 length: 3,
5647 token: Token::Word(Word::Word("日".to_string())),
5648 },
5649 PositionalToken {
5650 source: uws,
5651 offset: 86,
5652 length: 3,
5653 token: Token::Word(Word::Word("在".to_string())),
5654 },
5655 PositionalToken {
5656 source: uws,
5657 offset: 89,
5658 length: 3,
5659 token: Token::Word(Word::Word("電".to_string())),
5660 },
5661 PositionalToken {
5662 source: uws,
5663 offset: 92,
5664 length: 3,
5665 token: Token::Word(Word::Word("視".to_string())),
5666 },
5667 PositionalToken {
5668 source: uws,
5669 offset: 95,
5670 length: 3,
5671 token: Token::Word(Word::Word("網".to_string())),
5672 },
5673 PositionalToken {
5674 source: uws,
5675 offset: 98,
5676 length: 3,
5677 token: Token::Word(Word::Word("首".to_string())),
5678 },
5679 PositionalToken {
5680 source: uws,
5681 offset: 101,
5682 length: 3,
5683 token: Token::Word(Word::Word("播".to_string())),
5684 },
5685 PositionalToken {
5686 source: uws,
5687 offset: 104,
5688 length: 3,
5689 token: Token::Special(Special::Punctuation(',')),
5690 },
5691 PositionalToken {
5692 source: uws,
5693 offset: 107,
5694 length: 3,
5695 token: Token::Word(Word::Word("剧".to_string())),
5696 },
5697 PositionalToken {
5698 source: uws,
5699 offset: 110,
5700 length: 3,
5701 token: Token::Word(Word::Word("集".to_string())),
5702 },
5703 PositionalToken {
5704 source: uws,
5705 offset: 113,
5706 length: 3,
5707 token: Token::Word(Word::Word("主".to_string())),
5708 },
5709 PositionalToken {
5710 source: uws,
5711 offset: 116,
5712 length: 3,
5713 token: Token::Word(Word::Word("创".to_string())),
5714 },
5715 PositionalToken {
5716 source: uws,
5717 offset: 119,
5718 length: 3,
5719 token: Token::Word(Word::Word("人".to_string())),
5720 },
5721 PositionalToken {
5722 source: uws,
5723 offset: 122,
5724 length: 3,
5725 token: Token::Word(Word::Word("阿".to_string())),
5726 },
5727 PositionalToken {
5728 source: uws,
5729 offset: 125,
5730 length: 3,
5731 token: Token::Word(Word::Word("尔".to_string())),
5732 },
5733 PositionalToken {
5734 source: uws,
5735 offset: 128,
5736 length: 3,
5737 token: Token::Word(Word::Word("弗".to_string())),
5738 },
5739 PositionalToken {
5740 source: uws,
5741 offset: 131,
5742 length: 3,
5743 token: Token::Word(Word::Word("雷".to_string())),
5744 },
5745 PositionalToken {
5746 source: uws,
5747 offset: 134,
5748 length: 3,
5749 token: Token::Word(Word::Word("德".to_string())),
5750 },
5751 PositionalToken {
5752 source: uws,
5753 offset: 137,
5754 length: 2,
5755 token: Token::Special(Special::Punctuation('·')),
5756 },
5757 PositionalToken {
5758 source: uws,
5759 offset: 139,
5760 length: 3,
5761 token: Token::Word(Word::Word("高".to_string())),
5762 },
5763 PositionalToken {
5764 source: uws,
5765 offset: 142,
5766 length: 3,
5767 token: Token::Word(Word::Word("夫".to_string())),
5768 },
5769 PositionalToken {
5770 source: uws,
5771 offset: 145,
5772 length: 3,
5773 token: Token::Word(Word::Word("和".to_string())),
5774 },
5775 PositionalToken {
5776 source: uws,
5777 offset: 148,
5778 length: 3,
5779 token: Token::Word(Word::Word("迈".to_string())),
5780 },
5781 PositionalToken {
5782 source: uws,
5783 offset: 151,
5784 length: 3,
5785 token: Token::Word(Word::Word("尔".to_string())),
5786 },
5787 PositionalToken {
5788 source: uws,
5789 offset: 154,
5790 length: 3,
5791 token: Token::Word(Word::Word("斯".to_string())),
5792 },
5793 PositionalToken {
5794 source: uws,
5795 offset: 157,
5796 length: 2,
5797 token: Token::Special(Special::Punctuation('·')),
5798 },
5799 PositionalToken {
5800 source: uws,
5801 offset: 159,
5802 length: 3,
5803 token: Token::Word(Word::Word("米".to_string())),
5804 },
5805 PositionalToken {
5806 source: uws,
5807 offset: 162,
5808 length: 3,
5809 token: Token::Word(Word::Word("勒".to_string())),
5810 },
5811 PositionalToken {
5812 source: uws,
5813 offset: 165,
5814 length: 3,
5815 token: Token::Word(Word::Word("編".to_string())),
5816 },
5817 PositionalToken {
5818 source: uws,
5819 offset: 168,
5820 length: 3,
5821 token: Token::Word(Word::Word("劇".to_string())),
5822 },
5823 PositionalToken {
5824 source: uws,
5825 offset: 171,
5826 length: 3,
5827 token: Token::Special(Special::Punctuation(',')),
5828 },
5829 PositionalToken {
5830 source: uws,
5831 offset: 174,
5832 length: 3,
5833 token: Token::Word(Word::Word("大".to_string())),
5834 },
5835 PositionalToken {
5836 source: uws,
5837 offset: 177,
5838 length: 3,
5839 token: Token::Word(Word::Word("卫".to_string())),
5840 },
5841 PositionalToken {
5842 source: uws,
5843 offset: 180,
5844 length: 2,
5845 token: Token::Special(Special::Punctuation('·')),
5846 },
5847 PositionalToken {
5848 source: uws,
5849 offset: 182,
5850 length: 3,
5851 token: Token::Word(Word::Word("努".to_string())),
5852 },
5853 PositionalToken {
5854 source: uws,
5855 offset: 185,
5856 length: 3,
5857 token: Token::Word(Word::Word("特".to_string())),
5858 },
5859 PositionalToken {
5860 source: uws,
5861 offset: 188,
5862 length: 3,
5863 token: Token::Word(Word::Word("尔".to_string())),
5864 },
5865 PositionalToken {
5866 source: uws,
5867 offset: 191,
5868 length: 3,
5869 token: Token::Word(Word::Word("执".to_string())),
5870 },
5871 PositionalToken {
5872 source: uws,
5873 offset: 194,
5874 length: 3,
5875 token: Token::Word(Word::Word("导".to_string())),
5876 },
5877 PositionalToken {
5878 source: uws,
5879 offset: 197,
5880 length: 3,
5881 token: Token::Special(Special::Punctuation('。')),
5882 },
5883 PositionalToken {
5884 source: uws,
5885 offset: 200,
5886 length: 3,
5887 token: Token::Word(Word::Word("这".to_string())),
5888 },
5889 PositionalToken {
5890 source: uws,
5891 offset: 203,
5892 length: 3,
5893 token: Token::Word(Word::Word("一".to_string())),
5894 },
5895 PositionalToken {
5896 source: uws,
5897 offset: 206,
5898 length: 3,
5899 token: Token::Word(Word::Word("试".to_string())),
5900 },
5901 PositionalToken {
5902 source: uws,
5903 offset: 209,
5904 length: 3,
5905 token: Token::Word(Word::Word("播".to_string())),
5906 },
5907 PositionalToken {
5908 source: uws,
5909 offset: 212,
5910 length: 3,
5911 token: Token::Word(Word::Word("首".to_string())),
5912 },
5913 PositionalToken {
5914 source: uws,
5915 offset: 215,
5916 length: 3,
5917 token: Token::Word(Word::Word("次".to_string())),
5918 },
5919 PositionalToken {
5920 source: uws,
5921 offset: 218,
5922 length: 3,
5923 token: Token::Word(Word::Word("向".to_string())),
5924 },
5925 PositionalToken {
5926 source: uws,
5927 offset: 221,
5928 length: 3,
5929 token: Token::Word(Word::Word("观".to_string())),
5930 },
5931 PositionalToken {
5932 source: uws,
5933 offset: 224,
5934 length: 3,
5935 token: Token::Word(Word::Word("众".to_string())),
5936 },
5937 PositionalToken {
5938 source: uws,
5939 offset: 227,
5940 length: 3,
5941 token: Token::Word(Word::Word("引".to_string())),
5942 },
5943 PositionalToken {
5944 source: uws,
5945 offset: 230,
5946 length: 3,
5947 token: Token::Word(Word::Word("荐".to_string())),
5948 },
5949 PositionalToken {
5950 source: uws,
5951 offset: 233,
5952 length: 3,
5953 token: Token::Word(Word::Word("了".to_string())),
5954 },
5955 PositionalToken {
5956 source: uws,
5957 offset: 236,
5958 length: 3,
5959 token: Token::Word(Word::Word("克".to_string())),
5960 },
5961 PositionalToken {
5962 source: uws,
5963 offset: 239,
5964 length: 3,
5965 token: Token::Word(Word::Word("拉".to_string())),
5966 },
5967 PositionalToken {
5968 source: uws,
5969 offset: 242,
5970 length: 3,
5971 token: Token::Word(Word::Word("克".to_string())),
5972 },
5973 PositionalToken {
5974 source: uws,
5975 offset: 245,
5976 length: 2,
5977 token: Token::Special(Special::Punctuation('·')),
5978 },
5979 PositionalToken {
5980 source: uws,
5981 offset: 247,
5982 length: 3,
5983 token: Token::Word(Word::Word("肯".to_string())),
5984 },
5985 PositionalToken {
5986 source: uws,
5987 offset: 250,
5988 length: 3,
5989 token: Token::Word(Word::Word("特".to_string())),
5990 },
5991 PositionalToken {
5992 source: uws,
5993 offset: 253,
5994 length: 3,
5995 token: Token::Word(Word::Word("一".to_string())),
5996 },
5997 PositionalToken {
5998 source: uws,
5999 offset: 256,
6000 length: 3,
6001 token: Token::Word(Word::Word("角".to_string())),
6002 },
6003 PositionalToken {
6004 source: uws,
6005 offset: 259,
6006 length: 3,
6007 token: Token::Special(Special::Punctuation(',')),
6008 },
6009 PositionalToken {
6010 source: uws,
6011 offset: 262,
6012 length: 3,
6013 token: Token::Word(Word::Word("他".to_string())),
6014 },
6015 PositionalToken {
6016 source: uws,
6017 offset: 265,
6018 length: 3,
6019 token: Token::Word(Word::Word("是".to_string())),
6020 },
6021 PositionalToken {
6022 source: uws,
6023 offset: 268,
6024 length: 3,
6025 token: Token::Word(Word::Word("位".to_string())),
6026 },
6027 PositionalToken {
6028 source: uws,
6029 offset: 271,
6030 length: 3,
6031 token: Token::Word(Word::Word("拥".to_string())),
6032 },
6033 PositionalToken {
6034 source: uws,
6035 offset: 274,
6036 length: 3,
6037 token: Token::Word(Word::Word("有".to_string())),
6038 },
6039 PositionalToken {
6040 source: uws,
6041 offset: 277,
6042 length: 3,
6043 token: Token::Word(Word::Word("超".to_string())),
6044 },
6045 ],
6046 Lang::Jpn => vec![
6047 PositionalToken {
6048 source: uws,
6049 offset: 0,
6050 length: 3,
6051 token: Token::Word(Word::Word("熊".to_string())),
6052 },
6053 PositionalToken {
6054 source: uws,
6055 offset: 3,
6056 length: 3,
6057 token: Token::Word(Word::Word("野".to_string())),
6058 },
6059 PositionalToken {
6060 source: uws,
6061 offset: 6,
6062 length: 3,
6063 token: Token::Word(Word::Word("三".to_string())),
6064 },
6065 PositionalToken {
6066 source: uws,
6067 offset: 9,
6068 length: 3,
6069 token: Token::Word(Word::Word("山".to_string())),
6070 },
6071 PositionalToken {
6072 source: uws,
6073 offset: 12,
6074 length: 3,
6075 token: Token::Word(Word::Word("本".to_string())),
6076 },
6077 PositionalToken {
6078 source: uws,
6079 offset: 15,
6080 length: 3,
6081 token: Token::Word(Word::Word("願".to_string())),
6082 },
6083 PositionalToken {
6084 source: uws,
6085 offset: 18,
6086 length: 3,
6087 token: Token::Word(Word::Word("所".to_string())),
6088 },
6089 PositionalToken {
6090 source: uws,
6091 offset: 21,
6092 length: 3,
6093 token: Token::Word(Word::Word("は".to_string())),
6094 },
6095 PositionalToken {
6096 source: uws,
6097 offset: 24,
6098 length: 3,
6099 token: Token::Special(Special::Punctuation('、')),
6100 },
6101 PositionalToken {
6102 source: uws,
6103 offset: 27,
6104 length: 2,
6105 token: Token::Word(Word::Number(Number::Integer(15))),
6106 },
6107 PositionalToken {
6108 source: uws,
6109 offset: 29,
6110 length: 3,
6111 token: Token::Word(Word::Word("世".to_string())),
6112 },
6113 PositionalToken {
6114 source: uws,
6115 offset: 32,
6116 length: 3,
6117 token: Token::Word(Word::Word("紀".to_string())),
6118 },
6119 PositionalToken {
6120 source: uws,
6121 offset: 35,
6122 length: 3,
6123 token: Token::Word(Word::Word("末".to_string())),
6124 },
6125 PositionalToken {
6126 source: uws,
6127 offset: 38,
6128 length: 3,
6129 token: Token::Word(Word::Word("以".to_string())),
6130 },
6131 PositionalToken {
6132 source: uws,
6133 offset: 41,
6134 length: 3,
6135 token: Token::Word(Word::Word("降".to_string())),
6136 },
6137 PositionalToken {
6138 source: uws,
6139 offset: 44,
6140 length: 3,
6141 token: Token::Word(Word::Word("に".to_string())),
6142 },
6143 PositionalToken {
6144 source: uws,
6145 offset: 47,
6146 length: 3,
6147 token: Token::Word(Word::Word("お".to_string())),
6148 },
6149 PositionalToken {
6150 source: uws,
6151 offset: 50,
6152 length: 3,
6153 token: Token::Word(Word::Word("け".to_string())),
6154 },
6155 PositionalToken {
6156 source: uws,
6157 offset: 53,
6158 length: 3,
6159 token: Token::Word(Word::Word("る".to_string())),
6160 },
6161 PositionalToken {
6162 source: uws,
6163 offset: 56,
6164 length: 3,
6165 token: Token::Word(Word::Word("熊".to_string())),
6166 },
6167 PositionalToken {
6168 source: uws,
6169 offset: 59,
6170 length: 3,
6171 token: Token::Word(Word::Word("野".to_string())),
6172 },
6173 PositionalToken {
6174 source: uws,
6175 offset: 62,
6176 length: 3,
6177 token: Token::Word(Word::Word("三".to_string())),
6178 },
6179 PositionalToken {
6180 source: uws,
6181 offset: 65,
6182 length: 3,
6183 token: Token::Word(Word::Word("山".to_string())),
6184 },
6185 PositionalToken {
6186 source: uws,
6187 offset: 68,
6188 length: 3,
6189 token: Token::Special(Special::Punctuation('(')),
6190 },
6191 PositionalToken {
6192 source: uws,
6193 offset: 71,
6194 length: 3,
6195 token: Token::Word(Word::Word("熊".to_string())),
6196 },
6197 PositionalToken {
6198 source: uws,
6199 offset: 74,
6200 length: 3,
6201 token: Token::Word(Word::Word("野".to_string())),
6202 },
6203 PositionalToken {
6204 source: uws,
6205 offset: 77,
6206 length: 3,
6207 token: Token::Word(Word::Word("本".to_string())),
6208 },
6209 PositionalToken {
6210 source: uws,
6211 offset: 80,
6212 length: 3,
6213 token: Token::Word(Word::Word("宮".to_string())),
6214 },
6215 PositionalToken {
6216 source: uws,
6217 offset: 83,
6218 length: 3,
6219 token: Token::Special(Special::Punctuation('、')),
6220 },
6221 PositionalToken {
6222 source: uws,
6223 offset: 86,
6224 length: 3,
6225 token: Token::Word(Word::Word("熊".to_string())),
6226 },
6227 PositionalToken {
6228 source: uws,
6229 offset: 89,
6230 length: 3,
6231 token: Token::Word(Word::Word("野".to_string())),
6232 },
6233 PositionalToken {
6234 source: uws,
6235 offset: 92,
6236 length: 3,
6237 token: Token::Word(Word::Word("新".to_string())),
6238 },
6239 PositionalToken {
6240 source: uws,
6241 offset: 95,
6242 length: 3,
6243 token: Token::Word(Word::Word("宮".to_string())),
6244 },
6245 PositionalToken {
6246 source: uws,
6247 offset: 98,
6248 length: 3,
6249 token: Token::Special(Special::Punctuation('、')),
6250 },
6251 PositionalToken {
6252 source: uws,
6253 offset: 101,
6254 length: 3,
6255 token: Token::Word(Word::Word("熊".to_string())),
6256 },
6257 PositionalToken {
6258 source: uws,
6259 offset: 104,
6260 length: 3,
6261 token: Token::Word(Word::Word("野".to_string())),
6262 },
6263 PositionalToken {
6264 source: uws,
6265 offset: 107,
6266 length: 3,
6267 token: Token::Word(Word::Word("那".to_string())),
6268 },
6269 PositionalToken {
6270 source: uws,
6271 offset: 110,
6272 length: 3,
6273 token: Token::Word(Word::Word("智".to_string())),
6274 },
6275 PositionalToken {
6276 source: uws,
6277 offset: 113,
6278 length: 3,
6279 token: Token::Special(Special::Punctuation(')')),
6280 },
6281 PositionalToken {
6282 source: uws,
6283 offset: 116,
6284 length: 3,
6285 token: Token::Word(Word::Word("の".to_string())),
6286 },
6287 PositionalToken {
6288 source: uws,
6289 offset: 119,
6290 length: 3,
6291 token: Token::Word(Word::Word("造".to_string())),
6292 },
6293 PositionalToken {
6294 source: uws,
6295 offset: 122,
6296 length: 3,
6297 token: Token::Word(Word::Word("営".to_string())),
6298 },
6299 PositionalToken {
6300 source: uws,
6301 offset: 125,
6302 length: 3,
6303 token: Token::Special(Special::Punctuation('・')),
6304 },
6305 PositionalToken {
6306 source: uws,
6307 offset: 128,
6308 length: 3,
6309 token: Token::Word(Word::Word("修".to_string())),
6310 },
6311 PositionalToken {
6312 source: uws,
6313 offset: 131,
6314 length: 3,
6315 token: Token::Word(Word::Word("造".to_string())),
6316 },
6317 PositionalToken {
6318 source: uws,
6319 offset: 134,
6320 length: 3,
6321 token: Token::Word(Word::Word("の".to_string())),
6322 },
6323 PositionalToken {
6324 source: uws,
6325 offset: 137,
6326 length: 3,
6327 token: Token::Word(Word::Word("た".to_string())),
6328 },
6329 PositionalToken {
6330 source: uws,
6331 offset: 140,
6332 length: 3,
6333 token: Token::Word(Word::Word("め".to_string())),
6334 },
6335 PositionalToken {
6336 source: uws,
6337 offset: 143,
6338 length: 3,
6339 token: Token::Word(Word::Word("の".to_string())),
6340 },
6341 PositionalToken {
6342 source: uws,
6343 offset: 146,
6344 length: 3,
6345 token: Token::Word(Word::Word("勧".to_string())),
6346 },
6347 PositionalToken {
6348 source: uws,
6349 offset: 149,
6350 length: 3,
6351 token: Token::Word(Word::Word("進".to_string())),
6352 },
6353 PositionalToken {
6354 source: uws,
6355 offset: 152,
6356 length: 3,
6357 token: Token::Word(Word::Word("を".to_string())),
6358 },
6359 PositionalToken {
6360 source: uws,
6361 offset: 155,
6362 length: 3,
6363 token: Token::Word(Word::Word("担".to_string())),
6364 },
6365 PositionalToken {
6366 source: uws,
6367 offset: 158,
6368 length: 3,
6369 token: Token::Word(Word::Word("っ".to_string())),
6370 },
6371 PositionalToken {
6372 source: uws,
6373 offset: 161,
6374 length: 3,
6375 token: Token::Word(Word::Word("た".to_string())),
6376 },
6377 PositionalToken {
6378 source: uws,
6379 offset: 164,
6380 length: 3,
6381 token: Token::Word(Word::Word("組".to_string())),
6382 },
6383 PositionalToken {
6384 source: uws,
6385 offset: 167,
6386 length: 3,
6387 token: Token::Word(Word::Word("織".to_string())),
6388 },
6389 PositionalToken {
6390 source: uws,
6391 offset: 170,
6392 length: 3,
6393 token: Token::Word(Word::Word("の".to_string())),
6394 },
6395 PositionalToken {
6396 source: uws,
6397 offset: 173,
6398 length: 3,
6399 token: Token::Word(Word::Word("総".to_string())),
6400 },
6401 PositionalToken {
6402 source: uws,
6403 offset: 176,
6404 length: 3,
6405 token: Token::Word(Word::Word("称".to_string())),
6406 },
6407 PositionalToken {
6408 source: uws,
6409 offset: 179,
6410 length: 3,
6411 token: Token::Special(Special::Punctuation('。')),
6412 },
6413 PositionalToken {
6414 source: uws,
6415 offset: 182,
6416 length: 1,
6417 token: Token::Special(Special::Separator(Separator::Space)),
6418 },
6419 PositionalToken {
6420 source: uws,
6421 offset: 183,
6422 length: 3,
6423 token: Token::Word(Word::Word("熊".to_string())),
6424 },
6425 PositionalToken {
6426 source: uws,
6427 offset: 186,
6428 length: 3,
6429 token: Token::Word(Word::Word("野".to_string())),
6430 },
6431 PositionalToken {
6432 source: uws,
6433 offset: 189,
6434 length: 3,
6435 token: Token::Word(Word::Word("三".to_string())),
6436 },
6437 PositionalToken {
6438 source: uws,
6439 offset: 192,
6440 length: 3,
6441 token: Token::Word(Word::Word("山".to_string())),
6442 },
6443 PositionalToken {
6444 source: uws,
6445 offset: 195,
6446 length: 3,
6447 token: Token::Word(Word::Word("を".to_string())),
6448 },
6449 PositionalToken {
6450 source: uws,
6451 offset: 198,
6452 length: 3,
6453 token: Token::Word(Word::Word("含".to_string())),
6454 },
6455 PositionalToken {
6456 source: uws,
6457 offset: 201,
6458 length: 3,
6459 token: Token::Word(Word::Word("め".to_string())),
6460 },
6461 PositionalToken {
6462 source: uws,
6463 offset: 204,
6464 length: 3,
6465 token: Token::Word(Word::Word("て".to_string())),
6466 },
6467 PositionalToken {
6468 source: uws,
6469 offset: 207,
6470 length: 3,
6471 token: Token::Special(Special::Punctuation('、')),
6472 },
6473 PositionalToken {
6474 source: uws,
6475 offset: 210,
6476 length: 3,
6477 token: Token::Word(Word::Word("日".to_string())),
6478 },
6479 PositionalToken {
6480 source: uws,
6481 offset: 213,
6482 length: 3,
6483 token: Token::Word(Word::Word("本".to_string())),
6484 },
6485 PositionalToken {
6486 source: uws,
6487 offset: 216,
6488 length: 3,
6489 token: Token::Word(Word::Word("に".to_string())),
6490 },
6491 PositionalToken {
6492 source: uws,
6493 offset: 219,
6494 length: 3,
6495 token: Token::Word(Word::Word("お".to_string())),
6496 },
6497 PositionalToken {
6498 source: uws,
6499 offset: 222,
6500 length: 3,
6501 token: Token::Word(Word::Word("け".to_string())),
6502 },
6503 PositionalToken {
6504 source: uws,
6505 offset: 225,
6506 length: 3,
6507 token: Token::Word(Word::Word("る".to_string())),
6508 },
6509 PositionalToken {
6510 source: uws,
6511 offset: 228,
6512 length: 3,
6513 token: Token::Word(Word::Word("古".to_string())),
6514 },
6515 PositionalToken {
6516 source: uws,
6517 offset: 231,
6518 length: 3,
6519 token: Token::Word(Word::Word("代".to_string())),
6520 },
6521 PositionalToken {
6522 source: uws,
6523 offset: 234,
6524 length: 3,
6525 token: Token::Word(Word::Word("か".to_string())),
6526 },
6527 PositionalToken {
6528 source: uws,
6529 offset: 237,
6530 length: 3,
6531 token: Token::Word(Word::Word("ら".to_string())),
6532 },
6533 PositionalToken {
6534 source: uws,
6535 offset: 240,
6536 length: 3,
6537 token: Token::Word(Word::Word("中".to_string())),
6538 },
6539 PositionalToken {
6540 source: uws,
6541 offset: 243,
6542 length: 3,
6543 token: Token::Word(Word::Word("世".to_string())),
6544 },
6545 PositionalToken {
6546 source: uws,
6547 offset: 246,
6548 length: 3,
6549 token: Token::Word(Word::Word("前".to_string())),
6550 },
6551 PositionalToken {
6552 source: uws,
6553 offset: 249,
6554 length: 3,
6555 token: Token::Word(Word::Word("半".to_string())),
6556 },
6557 PositionalToken {
6558 source: uws,
6559 offset: 252,
6560 length: 3,
6561 token: Token::Word(Word::Word("に".to_string())),
6562 },
6563 PositionalToken {
6564 source: uws,
6565 offset: 255,
6566 length: 3,
6567 token: Token::Word(Word::Word("か".to_string())),
6568 },
6569 PositionalToken {
6570 source: uws,
6571 offset: 258,
6572 length: 3,
6573 token: Token::Word(Word::Word("け".to_string())),
6574 },
6575 PositionalToken {
6576 source: uws,
6577 offset: 261,
6578 length: 3,
6579 token: Token::Word(Word::Word("て".to_string())),
6580 },
6581 PositionalToken {
6582 source: uws,
6583 offset: 264,
6584 length: 3,
6585 token: Token::Word(Word::Word("の".to_string())),
6586 },
6587 PositionalToken {
6588 source: uws,
6589 offset: 267,
6590 length: 3,
6591 token: Token::Word(Word::Word("寺".to_string())),
6592 },
6593 PositionalToken {
6594 source: uws,
6595 offset: 270,
6596 length: 3,
6597 token: Token::Word(Word::Word("社".to_string())),
6598 },
6599 PositionalToken {
6600 source: uws,
6601 offset: 273,
6602 length: 3,
6603 token: Token::Word(Word::Word("の".to_string())),
6604 },
6605 PositionalToken {
6606 source: uws,
6607 offset: 276,
6608 length: 3,
6609 token: Token::Word(Word::Word("造".to_string())),
6610 },
6611 PositionalToken {
6612 source: uws,
6613 offset: 279,
6614 length: 3,
6615 token: Token::Word(Word::Word("営".to_string())),
6616 },
6617 PositionalToken {
6618 source: uws,
6619 offset: 282,
6620 length: 3,
6621 token: Token::Word(Word::Word("は".to_string())),
6622 },
6623 PositionalToken {
6624 source: uws,
6625 offset: 285,
6626 length: 3,
6627 token: Token::Special(Special::Punctuation('、')),
6628 },
6629 PositionalToken {
6630 source: uws,
6631 offset: 288,
6632 length: 3,
6633 token: Token::Word(Word::Word("寺".to_string())),
6634 },
6635 PositionalToken {
6636 source: uws,
6637 offset: 291,
6638 length: 3,
6639 token: Token::Word(Word::Word("社".to_string())),
6640 },
6641 ],
6642 Lang::Kor => vec![
6643 PositionalToken {
6644 source: uws,
6645 offset: 0,
6646 length: 21,
6647 token: Token::Word(Word::Word("플레이스테이션".to_string())),
6648 },
6649 PositionalToken {
6650 source: uws,
6651 offset: 21,
6652 length: 1,
6653 token: Token::Special(Special::Separator(Separator::Space)),
6654 },
6655 PositionalToken {
6656 source: uws,
6657 offset: 22,
6658 length: 3,
6659 token: Token::Word(Word::Word("은".to_string())),
6660 },
6661 PositionalToken {
6662 source: uws,
6663 offset: 25,
6664 length: 1,
6665 token: Token::Special(Special::Separator(Separator::Space)),
6666 },
6667 PositionalToken {
6668 source: uws,
6669 offset: 26,
6670 length: 6,
6671 token: Token::Word(Word::Word("소니".to_string())),
6672 },
6673 PositionalToken {
6674 source: uws,
6675 offset: 32,
6676 length: 1,
6677 token: Token::Special(Special::Separator(Separator::Space)),
6678 },
6679 PositionalToken {
6680 source: uws,
6681 offset: 33,
6682 length: 9,
6683 token: Token::Word(Word::Word("컴퓨터".to_string())),
6684 },
6685 PositionalToken {
6686 source: uws,
6687 offset: 42,
6688 length: 1,
6689 token: Token::Special(Special::Separator(Separator::Space)),
6690 },
6691 PositionalToken {
6692 source: uws,
6693 offset: 43,
6694 length: 21,
6695 token: Token::Word(Word::Word("엔터테인먼트가".to_string())),
6696 },
6697 PositionalToken {
6698 source: uws,
6699 offset: 64,
6700 length: 1,
6701 token: Token::Special(Special::Separator(Separator::Space)),
6702 },
6703 PositionalToken {
6704 source: uws,
6705 offset: 65,
6706 length: 9,
6707 token: Token::Word(Word::Word("개발한".to_string())),
6708 },
6709 PositionalToken {
6710 source: uws,
6711 offset: 74,
6712 length: 1,
6713 token: Token::Special(Special::Separator(Separator::Space)),
6714 },
6715 PositionalToken {
6716 source: uws,
6717 offset: 75,
6718 length: 3,
6719 token: Token::Word(Word::Word("세".to_string())),
6720 },
6721 PositionalToken {
6722 source: uws,
6723 offset: 78,
6724 length: 1,
6725 token: Token::Special(Special::Separator(Separator::Space)),
6726 },
6727 PositionalToken {
6728 source: uws,
6729 offset: 79,
6730 length: 6,
6731 token: Token::Word(Word::Word("번째".to_string())),
6732 },
6733 PositionalToken {
6734 source: uws,
6735 offset: 85,
6736 length: 1,
6737 token: Token::Special(Special::Separator(Separator::Space)),
6738 },
6739 PositionalToken {
6740 source: uws,
6741 offset: 86,
6742 length: 9,
6743 token: Token::Word(Word::Word("가정용".to_string())),
6744 },
6745 PositionalToken {
6746 source: uws,
6747 offset: 95,
6748 length: 1,
6749 token: Token::Special(Special::Separator(Separator::Space)),
6750 },
6751 PositionalToken {
6752 source: uws,
6753 offset: 96,
6754 length: 15,
6755 token: Token::Word(Word::Word("게임기이다".to_string())),
6756 },
6757 PositionalToken {
6758 source: uws,
6759 offset: 111,
6760 length: 1,
6761 token: Token::Special(Special::Punctuation('.')),
6762 },
6763 PositionalToken {
6764 source: uws,
6765 offset: 112,
6766 length: 1,
6767 token: Token::Special(Special::Separator(Separator::Space)),
6768 },
6769 PositionalToken {
6770 source: uws,
6771 offset: 113,
6772 length: 24,
6773 token: Token::Word(Word::Word("마이크로소프트의".to_string())),
6774 },
6775 PositionalToken {
6776 source: uws,
6777 offset: 137,
6778 length: 1,
6779 token: Token::Special(Special::Separator(Separator::Space)),
6780 },
6781 PositionalToken {
6782 source: uws,
6783 offset: 138,
6784 length: 12,
6785 token: Token::Word(Word::Word("엑스박스".to_string())),
6786 },
6787 PositionalToken {
6788 source: uws,
6789 offset: 150,
6790 length: 1,
6791 token: Token::Special(Special::Separator(Separator::Space)),
6792 },
6793 PositionalToken {
6794 source: uws,
6795 offset: 151,
6796 length: 3,
6797 token: Token::Word(Word::Number(Number::Integer(360))),
6798 },
6799 PositionalToken {
6800 source: uws,
6801 offset: 154,
6802 length: 1,
6803 token: Token::Special(Special::Punctuation(',')),
6804 },
6805 PositionalToken {
6806 source: uws,
6807 offset: 155,
6808 length: 1,
6809 token: Token::Special(Special::Separator(Separator::Space)),
6810 },
6811 PositionalToken {
6812 source: uws,
6813 offset: 156,
6814 length: 12,
6815 token: Token::Word(Word::Word("닌텐도의".to_string())),
6816 },
6817 PositionalToken {
6818 source: uws,
6819 offset: 168,
6820 length: 1,
6821 token: Token::Special(Special::Separator(Separator::Space)),
6822 },
6823 PositionalToken {
6824 source: uws,
6825 offset: 169,
6826 length: 6,
6827 token: Token::Word(Word::Word("Wii와".to_string())),
6828 },
6829 PositionalToken {
6830 source: uws,
6831 offset: 175,
6832 length: 1,
6833 token: Token::Special(Special::Separator(Separator::Space)),
6834 },
6835 PositionalToken {
6836 source: uws,
6837 offset: 176,
6838 length: 12,
6839 token: Token::Word(Word::Word("경쟁하고".to_string())),
6840 },
6841 PositionalToken {
6842 source: uws,
6843 offset: 188,
6844 length: 1,
6845 token: Token::Special(Special::Separator(Separator::Space)),
6846 },
6847 PositionalToken {
6848 source: uws,
6849 offset: 189,
6850 length: 6,
6851 token: Token::Word(Word::Word("있다".to_string())),
6852 },
6853 PositionalToken {
6854 source: uws,
6855 offset: 195,
6856 length: 1,
6857 token: Token::Special(Special::Punctuation('.')),
6858 },
6859 PositionalToken {
6860 source: uws,
6861 offset: 196,
6862 length: 1,
6863 token: Token::Special(Special::Separator(Separator::Space)),
6864 },
6865 PositionalToken {
6866 source: uws,
6867 offset: 197,
6868 length: 6,
6869 token: Token::Word(Word::Word("이전".to_string())),
6870 },
6871 PositionalToken {
6872 source: uws,
6873 offset: 203,
6874 length: 1,
6875 token: Token::Special(Special::Separator(Separator::Space)),
6876 },
6877 PositionalToken {
6878 source: uws,
6879 offset: 204,
6880 length: 12,
6881 token: Token::Word(Word::Word("제품에서".to_string())),
6882 },
6883 PositionalToken {
6884 source: uws,
6885 offset: 216,
6886 length: 1,
6887 token: Token::Special(Special::Separator(Separator::Space)),
6888 },
6889 PositionalToken {
6890 source: uws,
6891 offset: 217,
6892 length: 9,
6893 token: Token::Word(Word::Word("온라인".to_string())),
6894 },
6895 PositionalToken {
6896 source: uws,
6897 offset: 226,
6898 length: 1,
6899 token: Token::Special(Special::Separator(Separator::Space)),
6900 },
6901 PositionalToken {
6902 source: uws,
6903 offset: 227,
6904 length: 9,
6905 token: Token::Word(Word::Word("플레이".to_string())),
6906 },
6907 PositionalToken {
6908 source: uws,
6909 offset: 236,
6910 length: 1,
6911 token: Token::Special(Special::Separator(Separator::Space)),
6912 },
6913 PositionalToken {
6914 source: uws,
6915 offset: 237,
6916 length: 3,
6917 token: Token::Word(Word::Word("기".to_string())),
6918 },
6919 ],
6920 Lang::Ara => vec![
6921 PositionalToken {
6922 source: uws,
6923 offset: 0,
6924 length: 14,
6925 token: Token::Word(Word::Word("لشکرکشی".to_string())),
6926 },
6927 PositionalToken {
6928 source: uws,
6929 offset: 14,
6930 length: 3,
6931 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6932 },
6933 PositionalToken {
6934 source: uws,
6935 offset: 17,
6936 length: 6,
6937 token: Token::Word(Word::Word("های".to_string())),
6938 },
6939 PositionalToken {
6940 source: uws,
6941 offset: 23,
6942 length: 1,
6943 token: Token::Special(Special::Separator(Separator::Space)),
6944 },
6945 PositionalToken {
6946 source: uws,
6947 offset: 24,
6948 length: 6,
6949 token: Token::Word(Word::Word("روس".to_string())),
6950 },
6951 PositionalToken {
6952 source: uws,
6953 offset: 30,
6954 length: 3,
6955 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
6956 },
6957 PositionalToken {
6958 source: uws,
6959 offset: 33,
6960 length: 6,
6961 token: Token::Word(Word::Word("های".to_string())),
6962 },
6963 PositionalToken {
6964 source: uws,
6965 offset: 39,
6966 length: 1,
6967 token: Token::Special(Special::Separator(Separator::Space)),
6968 },
6969 PositionalToken {
6970 source: uws,
6971 offset: 40,
6972 length: 12,
6973 token: Token::Word(Word::Word("وارنگی".to_string())),
6974 },
6975 PositionalToken {
6976 source: uws,
6977 offset: 52,
6978 length: 1,
6979 token: Token::Special(Special::Separator(Separator::Space)),
6980 },
6981 PositionalToken {
6982 source: uws,
6983 offset: 53,
6984 length: 4,
6985 token: Token::Word(Word::Word("به".to_string())),
6986 },
6987 PositionalToken {
6988 source: uws,
6989 offset: 57,
6990 length: 1,
6991 token: Token::Special(Special::Separator(Separator::Space)),
6992 },
6993 PositionalToken {
6994 source: uws,
6995 offset: 58,
6996 length: 10,
6997 token: Token::Word(Word::Word("دریای".to_string())),
6998 },
6999 PositionalToken {
7000 source: uws,
7001 offset: 68,
7002 length: 1,
7003 token: Token::Special(Special::Separator(Separator::Space)),
7004 },
7005 PositionalToken {
7006 source: uws,
7007 offset: 69,
7008 length: 6,
7009 token: Token::Word(Word::Word("خزر".to_string())),
7010 },
7011 PositionalToken {
7012 source: uws,
7013 offset: 75,
7014 length: 1,
7015 token: Token::Special(Special::Separator(Separator::Space)),
7016 },
7017 PositionalToken {
7018 source: uws,
7019 offset: 76,
7020 length: 12,
7021 token: Token::Word(Word::Word("مجموعه".to_string())),
7022 },
7023 PositionalToken {
7024 source: uws,
7025 offset: 88,
7026 length: 3,
7027 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
7028 },
7029 PositionalToken {
7030 source: uws,
7031 offset: 91,
7032 length: 4,
7033 token: Token::Word(Word::Word("ای".to_string())),
7034 },
7035 PositionalToken {
7036 source: uws,
7037 offset: 95,
7038 length: 1,
7039 token: Token::Special(Special::Separator(Separator::Space)),
7040 },
7041 PositionalToken {
7042 source: uws,
7043 offset: 96,
7044 length: 4,
7045 token: Token::Word(Word::Word("از".to_string())),
7046 },
7047 PositionalToken {
7048 source: uws,
7049 offset: 100,
7050 length: 1,
7051 token: Token::Special(Special::Separator(Separator::Space)),
7052 },
7053 PositionalToken {
7054 source: uws,
7055 offset: 101,
7056 length: 10,
7057 token: Token::Word(Word::Word("حملات".to_string())),
7058 },
7059 PositionalToken {
7060 source: uws,
7061 offset: 111,
7062 length: 1,
7063 token: Token::Special(Special::Separator(Separator::Space)),
7064 },
7065 PositionalToken {
7066 source: uws,
7067 offset: 112,
7068 length: 10,
7069 token: Token::Word(Word::Word("نظامی".to_string())),
7070 },
7071 PositionalToken {
7072 source: uws,
7073 offset: 122,
7074 length: 1,
7075 token: Token::Special(Special::Separator(Separator::Space)),
7076 },
7077 PositionalToken {
7078 source: uws,
7079 offset: 123,
7080 length: 4,
7081 token: Token::Word(Word::Word("در".to_string())),
7082 },
7083 PositionalToken {
7084 source: uws,
7085 offset: 127,
7086 length: 1,
7087 token: Token::Special(Special::Separator(Separator::Space)),
7088 },
7089 PositionalToken {
7090 source: uws,
7091 offset: 128,
7092 length: 6,
7093 token: Token::Word(Word::Word("بین".to_string())),
7094 },
7095 PositionalToken {
7096 source: uws,
7097 offset: 134,
7098 length: 1,
7099 token: Token::Special(Special::Separator(Separator::Space)),
7100 },
7101 PositionalToken {
7102 source: uws,
7103 offset: 135,
7104 length: 6,
7105 token: Token::Word(Word::Word("سال".to_string())),
7106 },
7107 PositionalToken {
7108 source: uws,
7109 offset: 141,
7110 length: 3,
7111 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
7112 },
7113 PositionalToken {
7114 source: uws,
7115 offset: 144,
7116 length: 6,
7117 token: Token::Word(Word::Word("های".to_string())),
7118 },
7119 PositionalToken {
7120 source: uws,
7121 offset: 150,
7122 length: 1,
7123 token: Token::Special(Special::Separator(Separator::Space)),
7124 },
7125 PositionalToken {
7126 source: uws,
7127 offset: 151,
7128 length: 6,
7129 token: Token::Word(Word::StrangeWord("۸۶۴".to_string())),
7130 },
7131 PositionalToken {
7132 source: uws,
7133 offset: 157,
7134 length: 1,
7135 token: Token::Special(Special::Separator(Separator::Space)),
7136 },
7137 PositionalToken {
7138 source: uws,
7139 offset: 158,
7140 length: 4,
7141 token: Token::Word(Word::Word("تا".to_string())),
7142 },
7143 PositionalToken {
7144 source: uws,
7145 offset: 162,
7146 length: 1,
7147 token: Token::Special(Special::Separator(Separator::Space)),
7148 },
7149 PositionalToken {
7150 source: uws,
7151 offset: 163,
7152 length: 8,
7153 token: Token::Word(Word::StrangeWord("۱۰۴۱".to_string())),
7154 },
7155 PositionalToken {
7156 source: uws,
7157 offset: 171,
7158 length: 1,
7159 token: Token::Special(Special::Separator(Separator::Space)),
7160 },
7161 PositionalToken {
7162 source: uws,
7163 offset: 172,
7164 length: 12,
7165 token: Token::Word(Word::Word("میلادی".to_string())),
7166 },
7167 PositionalToken {
7168 source: uws,
7169 offset: 184,
7170 length: 1,
7171 token: Token::Special(Special::Separator(Separator::Space)),
7172 },
7173 PositionalToken {
7174 source: uws,
7175 offset: 185,
7176 length: 2,
7177 token: Token::Word(Word::Word("ب".to_string())),
7178 },
7179 ],
7180 Lang::Ell => vec![
7181 PositionalToken {
7182 source: uws,
7183 offset: 0,
7184 length: 4,
7185 token: Token::Word(Word::Word("Το".to_string())),
7186 },
7187 PositionalToken {
7188 source: uws,
7189 offset: 4,
7190 length: 1,
7191 token: Token::Special(Special::Separator(Separator::Space)),
7192 },
7193 PositionalToken {
7194 source: uws,
7195 offset: 5,
7196 length: 18,
7197 token: Token::Word(Word::Word("Πρόγραμμα".to_string())),
7198 },
7199 PositionalToken {
7200 source: uws,
7201 offset: 23,
7202 length: 1,
7203 token: Token::Special(Special::Separator(Separator::Space)),
7204 },
7205 PositionalToken {
7206 source: uws,
7207 offset: 24,
7208 length: 22,
7209 token: Token::Word(Word::Word("υλοποιείται".to_string())),
7210 },
7211 PositionalToken {
7212 source: uws,
7213 offset: 46,
7214 length: 1,
7215 token: Token::Special(Special::Separator(Separator::Space)),
7216 },
7217 PositionalToken {
7218 source: uws,
7219 offset: 47,
7220 length: 4,
7221 token: Token::Word(Word::Word("εξ".to_string())),
7222 },
7223 PositionalToken {
7224 source: uws,
7225 offset: 51,
7226 length: 1,
7227 token: Token::Special(Special::Separator(Separator::Space)),
7228 },
7229 PositionalToken {
7230 source: uws,
7231 offset: 52,
7232 length: 18,
7233 token: Token::Word(Word::Word("ολοκλήρου".to_string())),
7234 },
7235 PositionalToken {
7236 source: uws,
7237 offset: 70,
7238 length: 1,
7239 token: Token::Special(Special::Separator(Separator::Space)),
7240 },
7241 PositionalToken {
7242 source: uws,
7243 offset: 71,
7244 length: 6,
7245 token: Token::Word(Word::Word("από".to_string())),
7246 },
7247 PositionalToken {
7248 source: uws,
7249 offset: 77,
7250 length: 1,
7251 token: Token::Special(Special::Separator(Separator::Space)),
7252 },
7253 PositionalToken {
7254 source: uws,
7255 offset: 78,
7256 length: 16,
7257 token: Token::Word(Word::Word("απόσταση".to_string())),
7258 },
7259 PositionalToken {
7260 source: uws,
7261 offset: 94,
7262 length: 1,
7263 token: Token::Special(Special::Separator(Separator::Space)),
7264 },
7265 PositionalToken {
7266 source: uws,
7267 offset: 95,
7268 length: 6,
7269 token: Token::Word(Word::Word("και".to_string())),
7270 },
7271 PositionalToken {
7272 source: uws,
7273 offset: 101,
7274 length: 1,
7275 token: Token::Special(Special::Separator(Separator::Space)),
7276 },
7277 PositionalToken {
7278 source: uws,
7279 offset: 102,
7280 length: 12,
7281 token: Token::Word(Word::Word("μπορεί".to_string())),
7282 },
7283 PositionalToken {
7284 source: uws,
7285 offset: 114,
7286 length: 1,
7287 token: Token::Special(Special::Separator(Separator::Space)),
7288 },
7289 PositionalToken {
7290 source: uws,
7291 offset: 115,
7292 length: 4,
7293 token: Token::Word(Word::Word("να".to_string())),
7294 },
7295 PositionalToken {
7296 source: uws,
7297 offset: 119,
7298 length: 1,
7299 token: Token::Special(Special::Separator(Separator::Space)),
7300 },
7301 PositionalToken {
7302 source: uws,
7303 offset: 120,
7304 length: 20,
7305 token: Token::Word(Word::Word("συμμετέχει".to_string())),
7306 },
7307 PositionalToken {
7308 source: uws,
7309 offset: 140,
7310 length: 1,
7311 token: Token::Special(Special::Separator(Separator::Space)),
7312 },
7313 PositionalToken {
7314 source: uws,
7315 offset: 141,
7316 length: 8,
7317 token: Token::Word(Word::Word("κάθε".to_string())),
7318 },
7319 PositionalToken {
7320 source: uws,
7321 offset: 149,
7322 length: 1,
7323 token: Token::Special(Special::Separator(Separator::Space)),
7324 },
7325 PositionalToken {
7326 source: uws,
7327 offset: 150,
7328 length: 24,
7329 token: Token::Word(Word::Word("εμπλεκόμενος".to_string())),
7330 },
7331 PositionalToken {
7332 source: uws,
7333 offset: 174,
7334 length: 1,
7335 token: Token::Special(Special::Separator(Separator::Space)),
7336 },
7337 PositionalToken {
7338 source: uws,
7339 offset: 175,
7340 length: 6,
7341 token: Token::Word(Word::Word("στη".to_string())),
7342 },
7343 PositionalToken {
7344 source: uws,
7345 offset: 181,
7346 length: 1,
7347 token: Token::Special(Special::Separator(Separator::Space)),
7348 },
7349 PositionalToken {
7350 source: uws,
7351 offset: 182,
7352 length: 2,
7353 token: Token::Word(Word::Word("ή".to_string())),
7354 },
7355 PositionalToken {
7356 source: uws,
7357 offset: 184,
7358 length: 1,
7359 token: Token::Special(Special::Punctuation('/')),
7360 },
7361 ],
7362 };
7363 (
7364 uws.chars()
7365 .take(100)
7366 .fold(String::new(), |acc, c| acc + &format!("{}", c)),
7367 tokens,
7368 )
7369 }
7370}