1use std::{fmt, sync::Arc};
2use text_parsing::{Breaker, IntoSource, Local, Localize, Snip, Source, SourceEvent};
3
4mod emoji;
5pub use emoji::EMOJIMAP;
6
7mod breakers;
8pub use breakers::{SentenceBreaker, UnicodeSentenceBreaker};
9
10mod numbers;
11mod wordbreaker;
12
13mod options;
14pub use options::{IntoTokenizer, TokenizerOptions, TokenizerParams};
15
16mod tokens;
17pub use tokens::Tokens;
18
19mod text_tokens;
20use text_tokens::InnerBound;
21pub use text_tokens::TextTokens;
22
23#[cfg(test)]
24mod test {
25 mod numbers_ru_en;
26}
27
28#[derive(Debug)]
29pub enum Error {
30 TextParser(text_parsing::Error),
31}
32
33pub const EPS: f64 = 1e-8;
34
35#[cfg(feature = "strings")]
36#[derive(Debug, Clone, PartialEq, PartialOrd)]
37pub enum Number {
38 Integer(i64),
39 Float(f64),
40 ZeroInteger { i: i64, s: String },
42}
43
44#[cfg(not(feature = "strings"))]
45#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
46pub enum Number {
47 Integer(i64),
48 Float(f64),
49 ZeroInteger { i: i64 },
50}
51
52impl Number {
53 pub fn as_f64(&self) -> f64 {
54 match self {
55 Number::Integer(i) => *i as f64,
56 Number::Float(f) => *f,
57 Number::ZeroInteger { i, .. } => *i as f64,
58 }
59 }
60}
61impl Ord for Number {
62 fn cmp(&self, other: &Number) -> std::cmp::Ordering {
63 let s = self.as_f64();
64 let o = other.as_f64();
65 let d = s - o;
66 match d.abs() < EPS {
67 true => std::cmp::Ordering::Equal,
68 false => {
69 if d > 0.0 {
70 return std::cmp::Ordering::Greater;
71 }
72 if d < 0.0 {
73 return std::cmp::Ordering::Less;
74 }
75 std::cmp::Ordering::Equal
76 }
77 }
78 }
79}
80impl Eq for Number {}
81
82#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
83pub enum Separator {
84 Space,
85 Tab,
86 Newline,
87 Char(char),
88}
89
90#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
91pub enum Formatter {
92 Char(char),
93 Joiner, }
95
96#[derive(Debug, Clone, Copy, PartialEq, Eq, Ord, PartialOrd)]
97pub enum Special {
98 Currency(char),
99 Punctuation(char),
100 Symbol(char),
101 Separator(Separator),
102}
103
104#[cfg(feature = "strings")]
105#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
106pub enum Word {
107 Word(String),
108 StrangeWord(String),
109 Numerical(Numerical),
110 Number(Number),
111 Emoji(&'static str),
112}
113
114#[cfg(feature = "strings")]
115#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
116pub enum Numerical {
117 DotSeparated(String),
121 Measures(String),
122 Alphanumeric(String),
123}
124
125#[cfg(feature = "strings")]
126#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
127pub enum Struct {
128 Hashtag(String),
129 Mention(String),
130 }
132
133#[cfg(feature = "strings")]
134#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
135pub enum Unicode {
136 String(String),
137 Formatter(Formatter),
138}
139
140#[cfg(not(feature = "strings"))]
141#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
142pub enum Word {
143 Word,
144 StrangeWord,
145 Numerical(Numerical),
146 Number(Number),
147 Emoji(&'static str),
148}
149
150#[cfg(not(feature = "strings"))]
151#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
152pub enum Numerical {
153 DotSeparated,
157 Measures,
158 Alphanumeric,
159}
160
161#[cfg(not(feature = "strings"))]
162#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
163pub enum Struct {
164 Hashtag,
165 Mention,
166 }
168
169#[cfg(not(feature = "strings"))]
170#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
171pub enum Unicode {
172 String,
173 Formatter(Formatter),
174}
175
176#[cfg(feature = "strings")]
177#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
178pub enum Token {
179 Word(Word),
180 Struct(Struct),
181 Special(Special),
182 Unicode(Unicode),
183}
184
185#[cfg(not(feature = "strings"))]
186#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
187pub enum Token {
188 Word(Word),
189 Struct(Struct),
190 Special(Special),
191 Unicode(Unicode),
192}
193
194#[derive(Debug)]
208pub struct TextStr<'s> {
209 buffer: &'s str,
210 localities: Arc<Vec<TextLocality>>,
211 breakers: Arc<Vec<InnerBound>>,
212}
213impl<'s> TextStr<'s> {
214 pub fn new<'a>(s: &'a str) -> Result<TextStr<'a>, Error> {
215 let text = inner_new(s.into_source(), false)?;
216 Ok(TextStr {
217 buffer: s,
218 localities: text.localities,
219 breakers: text.breakers,
220 })
221 }
222}
223
224fn inner_new<S: Source>(mut source: S, with_buffer: bool) -> Result<Text, Error> {
225 let mut buffer = String::new();
226 let mut localities = Vec::new();
227 let mut breakers = Vec::new();
228 let mut buffer_len = 0;
229
230 while let Some(local_se) = source.next_char().map_err(Error::TextParser)? {
231 let (local, se) = local_se.into_inner();
232 let c = match se {
233 SourceEvent::Char(c) => match c {
234 '\u{0060}' => '\u{0027}',
235 _ => c,
236 },
237 SourceEvent::Breaker(b) => {
238 let (c, opt_b) = match b {
239 Breaker::None => continue,
240 Breaker::Space => (' ', None),
241 Breaker::Line => ('\n', None),
242 Breaker::Word => ('\u{200B}', Some(b)), Breaker::Sentence | Breaker::Paragraph | Breaker::Section => ('\n', Some(b)),
244 };
245 if let Some(b) = opt_b {
246 let br = InnerBound {
247 bytes: Snip {
248 offset: buffer_len,
249 length: c.len_utf8(),
250 },
251 chars: Snip {
252 offset: localities.len(),
253 length: 1,
254 },
255 breaker: b,
256 original: Some(local),
257 };
258 breakers.push(br);
260 }
261 c
262 }
263 };
264
265 let buf_local = ().localize(
266 Snip {
267 offset: localities.len(),
269 length: 1,
270 },
271 Snip {
272 offset: buffer_len,
274 length: c.len_utf8(),
275 },
276 );
277 if with_buffer {
278 buffer.push(c);
279 }
280 buffer_len += c.len_utf8();
281 localities.push(TextLocality {
282 buffer: buf_local,
283 original: local,
284 });
285 }
286 Ok(Text {
287 buffer: Arc::new(buffer),
288 localities: Arc::new(localities),
289 breakers: Arc::new(breakers),
290 })
291}
292
293#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
294pub struct TextLocality {
295 pub buffer: Local<()>,
296 pub original: Local<()>,
297}
298
299#[derive(Debug)]
300pub struct Text {
301 buffer: Arc<String>,
302 localities: Arc<Vec<TextLocality>>,
303 breakers: Arc<Vec<InnerBound>>,
304}
305impl Text {
306 pub fn new<S: Source>(source: S) -> Result<Text, Error> {
307 inner_new(source, true)
308 }
309 pub fn token_text<'s>(&'s self, token: &TextToken) -> &'s str {
310 let Snip {
311 offset: begin,
312 length: len,
313 } = token.locality.bytes();
314 let end = begin + len;
315 &self.buffer[begin..end]
316 }
317 pub fn text(&self) -> &str {
318 self.buffer.as_ref()
319 }
320 pub fn original_locality(&self, idx: usize) -> Option<Local<()>> {
321 self.localities.get(idx).map(|tl| tl.original)
322 }
323 pub fn localities(&self) -> &Vec<TextLocality> {
324 self.localities.as_ref()
325 }
326 pub fn shared_text(&self) -> Text {
327 Text {
328 buffer: self.buffer.clone(),
329 localities: self.localities.clone(),
330 breakers: self.breakers.clone(),
331 }
332 }
333}
334
335impl TryFrom<String> for Text {
336 type Error = Error;
337
338 fn try_from(s: String) -> Result<Text, Error> {
339 let mut text = inner_new((&s).into_source(), false)?;
340 text.buffer = Arc::new(s);
341 Ok(text)
342 }
343}
344
345impl TryFrom<&str> for Text {
346 type Error = Error;
347
348 fn try_from(s: &str) -> Result<Text, Error> {
349 Text::new(s.into_source())
350 }
351}
352
353#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
354pub enum Bound {
355 Sentence,
356 Paragraph,
357 Section,
358}
359
360#[cfg(feature = "strings")]
361#[derive(Clone, PartialEq, PartialOrd, Eq, Ord)]
362pub struct TextToken {
363 locality: Local<()>,
364 original: Option<Local<()>>,
365 pub token: Token2,
366}
367
368#[cfg(not(feature = "strings"))]
369#[derive(Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
370pub struct TextToken {
371 locality: Local<()>,
372 original: Option<Local<()>>,
373 pub token: Token2,
374}
375
376impl fmt::Debug for TextToken {
377 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
378 write!(
379 f,
380 "TextToken {{ local: {:?} [{:?}] }}, ",
381 self.locality.bytes(),
382 self.locality.chars()
383 )?;
384 match &self.original {
385 Some(orig) => write!(f, "orig: {:?} [{:?}], ", orig.bytes(), orig.chars())?,
386 None => {}
387 }
388 write!(f, "token: {:?} }}", self.token)
389 }
390}
391
392#[cfg(test)]
393impl TextToken {
394 fn into_original_token_1(self) -> Option<Local<Token>> {
395 match self.original {
396 Some(original) => self.token.into_token().map(|t| original.local(t)),
397 None => None,
398 }
399 }
400}
401
402impl TextToken {
403 pub fn local(&self) -> Local<()> {
404 self.locality
405 }
406 pub fn original(&self) -> Option<Local<()>> {
407 self.original
408 }
409 pub fn into_position(mut self) -> TextToken {
410 self.locality = self.locality.into_position();
411 self.original = self.original.map(|or| or.into_position());
412 self
413 }
414 pub fn try_as_token(&self) -> Result<Token, Bound> {
415 self.token.try_as_token()
416 }
417 pub fn as_original_token(&self) -> Option<Local<&Token2>> {
418 self.original.map(|original| original.local(&self.token))
419 }
420 pub fn into_original_token(self) -> Option<Local<Token2>> {
421 self.original.map(|original| original.local(self.token))
422 }
423 pub fn original_str<'s>(&self, original: &'s str) -> Result<&'s str, OriginalError> {
424 match self.original {
425 Some(local) => {
426 let Snip {
427 offset: begin,
428 length: len,
429 } = local.bytes();
430 let end = begin + len;
431 match original.get(begin..end) {
432 Some(s) => Ok(s),
433 None => Err(OriginalError::InvalidSnip),
434 }
435 }
436 None => Err(OriginalError::NoOriginal),
437 }
438 }
439
440 #[cfg(feature = "strings")]
441 fn token_clone(&self) -> Token2 {
442 self.token.clone()
443 }
444
445 #[cfg(not(feature = "strings"))]
446 fn token_clone(&self) -> Token2 {
447 self.token
448 }
449
450 pub fn merge_tokens(
451 &self,
452 other: &TextToken,
453 new_token: Option<Token2>,
454 ) -> Result<TextToken, TextToken> {
455 let (local, left_lb, left_lc) = add_local(&self.locality, &other.locality);
456 let must_be_left = left_lb;
457 let mut ok = must_be_left == left_lc;
458 let orig = match (&self.original, &other.original) {
459 (None, None) => None,
460 (Some(o), None) | (None, Some(o)) => Some(*o),
461 (Some(s), Some(o)) => {
462 let (orig, lb, lc) = add_local(s, o);
463 ok &= must_be_left == lb;
464 ok &= must_be_left == lc;
465 Some(orig)
466 }
467 };
468 let token = TextToken {
469 locality: local,
470 original: orig,
471 token: match new_token {
472 Some(t) => t,
473 None => self.token_clone(),
474 },
475 };
476 match ok {
477 true => Ok(token),
478 false => Err(token),
479 }
480 }
481}
482
483fn add_local(slf: &Local<()>, other: &Local<()>) -> (Local<()>, bool, bool) {
484 let b1 = slf.bytes();
486 let b2 = other.bytes();
487 let c1 = slf.chars();
488 let c2 = other.chars();
489 let (bytes, slf_is_left_by_bytes) = match b1.offset < b2.offset {
490 true => (
491 Snip {
492 offset: b1.offset,
493 length: (b2.offset + b2.length) - b1.offset,
494 },
495 true,
496 ),
497 false => (
498 Snip {
499 offset: b2.offset,
500 length: (b1.offset + b1.length) - b2.offset,
501 },
502 false,
503 ),
504 };
505 let (chars, slf_is_left_by_chars) = match c1.offset < c2.offset {
506 true => (
507 Snip {
508 offset: c1.offset,
509 length: (c2.offset + c2.length) - c1.offset,
510 },
511 true,
512 ),
513 false => (
514 Snip {
515 offset: c2.offset,
516 length: (c1.offset + c1.length) - c2.offset,
517 },
518 false,
519 ),
520 };
521 (
522 ().localize(chars, bytes),
523 slf_is_left_by_bytes,
524 slf_is_left_by_chars,
525 )
526}
527
528impl TextToken {
529 pub fn test_token(lt: Local<Token2>) -> TextToken {
530 let (local, token) = lt.into_inner();
531 TextToken {
532 locality: local,
533 original: Some(local.local(())),
534 token,
535 }
536 }
537 pub fn test_new(token: Token2, local: Local<()>, original: Option<Local<()>>) -> TextToken {
538 TextToken {
539 locality: local,
540 original,
541 token,
542 }
543 }
544}
545
546#[derive(Debug)]
573pub enum OriginalError {
574 NoOriginal,
575 InvalidSnip,
576}
577
578#[cfg(feature = "strings")]
586#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
587pub enum Token2 {
588 Word(Word),
589 Struct(Struct),
590 Special(Special),
591 Unicode(Unicode),
592
593 Bound(Bound),
594}
595#[cfg(not(feature = "strings"))]
596#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
597pub enum Token2 {
598 Word(Word),
599 Struct(Struct),
600 Special(Special),
601 Unicode(Unicode),
602
603 Bound(Bound),
604}
605impl From<Token> for Token2 {
606 fn from(t: Token) -> Token2 {
607 match t {
608 Token::Word(w) => Token2::Word(w),
609 Token::Struct(s) => Token2::Struct(s),
610 Token::Special(s) => Token2::Special(s),
611 Token::Unicode(u) => Token2::Unicode(u),
612 }
613 }
614}
615impl Token2 {
616 #[cfg(not(feature = "strings"))]
617 fn try_as_token(&self) -> Result<Token, Bound> {
618 (*self).try_into_token()
619 }
620
621 #[cfg(feature = "strings")]
622 fn try_as_token(&self) -> Result<Token, Bound> {
623 self.clone().try_into_token()
624 }
625
626 fn try_into_token(self) -> Result<Token, Bound> {
627 match self {
628 Token2::Word(w) => Ok(Token::Word(w)),
629 Token2::Struct(s) => Ok(Token::Struct(s)),
630 Token2::Special(s) => Ok(Token::Special(s)),
631 Token2::Unicode(u) => Ok(Token::Unicode(u)),
632 Token2::Bound(b) => Err(b),
633 }
634 }
635}
636#[cfg(test)]
637impl Token2 {
638 fn into_token(self) -> Option<Token> {
639 match self {
640 Token2::Word(w) => Some(Token::Word(w)),
641 Token2::Struct(s) => Some(Token::Struct(s)),
642 Token2::Special(s) => Some(Token::Special(s)),
643 Token2::Unicode(u) => Some(Token::Unicode(u)),
644 Token2::Bound(_) => None,
645 }
646 }
647}
648
649#[cfg(test)]
650#[cfg(not(feature = "strings"))]
651mod test_no_strings {
652 use super::*;
653 use text_parsing::{
654 IntoPipeParser, IntoSource, Localize, ParserExt, SourceExt, entities, tagger,
655 };
656
657 fn check_results(result: &Vec<Local<Token>>, lib_res: &Vec<Local<Token>>, _uws: &str) {
658 assert_eq!(result.len(), lib_res.len());
659 for i in 0..result.len() {
660 let res: Local<Token> = result[i].clone().into();
661 assert_eq!(res, lib_res[i]);
662 }
663 }
664
665 fn symbols() {
667 let uws = "Сибирь Арене 17 30 от 2560₽ 😀";
668 let lib_res = uws
671 .into_tokenizer(TokenizerParams::v1())
672 .collect::<Vec<_>>();
673 for t in lib_res {
675 println!("{:?}", t);
676 }
677 panic!()
678 }
679}
680
681#[cfg(test)]
682mod test_v0_5 {
683 use super::*;
684 use text_parsing::{IntoPipeParser, IntoSource, ParserExt, SourceExt, entities, tagger};
685
686 fn basic() {
688 let uws = "<p>Oxana Putan shared the quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc.</p><p> qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово</p>";
703 let text = Text::new({
704 uws.into_source()
705 .pipe(tagger::Builder::new().create().into_breaker())
706 .pipe(entities::Builder::new().create().into_piped())
707 .into_separator()
708 })
709 .unwrap();
710 let lib_res = text
711 .into_tokenizer({
712 TokenizerParams::default()
713 .add_option(TokenizerOptions::SplitDot)
714 .add_option(TokenizerOptions::SplitUnderscore)
715 .add_option(TokenizerOptions::SplitColon)
716 .with_default_sentences()
717 })
718 .collect::<Vec<_>>();
719
720 for tok in lib_res {
721 println!(
722 "C{:?}, B{:?}, {:?} -> {:?}",
723 tok.original.map(|loc| loc.chars()),
724 tok.original.map(|loc| loc.bytes()),
725 tok.token,
726 tok.original_str(uws)
727 );
728 }
729
730 panic!()
731 }
732}
733
734#[cfg(test)]
735#[cfg(feature = "strings")]
736mod test_strings {
737 use super::*;
738 use text_parsing::{
739 IntoPipeParser, IntoSource, Localize, ParserExt, SourceExt, entities, tagger,
740 };
741
742 #[allow(dead_code)]
761 fn print_result(lib_res: &Vec<Local<Token>>) {
762 for lt in lib_res {
767 println!("{:?}", lt);
768 }
769 }
770 #[derive(Debug, Clone)]
799 struct CharToken {
800 byte_offset: usize,
801 byte_length: usize,
802 char_offset: usize,
803 char_length: usize,
804 token: Token,
805 }
806 impl Into<Local<Token>> for CharToken {
807 fn into(self) -> Local<Token> {
808 self.token.localize(
809 Snip {
810 offset: self.char_offset,
811 length: self.char_length,
812 },
813 Snip {
814 offset: self.byte_offset,
815 length: self.byte_length,
816 },
817 )
818 }
819 }
820
821 #[derive(Debug, Clone)]
822 struct PositionalToken {
823 source: &'static str,
824 offset: usize,
825 length: usize,
826 token: Token,
827 }
828 impl Into<Local<Token>> for PositionalToken {
829 fn into(self) -> Local<Token> {
830 self.token.localize(
831 Snip {
832 offset: self.source[..self.offset].chars().count(),
833 length: self.source[self.offset..self.offset + self.length]
834 .chars()
835 .count(),
836 },
837 Snip {
838 offset: self.offset,
839 length: self.length,
840 },
841 )
842 }
843 }
844
845 fn check_results(result: &Vec<PositionalToken>, lib_res: &Vec<Local<Token>>, _uws: &str) {
846 assert_eq!(result.len(), lib_res.len());
847 for i in 0..result.len() {
848 let res: Local<Token> = result[i].clone().into();
849 assert_eq!(res, lib_res[i]);
850 }
851 }
852
853 fn check_cresults(result: &Vec<CharToken>, lib_res: &Vec<Local<Token>>, _uws: &str) {
854 assert_eq!(result.len(), lib_res.len());
855 for i in 0..result.len() {
856 let res: Local<Token> = result[i].clone().into();
857 assert_eq!(res, lib_res[i]);
858 }
859 }
860
861 fn check<T: Clone + std::fmt::Debug + Into<Local<Token>>>(
862 res: &Vec<T>,
863 lib: &Vec<Local<Token>>,
864 _uws: &str,
865 ) {
866 let mut lib = lib.iter();
867 let mut res = res.iter().map(|r| {
868 let res: Local<Token> = r.clone().into();
869 res
870 });
871 let mut diff = Vec::new();
872 loop {
873 match (lib.next(), res.next()) {
874 (Some(lw), Some(rw)) => {
875 if *lw != rw {
876 diff.push(format!("LIB: {:?}", lw));
877 diff.push(format!("TEST: {:?}", rw));
878 diff.push("".to_string())
879 }
880 }
881 (Some(lw), None) => {
882 diff.push(format!("LIB: {:?}", lw));
883 diff.push("TEST: ----".to_string());
884 diff.push("".to_string())
885 }
886 (None, Some(rw)) => {
887 diff.push("LIB: ----".to_string());
888 diff.push(format!("TEST: {:?}", rw));
889 diff.push("".to_string())
890 }
891 (None, None) => break,
892 }
893 }
894 if diff.len() > 0 {
895 for ln in &diff {
896 println!("{}", ln);
897 }
898 panic!("Diff count: {}", diff.len() / 3);
899 }
900 }
901
902 #[test]
903 #[rustfmt::skip]
904 fn custom_numbers() {
905 let uws = "115,7 123,398,398 2,123.45 0,05%";
906 let result = vec![
907 PositionalToken { source: uws, offset: 0, length: 5, token: Token::Word(Word::Number(Number::Float(115.7))) },
908 PositionalToken { source: uws, offset: 5, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
909 PositionalToken { source: uws, offset: 6, length: 11, token: Token::Word(Word::Number(Number::Integer(123398398))) },
910 PositionalToken { source: uws, offset: 17, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
911 PositionalToken { source: uws, offset: 18, length: 8, token: Token::Word(Word::Number(Number::Float(2123.45))) },
912 PositionalToken { source: uws, offset: 26, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
913 PositionalToken { source: uws, offset: 27, length: 4, token: Token::Word(Word::Number(Number::Float(0.05))) },
914 PositionalToken { source: uws, offset: 31, length: 1, token: Token::Special(Special::Punctuation('%')) },
915 ];
916 let lib_res = uws
917 .into_tokenizer(TokenizerParams::v1())
918 .collect::<Vec<_>>();
919 check_results(&result, &lib_res, uws);
921 }
922
923 #[test]
924 #[rustfmt::skip]
925 fn custom_numbers_ftoi() {
926 let uws = "1.1 10.0000";
927 let result = vec![
928 PositionalToken { source: uws, offset: 0, length: 3, token: Token::Word(Word::Number(Number::Float(1.1))) },
929 PositionalToken { source: uws, offset: 3, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
930 PositionalToken { source: uws, offset: 4, length: 7, token: Token::Word(Word::Number(Number::Integer(10))) },
931 ];
932 let lib_res = uws
933 .into_tokenizer(TokenizerParams::v1())
934 .collect::<Vec<_>>();
935 check_results(&result, &lib_res, uws);
937 }
938
939 #[test]
940 #[rustfmt::skip]
941 fn custom_numbers_en_1() {
942 let uws = "1.1 10,000";
943 let result = vec![
944 PositionalToken { source: uws, offset: 0, length: 3, token: Token::Word(Word::Number(Number::Float(1.1))) },
945 PositionalToken { source: uws, offset: 3, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
946 PositionalToken { source: uws, offset: 4, length: 6, token: Token::Word(Word::Number(Number::Integer(10000))) },
947 ];
948 let lib_res = uws
949 .into_tokenizer(TokenizerParams::v1())
950 .collect::<Vec<_>>();
951 check_results(&result, &lib_res, uws);
953 }
954
955 #[test]
956 #[rustfmt::skip]
957 fn custom_numbers_en_2() {
958 let uws = "1,000.1 10,000";
959 let result = vec![
960 PositionalToken { source: uws, offset: 0, length: 7, token: Token::Word(Word::Number(Number::Float(1000.1))) },
961 PositionalToken { source: uws, offset: 7, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
962 PositionalToken { source: uws, offset: 8, length: 6, token: Token::Word(Word::Number(Number::Integer(10000))) },
963 ];
964 let lib_res = uws
965 .into_tokenizer(TokenizerParams::v1())
966 .collect::<Vec<_>>();
967 check_results(&result, &lib_res, uws);
969 }
970
971 #[test]
972 #[rustfmt::skip]
973 fn custom_numbers_ru_1() {
974 let uws = "1.1 10,001";
975 let result = vec![
976 PositionalToken { source: uws, offset: 0, length: 3, token: Token::Word(Word::Number(Number::Float(1.1))) },
977 PositionalToken { source: uws, offset: 3, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
978 PositionalToken { source: uws, offset: 4, length: 6, token: Token::Word(Word::Number(Number::Integer(10001))) }, ];
980 let lib_res = uws
981 .into_tokenizer(TokenizerParams::v1().add_option(TokenizerOptions::NumberUnknownComaAsDot))
982 .collect::<Vec<_>>();
983 check_results(&result, &lib_res, uws);
985 }
986
987 #[test]
988 #[rustfmt::skip]
989 fn custom_numbers_ru_2() {
990 let uws = "1,1 10,001";
991 let result = vec![
992 PositionalToken { source: uws, offset: 0, length: 3, token: Token::Word(Word::Number(Number::Float(1.1))) },
993 PositionalToken { source: uws, offset: 3, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
994 PositionalToken { source: uws, offset: 4, length: 6, token: Token::Word(Word::Number(Number::Integer(10001))) }, ];
996 let lib_res = uws
997 .into_tokenizer(TokenizerParams::v1())
998 .collect::<Vec<_>>();
999 check_results(&result, &lib_res, uws);
1001 }
1002
1003 #[test]
1004 #[rustfmt::skip]
1005 fn custom_numbers_ru_3() {
1006 let uws = "10000,1 10,001";
1007 let result = vec![
1008 PositionalToken { source: uws, offset: 0, length: 7, token: Token::Word(Word::Number(Number::Float(10000.1))) },
1009 PositionalToken { source: uws, offset: 7, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
1010 PositionalToken { source: uws, offset: 8, length: 6, token: Token::Word(Word::Number(Number::Integer(10001))) }, ];
1012 let lib_res = uws
1013 .into_tokenizer(TokenizerParams::v1())
1014 .collect::<Vec<_>>();
1015 check_results(&result, &lib_res, uws);
1017 }
1018
1019 #[test]
1020 #[rustfmt::skip]
1021 fn currency() {
1022 let uws = "$ ₽ € ¥";
1023 let result = vec![
1024 PositionalToken { source: uws, offset: 0, length: 1, token: Token::Special(Special::Currency('$')) },
1025 PositionalToken { source: uws, offset: 1, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
1026 PositionalToken { source: uws, offset: 2, length: 3, token: Token::Special(Special::Currency('₽')) },
1027 PositionalToken { source: uws, offset: 5, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
1028 PositionalToken { source: uws, offset: 6, length: 3, token: Token::Special(Special::Currency('€')) },
1029 PositionalToken { source: uws, offset: 9, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
1030 PositionalToken { source: uws, offset: 10, length: 2, token: Token::Special(Special::Currency('¥')) },
1031 ];
1032 let lib_res = uws
1033 .into_tokenizer(TokenizerParams::v1())
1034 .collect::<Vec<_>>();
1035 check_results(&result, &lib_res, uws);
1037 }
1039
1040 #[test]
1041 fn spaces() {
1042 let uws = " spaces too many apces ";
1043 let result = vec![
1044 PositionalToken {
1045 source: uws,
1046 offset: 0,
1047 length: 4,
1048 token: Token::Special(Special::Separator(Separator::Space)),
1049 },
1050 PositionalToken {
1051 source: uws,
1052 offset: 4,
1053 length: 6,
1054 token: Token::Word(Word::Word("spaces".to_string())),
1055 },
1056 PositionalToken {
1057 source: uws,
1058 offset: 10,
1059 length: 4,
1060 token: Token::Special(Special::Separator(Separator::Space)),
1061 },
1062 PositionalToken {
1063 source: uws,
1064 offset: 14,
1065 length: 3,
1066 token: Token::Word(Word::Word("too".to_string())),
1067 },
1068 PositionalToken {
1069 source: uws,
1070 offset: 17,
1071 length: 3,
1072 token: Token::Special(Special::Separator(Separator::Space)),
1073 },
1074 PositionalToken {
1075 source: uws,
1076 offset: 20,
1077 length: 4,
1078 token: Token::Word(Word::Word("many".to_string())),
1079 },
1080 PositionalToken {
1081 source: uws,
1082 offset: 24,
1083 length: 3,
1084 token: Token::Special(Special::Separator(Separator::Space)),
1085 },
1086 PositionalToken {
1087 source: uws,
1088 offset: 27,
1089 length: 5,
1090 token: Token::Word(Word::Word("apces".to_string())),
1091 },
1092 PositionalToken {
1093 source: uws,
1094 offset: 32,
1095 length: 3,
1096 token: Token::Special(Special::Separator(Separator::Space)),
1097 },
1098 ];
1099 let lib_res = uws
1100 .into_tokenizer(TokenizerParams::v1())
1101 .collect::<Vec<_>>();
1102 check_results(&result, &lib_res, uws);
1103 }
1105
1106 #[test]
1107 fn numbers() {
1108 let uws = "(() -2\n() -2";
1109 let result = vec![
1110 PositionalToken {
1111 source: uws,
1112 offset: 0,
1113 length: 1,
1114 token: Token::Special(Special::Punctuation('(')),
1115 },
1116 PositionalToken {
1117 source: uws,
1118 offset: 1,
1119 length: 1,
1120 token: Token::Special(Special::Punctuation('(')),
1121 },
1122 PositionalToken {
1123 source: uws,
1124 offset: 2,
1125 length: 1,
1126 token: Token::Special(Special::Punctuation(')')),
1127 },
1128 PositionalToken {
1129 source: uws,
1130 offset: 3,
1131 length: 1,
1132 token: Token::Special(Special::Separator(Separator::Space)),
1133 },
1134 PositionalToken {
1135 source: uws,
1136 offset: 4,
1137 length: 2,
1138 token: Token::Word(Word::Number(Number::Integer(-2))),
1139 },
1140 PositionalToken {
1141 source: uws,
1142 offset: 6,
1143 length: 1,
1144 token: Token::Special(Special::Separator(Separator::Newline)),
1145 },
1146 PositionalToken {
1147 source: uws,
1148 offset: 7,
1149 length: 1,
1150 token: Token::Special(Special::Punctuation('(')),
1151 },
1152 PositionalToken {
1153 source: uws,
1154 offset: 8,
1155 length: 1,
1156 token: Token::Special(Special::Punctuation(')')),
1157 },
1158 PositionalToken {
1159 source: uws,
1160 offset: 9,
1161 length: 2,
1162 token: Token::Special(Special::Separator(Separator::Space)),
1163 },
1164 PositionalToken {
1165 source: uws,
1166 offset: 11,
1167 length: 2,
1168 token: Token::Word(Word::Number(Number::Integer(-2))),
1169 },
1170 ];
1171 let lib_res = uws
1172 .into_tokenizer({
1173 TokenizerParams::default()
1174 .add_option(TokenizerOptions::SplitDot)
1175 .add_option(TokenizerOptions::SplitUnderscore)
1176 .add_option(TokenizerOptions::SplitColon)
1177 .add_option(TokenizerOptions::MergeWhites)
1178 })
1179 .collect::<Vec<_>>();
1180 check_results(&result, &lib_res, uws);
1181 }
1182
1183 #[test]
1184 fn word_with_inner_hyphens() {
1185 let uws = "Опросы показывают";
1186 let result = vec![
1187 PositionalToken {
1188 source: uws,
1189 offset: 0,
1190 length: 14,
1191 token: Token::Word(Word::StrangeWord("Опросы".to_string())),
1192 },
1193 PositionalToken {
1194 source: uws,
1195 offset: 14,
1196 length: 1,
1197 token: Token::Special(Special::Separator(Separator::Space)),
1198 },
1199 PositionalToken {
1200 source: uws,
1201 offset: 15,
1202 length: 28,
1203 token: Token::Word(Word::StrangeWord("показывают".to_string())),
1204 },
1205 ];
1206 let lib_res = uws
1207 .into_tokenizer(TokenizerParams::v1())
1208 .collect::<Vec<_>>();
1209 check_results(&result, &lib_res, uws);
1210 }
1211
1212 #[test]
1213 fn mixed_but_word() {
1214 let uws = "L’Oreal";
1215 let result = vec![PositionalToken {
1216 source: uws,
1217 offset: 0,
1218 length: 9,
1219 token: Token::Word(Word::StrangeWord("L’Oreal".to_string())),
1220 }];
1221 let lib_res = uws
1222 .into_tokenizer(TokenizerParams::v1())
1223 .collect::<Vec<_>>();
1224 check_results(&result, &lib_res, uws);
1225 }
1226
1227 #[test]
1228 fn hashtags() {
1229 let uws = "#hashtag#hashtag2";
1230 let result = vec![
1231 PositionalToken {
1232 source: uws,
1233 offset: 0,
1234 length: 8,
1235 token: Token::Struct(Struct::Hashtag("hashtag".to_string())),
1236 },
1237 PositionalToken {
1238 source: uws,
1239 offset: 8,
1240 length: 9,
1241 token: Token::Struct(Struct::Hashtag("hashtag2".to_string())),
1242 },
1243 ];
1270 let lib_res = uws
1271 .into_tokenizer(TokenizerParams::v1())
1272 .collect::<Vec<_>>();
1273 check_results(&result, &lib_res, uws);
1274 }
1275
1276 #[test]
1277 fn hashtags2() {
1278 let uws = "#hashtag#hashtag2 #hash_tag";
1279 let result = vec![
1280 PositionalToken {
1281 source: uws,
1282 offset: 0,
1283 length: 8,
1284 token: Token::Struct(Struct::Hashtag("hashtag".to_string())),
1285 },
1286 PositionalToken {
1287 source: uws,
1288 offset: 8,
1289 length: 9,
1290 token: Token::Struct(Struct::Hashtag("hashtag2".to_string())),
1291 },
1292 PositionalToken {
1293 source: uws,
1294 offset: 17,
1295 length: 1,
1296 token: Token::Special(Special::Separator(Separator::Space)),
1297 },
1298 PositionalToken {
1299 source: uws,
1300 offset: 18,
1301 length: 9,
1302 token: Token::Struct(Struct::Hashtag("hash_tag".to_string())),
1303 },
1304 ];
1305 let lib_res = uws
1306 .into_tokenizer(TokenizerParams::v1().add_option(TokenizerOptions::StructTokens))
1307 .collect::<Vec<_>>();
1308 check_results(&result, &lib_res, uws);
1309 }
1310
1311 #[test]
1312 fn mention2() {
1313 let uws = "@hashtag@hashtag2 @hash_tag";
1314 let result = vec![
1315 PositionalToken {
1316 source: uws,
1317 offset: 0,
1318 length: 8,
1319 token: Token::Struct(Struct::Mention("hashtag".to_string())),
1320 },
1321 PositionalToken {
1322 source: uws,
1323 offset: 8,
1324 length: 9,
1325 token: Token::Struct(Struct::Mention("hashtag2".to_string())),
1326 },
1327 PositionalToken {
1328 source: uws,
1329 offset: 17,
1330 length: 1,
1331 token: Token::Special(Special::Separator(Separator::Space)),
1332 },
1333 PositionalToken {
1334 source: uws,
1335 offset: 18,
1336 length: 9,
1337 token: Token::Struct(Struct::Mention("hash_tag".to_string())),
1338 },
1339 ];
1340 let lib_res = uws
1341 .into_tokenizer(TokenizerParams::v1().add_option(TokenizerOptions::StructTokens))
1342 .collect::<Vec<_>>();
1343 check_results(&result, &lib_res, uws);
1344 }
1345
1346 #[test]
1347 fn apostrophe() {
1348 let uws = "l'oreal; l\u{0060}oreal";
1349 let result = vec![
1350 PositionalToken {
1351 source: uws,
1352 offset: 0,
1353 length: 7,
1354 token: Token::Word(Word::Word("l'oreal".to_string())),
1355 },
1356 PositionalToken {
1357 source: uws,
1358 offset: 7,
1359 length: 1,
1360 token: Token::Special(Special::Punctuation(';')),
1361 },
1362 PositionalToken {
1363 source: uws,
1364 offset: 8,
1365 length: 1,
1366 token: Token::Special(Special::Separator(Separator::Space)),
1367 },
1368 PositionalToken {
1369 source: uws,
1370 offset: 9,
1371 length: 7,
1372 token: Token::Word(Word::Word("l'oreal".to_string())),
1373 },
1374 ];
1375 let text = Text::new(uws.into_source()).unwrap();
1376 let lib_res = text
1377 .into_tokenizer(TokenizerParams::v1())
1378 .filter_map(|tt| tt.into_original_token_1())
1379 .collect::<Vec<_>>();
1380 check_results(&result, &lib_res, uws);
1381 }
1382
1383 #[test]
1384 fn char_tokens() {
1385 let uws = "[Oxana Putan|1712640565] shared the quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово";
1386 let result = vec![
1387 CharToken {
1388 byte_offset: 0,
1389 byte_length: 1,
1390 char_offset: 0,
1391 char_length: 1,
1392 token: Token::Special(Special::Punctuation('[')),
1393 },
1394 CharToken {
1395 byte_offset: 1,
1396 byte_length: 5,
1397 char_offset: 1,
1398 char_length: 5,
1399 token: Token::Word(Word::Word("Oxana".to_string())),
1400 },
1401 CharToken {
1402 byte_offset: 6,
1403 byte_length: 1,
1404 char_offset: 6,
1405 char_length: 1,
1406 token: Token::Special(Special::Separator(Separator::Space)),
1407 },
1408 CharToken {
1409 byte_offset: 7,
1410 byte_length: 5,
1411 char_offset: 7,
1412 char_length: 5,
1413 token: Token::Word(Word::Word("Putan".to_string())),
1414 },
1415 CharToken {
1416 byte_offset: 12,
1417 byte_length: 1,
1418 char_offset: 12,
1419 char_length: 1,
1420 token: Token::Special(Special::Punctuation('|')),
1421 },
1422 CharToken {
1423 byte_offset: 13,
1424 byte_length: 10,
1425 char_offset: 13,
1426 char_length: 10,
1427 token: Token::Word(Word::Number(Number::Integer(1712640565))),
1428 },
1429 CharToken {
1430 byte_offset: 23,
1431 byte_length: 1,
1432 char_offset: 23,
1433 char_length: 1,
1434 token: Token::Special(Special::Punctuation(']')),
1435 },
1436 CharToken {
1444 byte_offset: 24,
1445 byte_length: 1,
1446 char_offset: 24,
1447 char_length: 1,
1448 token: Token::Special(Special::Separator(Separator::Space)),
1449 },
1450 CharToken {
1451 byte_offset: 25,
1452 byte_length: 6,
1453 char_offset: 25,
1454 char_length: 6,
1455 token: Token::Word(Word::Word("shared".to_string())),
1456 },
1457 CharToken {
1458 byte_offset: 31,
1459 byte_length: 1,
1460 char_offset: 31,
1461 char_length: 1,
1462 token: Token::Special(Special::Separator(Separator::Space)),
1463 },
1464 CharToken {
1465 byte_offset: 32,
1466 byte_length: 3,
1467 char_offset: 32,
1468 char_length: 3,
1469 token: Token::Word(Word::Word("the".to_string())),
1470 },
1471 CharToken {
1472 byte_offset: 35,
1473 byte_length: 1,
1474 char_offset: 35,
1475 char_length: 1,
1476 token: Token::Special(Special::Separator(Separator::Space)),
1477 },
1478 CharToken {
1479 byte_offset: 36,
1480 byte_length: 5,
1481 char_offset: 36,
1482 char_length: 5,
1483 token: Token::Word(Word::Word("quick".to_string())),
1484 },
1485 CharToken {
1486 byte_offset: 41,
1487 byte_length: 1,
1488 char_offset: 41,
1489 char_length: 1,
1490 token: Token::Special(Special::Separator(Separator::Space)),
1491 },
1492 CharToken {
1493 byte_offset: 42,
1494 byte_length: 1,
1495 char_offset: 42,
1496 char_length: 1,
1497 token: Token::Special(Special::Punctuation('(')),
1498 },
1499 CharToken {
1500 byte_offset: 43,
1501 byte_length: 1,
1502 char_offset: 43,
1503 char_length: 1,
1504 token: Token::Special(Special::Punctuation('"')),
1505 },
1506 CharToken {
1507 byte_offset: 44,
1508 byte_length: 5,
1509 char_offset: 44,
1510 char_length: 5,
1511 token: Token::Word(Word::Word("brown".to_string())),
1512 },
1513 CharToken {
1514 byte_offset: 49,
1515 byte_length: 1,
1516 char_offset: 49,
1517 char_length: 1,
1518 token: Token::Special(Special::Punctuation('"')),
1519 },
1520 CharToken {
1521 byte_offset: 50,
1522 byte_length: 1,
1523 char_offset: 50,
1524 char_length: 1,
1525 token: Token::Special(Special::Punctuation(')')),
1526 },
1527 CharToken {
1528 byte_offset: 51,
1529 byte_length: 1,
1530 char_offset: 51,
1531 char_length: 1,
1532 token: Token::Special(Special::Separator(Separator::Space)),
1533 },
1534 CharToken {
1535 byte_offset: 52,
1536 byte_length: 3,
1537 char_offset: 52,
1538 char_length: 3,
1539 token: Token::Word(Word::Word("fox".to_string())),
1540 },
1541 CharToken {
1542 byte_offset: 55,
1543 byte_length: 1,
1544 char_offset: 55,
1545 char_length: 1,
1546 token: Token::Special(Special::Separator(Separator::Space)),
1547 },
1548 CharToken {
1549 byte_offset: 56,
1550 byte_length: 5,
1551 char_offset: 56,
1552 char_length: 5,
1553 token: Token::Word(Word::Word("can\'t".to_string())),
1554 },
1555 CharToken {
1556 byte_offset: 61,
1557 byte_length: 1,
1558 char_offset: 61,
1559 char_length: 1,
1560 token: Token::Special(Special::Separator(Separator::Space)),
1561 },
1562 CharToken {
1563 byte_offset: 62,
1564 byte_length: 4,
1565 char_offset: 62,
1566 char_length: 4,
1567 token: Token::Word(Word::Word("jump".to_string())),
1568 },
1569 CharToken {
1570 byte_offset: 66,
1571 byte_length: 1,
1572 char_offset: 66,
1573 char_length: 1,
1574 token: Token::Special(Special::Separator(Separator::Space)),
1575 },
1576 CharToken {
1577 byte_offset: 67,
1578 byte_length: 4,
1579 char_offset: 67,
1580 char_length: 4,
1581 token: Token::Word(Word::Number(Number::Float(32.3))),
1582 },
1583 CharToken {
1584 byte_offset: 71,
1585 byte_length: 1,
1586 char_offset: 71,
1587 char_length: 1,
1588 token: Token::Special(Special::Separator(Separator::Space)),
1589 },
1590 CharToken {
1591 byte_offset: 72,
1592 byte_length: 4,
1593 char_offset: 72,
1594 char_length: 4,
1595 token: Token::Word(Word::Word("feet".to_string())),
1596 },
1597 CharToken {
1598 byte_offset: 76,
1599 byte_length: 1,
1600 char_offset: 76,
1601 char_length: 1,
1602 token: Token::Special(Special::Punctuation(',')),
1603 },
1604 CharToken {
1605 byte_offset: 77,
1606 byte_length: 1,
1607 char_offset: 77,
1608 char_length: 1,
1609 token: Token::Special(Special::Separator(Separator::Space)),
1610 },
1611 CharToken {
1612 byte_offset: 78,
1613 byte_length: 5,
1614 char_offset: 78,
1615 char_length: 5,
1616 token: Token::Word(Word::Word("right".to_string())),
1617 },
1618 CharToken {
1619 byte_offset: 83,
1620 byte_length: 1,
1621 char_offset: 83,
1622 char_length: 1,
1623 token: Token::Special(Special::Punctuation('?')),
1624 },
1625 CharToken {
1626 byte_offset: 84,
1627 byte_length: 1,
1628 char_offset: 84,
1629 char_length: 1,
1630 token: Token::Special(Special::Separator(Separator::Space)),
1631 },
1632 CharToken {
1633 byte_offset: 85,
1634 byte_length: 4,
1635 char_offset: 85,
1636 char_length: 4,
1637 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
1638 },
1639 CharToken {
1640 byte_offset: 89,
1641 byte_length: 1,
1642 char_offset: 89,
1643 char_length: 1,
1644 token: Token::Special(Special::Separator(Separator::Space)),
1645 },
1646 CharToken {
1647 byte_offset: 90,
1648 byte_length: 3,
1649 char_offset: 90,
1650 char_length: 3,
1651 token: Token::Word(Word::Word("etc".to_string())),
1652 },
1653 CharToken {
1654 byte_offset: 93,
1655 byte_length: 1,
1656 char_offset: 93,
1657 char_length: 1,
1658 token: Token::Special(Special::Punctuation('.')),
1659 },
1660 CharToken {
1661 byte_offset: 94,
1662 byte_length: 1,
1663 char_offset: 94,
1664 char_length: 1,
1665 token: Token::Special(Special::Separator(Separator::Space)),
1666 },
1667 CharToken {
1668 byte_offset: 95,
1669 byte_length: 3,
1670 char_offset: 95,
1671 char_length: 3,
1672 token: Token::Word(Word::Word("qeq".to_string())),
1673 },
1674 CharToken {
1675 byte_offset: 98,
1676 byte_length: 1,
1677 char_offset: 98,
1678 char_length: 1,
1679 token: Token::Special(Special::Separator(Separator::Space)),
1680 },
1681 CharToken {
1682 byte_offset: 99,
1683 byte_length: 5,
1684 char_offset: 99,
1685 char_length: 5,
1686 token: Token::Word(Word::Word("U.S.A".to_string())),
1687 },
1688 CharToken {
1689 byte_offset: 104,
1690 byte_length: 2,
1691 char_offset: 104,
1692 char_length: 2,
1693 token: Token::Special(Special::Separator(Separator::Space)),
1694 },
1695 CharToken {
1696 byte_offset: 106,
1697 byte_length: 3,
1698 char_offset: 106,
1699 char_length: 3,
1700 token: Token::Word(Word::Word("asd".to_string())),
1701 },
1702 CharToken {
1703 byte_offset: 109,
1704 byte_length: 3,
1705 char_offset: 109,
1706 char_length: 3,
1707 token: Token::Special(Special::Separator(Separator::Newline)),
1708 },
1709 CharToken {
1710 byte_offset: 112,
1711 byte_length: 3,
1712 char_offset: 112,
1713 char_length: 3,
1714 token: Token::Word(Word::Word("Brr".to_string())),
1715 },
1716 CharToken {
1717 byte_offset: 115,
1718 byte_length: 1,
1719 char_offset: 115,
1720 char_length: 1,
1721 token: Token::Special(Special::Punctuation(',')),
1722 },
1723 CharToken {
1724 byte_offset: 116,
1725 byte_length: 1,
1726 char_offset: 116,
1727 char_length: 1,
1728 token: Token::Special(Special::Separator(Separator::Space)),
1729 },
1730 CharToken {
1731 byte_offset: 117,
1732 byte_length: 4,
1733 char_offset: 117,
1734 char_length: 4,
1735 token: Token::Word(Word::Word("it\'s".to_string())),
1736 },
1737 CharToken {
1738 byte_offset: 121,
1739 byte_length: 1,
1740 char_offset: 121,
1741 char_length: 1,
1742 token: Token::Special(Special::Separator(Separator::Space)),
1743 },
1744 CharToken {
1745 byte_offset: 122,
1746 byte_length: 4,
1747 char_offset: 122,
1748 char_length: 4,
1749 token: Token::Word(Word::Number(Number::Float(29.3))),
1750 },
1751 CharToken {
1752 byte_offset: 126,
1753 byte_length: 2,
1754 char_offset: 126,
1755 char_length: 1,
1756 token: Token::Special(Special::Symbol('°')),
1757 },
1758 CharToken {
1759 byte_offset: 128,
1760 byte_length: 1,
1761 char_offset: 127,
1762 char_length: 1,
1763 token: Token::Word(Word::Word("F".to_string())),
1764 },
1765 CharToken {
1766 byte_offset: 129,
1767 byte_length: 1,
1768 char_offset: 128,
1769 char_length: 1,
1770 token: Token::Special(Special::Punctuation('!')),
1771 },
1772 CharToken {
1773 byte_offset: 130,
1774 byte_length: 1,
1775 char_offset: 129,
1776 char_length: 1,
1777 token: Token::Special(Special::Separator(Separator::Newline)),
1778 },
1779 CharToken {
1780 byte_offset: 131,
1781 byte_length: 1,
1782 char_offset: 130,
1783 char_length: 1,
1784 token: Token::Special(Special::Separator(Separator::Space)),
1785 },
1786 CharToken {
1787 byte_offset: 132,
1788 byte_length: 14,
1789 char_offset: 131,
1790 char_length: 7,
1791 token: Token::Word(Word::Word("Русское".to_string())),
1792 },
1793 CharToken {
1794 byte_offset: 146,
1795 byte_length: 1,
1796 char_offset: 138,
1797 char_length: 1,
1798 token: Token::Special(Special::Separator(Separator::Space)),
1799 },
1800 CharToken {
1801 byte_offset: 147,
1802 byte_length: 22,
1803 char_offset: 139,
1804 char_length: 11,
1805 token: Token::Word(Word::Word("предложение".to_string())),
1806 },
1807 CharToken {
1808 byte_offset: 169,
1809 byte_length: 1,
1810 char_offset: 150,
1811 char_length: 1,
1812 token: Token::Special(Special::Separator(Separator::Space)),
1813 },
1814 CharToken {
1815 byte_offset: 170,
1816 byte_length: 5,
1817 char_offset: 151,
1818 char_length: 5,
1819 token: Token::Struct(Struct::Hashtag("36.6".to_string())),
1820 },
1821 CharToken {
1822 byte_offset: 175,
1823 byte_length: 1,
1824 char_offset: 156,
1825 char_length: 1,
1826 token: Token::Special(Special::Separator(Separator::Space)),
1827 },
1828 CharToken {
1829 byte_offset: 176,
1830 byte_length: 6,
1831 char_offset: 157,
1832 char_length: 3,
1833 token: Token::Word(Word::Word("для".to_string())),
1834 },
1835 CharToken {
1836 byte_offset: 182,
1837 byte_length: 1,
1838 char_offset: 160,
1839 char_length: 1,
1840 token: Token::Special(Special::Separator(Separator::Space)),
1841 },
1842 CharToken {
1843 byte_offset: 183,
1844 byte_length: 24,
1845 char_offset: 161,
1846 char_length: 12,
1847 token: Token::Word(Word::Word("тестирования".to_string())),
1848 },
1849 CharToken {
1850 byte_offset: 207,
1851 byte_length: 1,
1852 char_offset: 173,
1853 char_length: 1,
1854 token: Token::Special(Special::Separator(Separator::Space)),
1855 },
1856 CharToken {
1857 byte_offset: 208,
1858 byte_length: 14,
1859 char_offset: 174,
1860 char_length: 7,
1861 token: Token::Word(Word::Word("деления".to_string())),
1862 },
1863 CharToken {
1864 byte_offset: 222,
1865 byte_length: 1,
1866 char_offset: 181,
1867 char_length: 1,
1868 token: Token::Special(Special::Separator(Separator::Space)),
1869 },
1870 CharToken {
1871 byte_offset: 223,
1872 byte_length: 4,
1873 char_offset: 182,
1874 char_length: 2,
1875 token: Token::Word(Word::Word("по".to_string())),
1876 },
1877 CharToken {
1878 byte_offset: 227,
1879 byte_length: 1,
1880 char_offset: 184,
1881 char_length: 1,
1882 token: Token::Special(Special::Separator(Separator::Space)),
1883 },
1884 CharToken {
1885 byte_offset: 228,
1886 byte_length: 12,
1887 char_offset: 185,
1888 char_length: 6,
1889 token: Token::Word(Word::Word("юникод".to_string())),
1890 },
1891 CharToken {
1892 byte_offset: 240,
1893 byte_length: 1,
1894 char_offset: 191,
1895 char_length: 1,
1896 token: Token::Special(Special::Punctuation('-')),
1897 },
1898 CharToken {
1899 byte_offset: 241,
1900 byte_length: 12,
1901 char_offset: 192,
1902 char_length: 6,
1903 token: Token::Word(Word::Word("словам".to_string())),
1904 },
1905 CharToken {
1906 byte_offset: 253,
1907 byte_length: 3,
1908 char_offset: 198,
1909 char_length: 3,
1910 token: Token::Special(Special::Punctuation('.')),
1911 },
1912 CharToken {
1913 byte_offset: 256,
1914 byte_length: 1,
1915 char_offset: 201,
1916 char_length: 1,
1917 token: Token::Special(Special::Separator(Separator::Newline)),
1918 },
1919 CharToken {
1920 byte_offset: 257,
1921 byte_length: 8,
1922 char_offset: 202,
1923 char_length: 2,
1924 token: Token::Word(Word::Emoji("russia")),
1925 },
1926 CharToken {
1927 byte_offset: 265,
1928 byte_length: 1,
1929 char_offset: 204,
1930 char_length: 1,
1931 token: Token::Special(Special::Separator(Separator::Space)),
1932 },
1933 CharToken {
1934 byte_offset: 266,
1935 byte_length: 8,
1936 char_offset: 205,
1937 char_length: 2,
1938 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
1939 },
1940 CharToken {
1941 byte_offset: 274,
1942 byte_length: 1,
1943 char_offset: 207,
1944 char_length: 1,
1945 token: Token::Special(Special::Separator(Separator::Newline)),
1946 },
1947 CharToken {
1948 byte_offset: 275,
1949 byte_length: 8,
1950 char_offset: 208,
1951 char_length: 2,
1952 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
1953 },
1954 CharToken {
1955 byte_offset: 283,
1956 byte_length: 8,
1957 char_offset: 210,
1958 char_length: 2,
1959 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
1960 },
1961 CharToken {
1962 byte_offset: 291,
1963 byte_length: 8,
1964 char_offset: 212,
1965 char_length: 2,
1966 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
1967 },
1968 CharToken {
1969 byte_offset: 299,
1970 byte_length: 1,
1971 char_offset: 214,
1972 char_length: 1,
1973 token: Token::Special(Special::Separator(Separator::Newline)),
1974 },
1975 CharToken {
1976 byte_offset: 300,
1977 byte_length: 1,
1978 char_offset: 215,
1979 char_length: 1,
1980 token: Token::Special(Special::Punctuation('+')),
1981 },
1982 CharToken {
1983 byte_offset: 301,
1984 byte_length: 4,
1985 char_offset: 216,
1986 char_length: 4,
1987 token: Token::Word(Word::Word("Done".to_string())),
1988 },
1989 CharToken {
1990 byte_offset: 305,
1991 byte_length: 1,
1992 char_offset: 220,
1993 char_length: 1,
1994 token: Token::Special(Special::Punctuation('!')),
1995 },
1996 CharToken {
1997 byte_offset: 306,
1998 byte_length: 1,
1999 char_offset: 221,
2000 char_length: 1,
2001 token: Token::Special(Special::Separator(Separator::Space)),
2002 },
2003 CharToken {
2004 byte_offset: 307,
2005 byte_length: 12,
2006 char_offset: 222,
2007 char_length: 6,
2008 token: Token::Word(Word::Word("Готово".to_string())),
2009 },
2010 ];
2011
2012 let lib_res = uws
2013 .into_tokenizer(TokenizerParams::complex())
2014 .collect::<Vec<_>>();
2015
2016 check_cresults(&result, &lib_res, uws);
2018 }
2019
2020 #[test]
2021 fn general_default() {
2022 let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
2023 let result = vec![
2024 PositionalToken {
2025 source: uws,
2026 offset: 0,
2027 length: 3,
2028 token: Token::Word(Word::Word("The".to_string())),
2029 },
2030 PositionalToken {
2031 source: uws,
2032 offset: 3,
2033 length: 1,
2034 token: Token::Special(Special::Separator(Separator::Space)),
2035 },
2036 PositionalToken {
2037 source: uws,
2038 offset: 4,
2039 length: 5,
2040 token: Token::Word(Word::Word("quick".to_string())),
2041 },
2042 PositionalToken {
2043 source: uws,
2044 offset: 9,
2045 length: 1,
2046 token: Token::Special(Special::Separator(Separator::Space)),
2047 },
2048 PositionalToken {
2049 source: uws,
2050 offset: 10,
2051 length: 1,
2052 token: Token::Special(Special::Punctuation('(')),
2053 },
2054 PositionalToken {
2055 source: uws,
2056 offset: 11,
2057 length: 1,
2058 token: Token::Special(Special::Punctuation('"')),
2059 },
2060 PositionalToken {
2061 source: uws,
2062 offset: 12,
2063 length: 5,
2064 token: Token::Word(Word::Word("brown".to_string())),
2065 },
2066 PositionalToken {
2067 source: uws,
2068 offset: 17,
2069 length: 1,
2070 token: Token::Special(Special::Punctuation('"')),
2071 },
2072 PositionalToken {
2073 source: uws,
2074 offset: 18,
2075 length: 1,
2076 token: Token::Special(Special::Punctuation(')')),
2077 },
2078 PositionalToken {
2079 source: uws,
2080 offset: 19,
2081 length: 1,
2082 token: Token::Special(Special::Separator(Separator::Space)),
2083 },
2084 PositionalToken {
2085 source: uws,
2086 offset: 20,
2087 length: 3,
2088 token: Token::Word(Word::Word("fox".to_string())),
2089 },
2090 PositionalToken {
2091 source: uws,
2092 offset: 23,
2093 length: 1,
2094 token: Token::Special(Special::Separator(Separator::Space)),
2095 },
2096 PositionalToken {
2097 source: uws,
2098 offset: 24,
2099 length: 5,
2100 token: Token::Word(Word::Word("can\'t".to_string())),
2101 },
2102 PositionalToken {
2103 source: uws,
2104 offset: 29,
2105 length: 1,
2106 token: Token::Special(Special::Separator(Separator::Space)),
2107 },
2108 PositionalToken {
2109 source: uws,
2110 offset: 30,
2111 length: 4,
2112 token: Token::Word(Word::Word("jump".to_string())),
2113 },
2114 PositionalToken {
2115 source: uws,
2116 offset: 34,
2117 length: 1,
2118 token: Token::Special(Special::Separator(Separator::Space)),
2119 },
2120 PositionalToken {
2121 source: uws,
2122 offset: 35,
2123 length: 4,
2124 token: Token::Word(Word::Number(Number::Float(32.3))),
2125 },
2126 PositionalToken {
2127 source: uws,
2128 offset: 39,
2129 length: 1,
2130 token: Token::Special(Special::Separator(Separator::Space)),
2131 },
2132 PositionalToken {
2133 source: uws,
2134 offset: 40,
2135 length: 4,
2136 token: Token::Word(Word::Word("feet".to_string())),
2137 },
2138 PositionalToken {
2139 source: uws,
2140 offset: 44,
2141 length: 1,
2142 token: Token::Special(Special::Punctuation(',')),
2143 },
2144 PositionalToken {
2145 source: uws,
2146 offset: 45,
2147 length: 1,
2148 token: Token::Special(Special::Separator(Separator::Space)),
2149 },
2150 PositionalToken {
2151 source: uws,
2152 offset: 46,
2153 length: 5,
2154 token: Token::Word(Word::Word("right".to_string())),
2155 },
2156 PositionalToken {
2157 source: uws,
2158 offset: 51,
2159 length: 1,
2160 token: Token::Special(Special::Punctuation('?')),
2161 },
2162 PositionalToken {
2163 source: uws,
2164 offset: 52,
2165 length: 1,
2166 token: Token::Special(Special::Separator(Separator::Space)),
2167 },
2168 PositionalToken {
2169 source: uws,
2170 offset: 53,
2171 length: 4,
2172 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
2173 }, PositionalToken {
2175 source: uws,
2176 offset: 57,
2177 length: 1,
2178 token: Token::Special(Special::Separator(Separator::Space)),
2179 },
2180 PositionalToken {
2181 source: uws,
2182 offset: 58,
2183 length: 3,
2184 token: Token::Word(Word::Word("etc".to_string())),
2185 },
2186 PositionalToken {
2187 source: uws,
2188 offset: 61,
2189 length: 1,
2190 token: Token::Special(Special::Punctuation('.')),
2191 },
2192 PositionalToken {
2193 source: uws,
2194 offset: 62,
2195 length: 1,
2196 token: Token::Special(Special::Separator(Separator::Space)),
2197 },
2198 PositionalToken {
2199 source: uws,
2200 offset: 63,
2201 length: 3,
2202 token: Token::Word(Word::Word("qeq".to_string())),
2203 },
2204 PositionalToken {
2205 source: uws,
2206 offset: 66,
2207 length: 1,
2208 token: Token::Special(Special::Separator(Separator::Space)),
2209 },
2210 PositionalToken {
2211 source: uws,
2212 offset: 67,
2213 length: 1,
2214 token: Token::Word(Word::Word("U".to_string())),
2215 },
2216 PositionalToken {
2217 source: uws,
2218 offset: 68,
2219 length: 1,
2220 token: Token::Special(Special::Punctuation('.')),
2221 },
2222 PositionalToken {
2223 source: uws,
2224 offset: 69,
2225 length: 1,
2226 token: Token::Word(Word::Word("S".to_string())),
2227 },
2228 PositionalToken {
2229 source: uws,
2230 offset: 70,
2231 length: 1,
2232 token: Token::Special(Special::Punctuation('.')),
2233 },
2234 PositionalToken {
2235 source: uws,
2236 offset: 71,
2237 length: 1,
2238 token: Token::Word(Word::Word("A".to_string())),
2239 },
2240 PositionalToken {
2241 source: uws,
2242 offset: 72,
2243 length: 2,
2244 token: Token::Special(Special::Separator(Separator::Space)),
2245 },
2246 PositionalToken {
2247 source: uws,
2248 offset: 74,
2249 length: 3,
2250 token: Token::Word(Word::Word("asd".to_string())),
2251 },
2252 PositionalToken {
2253 source: uws,
2254 offset: 77,
2255 length: 3,
2256 token: Token::Special(Special::Separator(Separator::Newline)),
2257 },
2258 PositionalToken {
2259 source: uws,
2260 offset: 80,
2261 length: 3,
2262 token: Token::Word(Word::Word("Brr".to_string())),
2263 },
2264 PositionalToken {
2265 source: uws,
2266 offset: 83,
2267 length: 1,
2268 token: Token::Special(Special::Punctuation(',')),
2269 },
2270 PositionalToken {
2271 source: uws,
2272 offset: 84,
2273 length: 1,
2274 token: Token::Special(Special::Separator(Separator::Space)),
2275 },
2276 PositionalToken {
2277 source: uws,
2278 offset: 85,
2279 length: 4,
2280 token: Token::Word(Word::Word("it\'s".to_string())),
2281 },
2282 PositionalToken {
2283 source: uws,
2284 offset: 89,
2285 length: 1,
2286 token: Token::Special(Special::Separator(Separator::Space)),
2287 },
2288 PositionalToken {
2289 source: uws,
2290 offset: 90,
2291 length: 4,
2292 token: Token::Word(Word::Number(Number::Float(29.3))),
2293 },
2294 PositionalToken {
2295 source: uws,
2296 offset: 94,
2297 length: 2,
2298 token: Token::Special(Special::Symbol('°')),
2299 },
2300 PositionalToken {
2301 source: uws,
2302 offset: 96,
2303 length: 1,
2304 token: Token::Word(Word::Word("F".to_string())),
2305 },
2306 PositionalToken {
2307 source: uws,
2308 offset: 97,
2309 length: 1,
2310 token: Token::Special(Special::Punctuation('!')),
2311 },
2312 PositionalToken {
2313 source: uws,
2314 offset: 98,
2315 length: 1,
2316 token: Token::Special(Special::Separator(Separator::Newline)),
2317 },
2318 PositionalToken {
2319 source: uws,
2320 offset: 99,
2321 length: 1,
2322 token: Token::Special(Special::Separator(Separator::Space)),
2323 },
2324 PositionalToken {
2325 source: uws,
2326 offset: 100,
2327 length: 14,
2328 token: Token::Word(Word::Word("Русское".to_string())),
2329 },
2330 PositionalToken {
2331 source: uws,
2332 offset: 114,
2333 length: 1,
2334 token: Token::Special(Special::Separator(Separator::Space)),
2335 },
2336 PositionalToken {
2337 source: uws,
2338 offset: 115,
2339 length: 22,
2340 token: Token::Word(Word::Word("предложение".to_string())),
2341 },
2342 PositionalToken {
2343 source: uws,
2344 offset: 137,
2345 length: 1,
2346 token: Token::Special(Special::Separator(Separator::Space)),
2347 },
2348 PositionalToken {
2349 source: uws,
2350 offset: 138,
2351 length: 5,
2352 token: Token::Struct(Struct::Hashtag("36.6".to_string())),
2353 },
2354 PositionalToken {
2367 source: uws,
2368 offset: 143,
2369 length: 1,
2370 token: Token::Special(Special::Separator(Separator::Space)),
2371 },
2372 PositionalToken {
2373 source: uws,
2374 offset: 144,
2375 length: 6,
2376 token: Token::Word(Word::Word("для".to_string())),
2377 },
2378 PositionalToken {
2379 source: uws,
2380 offset: 150,
2381 length: 1,
2382 token: Token::Special(Special::Separator(Separator::Space)),
2383 },
2384 PositionalToken {
2385 source: uws,
2386 offset: 151,
2387 length: 24,
2388 token: Token::Word(Word::Word("тестирования".to_string())),
2389 },
2390 PositionalToken {
2391 source: uws,
2392 offset: 175,
2393 length: 1,
2394 token: Token::Special(Special::Separator(Separator::Space)),
2395 },
2396 PositionalToken {
2397 source: uws,
2398 offset: 176,
2399 length: 14,
2400 token: Token::Word(Word::Word("деления".to_string())),
2401 },
2402 PositionalToken {
2403 source: uws,
2404 offset: 190,
2405 length: 1,
2406 token: Token::Special(Special::Separator(Separator::Space)),
2407 },
2408 PositionalToken {
2409 source: uws,
2410 offset: 191,
2411 length: 4,
2412 token: Token::Word(Word::Word("по".to_string())),
2413 },
2414 PositionalToken {
2415 source: uws,
2416 offset: 195,
2417 length: 1,
2418 token: Token::Special(Special::Separator(Separator::Space)),
2419 },
2420 PositionalToken {
2421 source: uws,
2422 offset: 196,
2423 length: 12,
2424 token: Token::Word(Word::Word("юникод".to_string())),
2425 },
2426 PositionalToken {
2427 source: uws,
2428 offset: 208,
2429 length: 1,
2430 token: Token::Special(Special::Punctuation('-')),
2431 },
2432 PositionalToken {
2433 source: uws,
2434 offset: 209,
2435 length: 12,
2436 token: Token::Word(Word::Word("словам".to_string())),
2437 },
2438 PositionalToken {
2439 source: uws,
2440 offset: 221,
2441 length: 3,
2442 token: Token::Special(Special::Punctuation('.')),
2443 },
2444 PositionalToken {
2445 source: uws,
2446 offset: 224,
2447 length: 1,
2448 token: Token::Special(Special::Separator(Separator::Newline)),
2449 },
2450 ];
2451 let lib_res = uws
2452 .into_tokenizer(TokenizerParams::v1())
2453 .collect::<Vec<_>>();
2454 check_results(&result, &lib_res, uws);
2455 }
2456
2457 #[test]
2458 fn general_no_split() {
2459 let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
2460 let result = vec![
2461 PositionalToken {
2462 source: uws,
2463 offset: 0,
2464 length: 3,
2465 token: Token::Word(Word::Word("The".to_string())),
2466 },
2467 PositionalToken {
2468 source: uws,
2469 offset: 3,
2470 length: 1,
2471 token: Token::Special(Special::Separator(Separator::Space)),
2472 },
2473 PositionalToken {
2474 source: uws,
2475 offset: 4,
2476 length: 5,
2477 token: Token::Word(Word::Word("quick".to_string())),
2478 },
2479 PositionalToken {
2480 source: uws,
2481 offset: 9,
2482 length: 1,
2483 token: Token::Special(Special::Separator(Separator::Space)),
2484 },
2485 PositionalToken {
2486 source: uws,
2487 offset: 10,
2488 length: 1,
2489 token: Token::Special(Special::Punctuation('(')),
2490 },
2491 PositionalToken {
2492 source: uws,
2493 offset: 11,
2494 length: 1,
2495 token: Token::Special(Special::Punctuation('"')),
2496 },
2497 PositionalToken {
2498 source: uws,
2499 offset: 12,
2500 length: 5,
2501 token: Token::Word(Word::Word("brown".to_string())),
2502 },
2503 PositionalToken {
2504 source: uws,
2505 offset: 17,
2506 length: 1,
2507 token: Token::Special(Special::Punctuation('"')),
2508 },
2509 PositionalToken {
2510 source: uws,
2511 offset: 18,
2512 length: 1,
2513 token: Token::Special(Special::Punctuation(')')),
2514 },
2515 PositionalToken {
2516 source: uws,
2517 offset: 19,
2518 length: 1,
2519 token: Token::Special(Special::Separator(Separator::Space)),
2520 },
2521 PositionalToken {
2522 source: uws,
2523 offset: 20,
2524 length: 3,
2525 token: Token::Word(Word::Word("fox".to_string())),
2526 },
2527 PositionalToken {
2528 source: uws,
2529 offset: 23,
2530 length: 1,
2531 token: Token::Special(Special::Separator(Separator::Space)),
2532 },
2533 PositionalToken {
2534 source: uws,
2535 offset: 24,
2536 length: 5,
2537 token: Token::Word(Word::Word("can\'t".to_string())),
2538 },
2539 PositionalToken {
2540 source: uws,
2541 offset: 29,
2542 length: 1,
2543 token: Token::Special(Special::Separator(Separator::Space)),
2544 },
2545 PositionalToken {
2546 source: uws,
2547 offset: 30,
2548 length: 4,
2549 token: Token::Word(Word::Word("jump".to_string())),
2550 },
2551 PositionalToken {
2552 source: uws,
2553 offset: 34,
2554 length: 1,
2555 token: Token::Special(Special::Separator(Separator::Space)),
2556 },
2557 PositionalToken {
2558 source: uws,
2559 offset: 35,
2560 length: 4,
2561 token: Token::Word(Word::Number(Number::Float(32.3))),
2562 },
2563 PositionalToken {
2564 source: uws,
2565 offset: 39,
2566 length: 1,
2567 token: Token::Special(Special::Separator(Separator::Space)),
2568 },
2569 PositionalToken {
2570 source: uws,
2571 offset: 40,
2572 length: 4,
2573 token: Token::Word(Word::Word("feet".to_string())),
2574 },
2575 PositionalToken {
2576 source: uws,
2577 offset: 44,
2578 length: 1,
2579 token: Token::Special(Special::Punctuation(',')),
2580 },
2581 PositionalToken {
2582 source: uws,
2583 offset: 45,
2584 length: 1,
2585 token: Token::Special(Special::Separator(Separator::Space)),
2586 },
2587 PositionalToken {
2588 source: uws,
2589 offset: 46,
2590 length: 5,
2591 token: Token::Word(Word::Word("right".to_string())),
2592 },
2593 PositionalToken {
2594 source: uws,
2595 offset: 51,
2596 length: 1,
2597 token: Token::Special(Special::Punctuation('?')),
2598 },
2599 PositionalToken {
2600 source: uws,
2601 offset: 52,
2602 length: 1,
2603 token: Token::Special(Special::Separator(Separator::Space)),
2604 },
2605 PositionalToken {
2606 source: uws,
2607 offset: 53,
2608 length: 4,
2609 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
2610 }, PositionalToken {
2612 source: uws,
2613 offset: 57,
2614 length: 1,
2615 token: Token::Special(Special::Separator(Separator::Space)),
2616 },
2617 PositionalToken {
2618 source: uws,
2619 offset: 58,
2620 length: 3,
2621 token: Token::Word(Word::Word("etc".to_string())),
2622 },
2623 PositionalToken {
2624 source: uws,
2625 offset: 61,
2626 length: 1,
2627 token: Token::Special(Special::Punctuation('.')),
2628 },
2629 PositionalToken {
2630 source: uws,
2631 offset: 62,
2632 length: 1,
2633 token: Token::Special(Special::Separator(Separator::Space)),
2634 },
2635 PositionalToken {
2636 source: uws,
2637 offset: 63,
2638 length: 3,
2639 token: Token::Word(Word::Word("qeq".to_string())),
2640 },
2641 PositionalToken {
2642 source: uws,
2643 offset: 66,
2644 length: 1,
2645 token: Token::Special(Special::Separator(Separator::Space)),
2646 },
2647 PositionalToken {
2648 source: uws,
2649 offset: 67,
2650 length: 5,
2651 token: Token::Word(Word::Word("U.S.A".to_string())),
2652 },
2653 PositionalToken {
2654 source: uws,
2655 offset: 72,
2656 length: 1,
2657 token: Token::Special(Special::Separator(Separator::Space)),
2658 },
2659 PositionalToken {
2660 source: uws,
2661 offset: 73,
2662 length: 1,
2663 token: Token::Special(Special::Separator(Separator::Space)),
2664 },
2665 PositionalToken {
2666 source: uws,
2667 offset: 74,
2668 length: 3,
2669 token: Token::Word(Word::Word("asd".to_string())),
2670 },
2671 PositionalToken {
2672 source: uws,
2673 offset: 77,
2674 length: 1,
2675 token: Token::Special(Special::Separator(Separator::Newline)),
2676 },
2677 PositionalToken {
2678 source: uws,
2679 offset: 78,
2680 length: 1,
2681 token: Token::Special(Special::Separator(Separator::Newline)),
2682 },
2683 PositionalToken {
2684 source: uws,
2685 offset: 79,
2686 length: 1,
2687 token: Token::Special(Special::Separator(Separator::Newline)),
2688 },
2689 PositionalToken {
2690 source: uws,
2691 offset: 80,
2692 length: 3,
2693 token: Token::Word(Word::Word("Brr".to_string())),
2694 },
2695 PositionalToken {
2696 source: uws,
2697 offset: 83,
2698 length: 1,
2699 token: Token::Special(Special::Punctuation(',')),
2700 },
2701 PositionalToken {
2702 source: uws,
2703 offset: 84,
2704 length: 1,
2705 token: Token::Special(Special::Separator(Separator::Space)),
2706 },
2707 PositionalToken {
2708 source: uws,
2709 offset: 85,
2710 length: 4,
2711 token: Token::Word(Word::Word("it\'s".to_string())),
2712 },
2713 PositionalToken {
2714 source: uws,
2715 offset: 89,
2716 length: 1,
2717 token: Token::Special(Special::Separator(Separator::Space)),
2718 },
2719 PositionalToken {
2720 source: uws,
2721 offset: 90,
2722 length: 4,
2723 token: Token::Word(Word::Number(Number::Float(29.3))),
2724 },
2725 PositionalToken {
2726 source: uws,
2727 offset: 94,
2728 length: 2,
2729 token: Token::Special(Special::Symbol('°')),
2730 },
2731 PositionalToken {
2732 source: uws,
2733 offset: 96,
2734 length: 1,
2735 token: Token::Word(Word::Word("F".to_string())),
2736 },
2737 PositionalToken {
2738 source: uws,
2739 offset: 97,
2740 length: 1,
2741 token: Token::Special(Special::Punctuation('!')),
2742 },
2743 PositionalToken {
2744 source: uws,
2745 offset: 98,
2746 length: 1,
2747 token: Token::Special(Special::Separator(Separator::Newline)),
2748 },
2749 PositionalToken {
2750 source: uws,
2751 offset: 99,
2752 length: 1,
2753 token: Token::Special(Special::Separator(Separator::Space)),
2754 },
2755 PositionalToken {
2756 source: uws,
2757 offset: 100,
2758 length: 14,
2759 token: Token::Word(Word::Word("Русское".to_string())),
2760 },
2761 PositionalToken {
2762 source: uws,
2763 offset: 114,
2764 length: 1,
2765 token: Token::Special(Special::Separator(Separator::Space)),
2766 },
2767 PositionalToken {
2768 source: uws,
2769 offset: 115,
2770 length: 22,
2771 token: Token::Word(Word::Word("предложение".to_string())),
2772 },
2773 PositionalToken {
2774 source: uws,
2775 offset: 137,
2776 length: 1,
2777 token: Token::Special(Special::Separator(Separator::Space)),
2778 },
2779 PositionalToken {
2780 source: uws,
2781 offset: 138,
2782 length: 1,
2783 token: Token::Special(Special::Punctuation('#')),
2784 },
2785 PositionalToken {
2786 source: uws,
2787 offset: 139,
2788 length: 4,
2789 token: Token::Word(Word::Number(Number::Float(36.6))),
2790 },
2791 PositionalToken {
2792 source: uws,
2793 offset: 143,
2794 length: 1,
2795 token: Token::Special(Special::Separator(Separator::Space)),
2796 },
2797 PositionalToken {
2798 source: uws,
2799 offset: 144,
2800 length: 6,
2801 token: Token::Word(Word::Word("для".to_string())),
2802 },
2803 PositionalToken {
2804 source: uws,
2805 offset: 150,
2806 length: 1,
2807 token: Token::Special(Special::Separator(Separator::Space)),
2808 },
2809 PositionalToken {
2810 source: uws,
2811 offset: 151,
2812 length: 24,
2813 token: Token::Word(Word::Word("тестирования".to_string())),
2814 },
2815 PositionalToken {
2816 source: uws,
2817 offset: 175,
2818 length: 1,
2819 token: Token::Special(Special::Separator(Separator::Space)),
2820 },
2821 PositionalToken {
2822 source: uws,
2823 offset: 176,
2824 length: 14,
2825 token: Token::Word(Word::Word("деления".to_string())),
2826 },
2827 PositionalToken {
2828 source: uws,
2829 offset: 190,
2830 length: 1,
2831 token: Token::Special(Special::Separator(Separator::Space)),
2832 },
2833 PositionalToken {
2834 source: uws,
2835 offset: 191,
2836 length: 4,
2837 token: Token::Word(Word::Word("по".to_string())),
2838 },
2839 PositionalToken {
2840 source: uws,
2841 offset: 195,
2842 length: 1,
2843 token: Token::Special(Special::Separator(Separator::Space)),
2844 },
2845 PositionalToken {
2846 source: uws,
2847 offset: 196,
2848 length: 12,
2849 token: Token::Word(Word::Word("юникод".to_string())),
2850 },
2851 PositionalToken {
2852 source: uws,
2853 offset: 208,
2854 length: 1,
2855 token: Token::Special(Special::Punctuation('-')),
2856 },
2857 PositionalToken {
2858 source: uws,
2859 offset: 209,
2860 length: 12,
2861 token: Token::Word(Word::Word("словам".to_string())),
2862 },
2863 PositionalToken {
2864 source: uws,
2865 offset: 221,
2866 length: 1,
2867 token: Token::Special(Special::Punctuation('.')),
2868 },
2869 PositionalToken {
2870 source: uws,
2871 offset: 222,
2872 length: 1,
2873 token: Token::Special(Special::Punctuation('.')),
2874 },
2875 PositionalToken {
2876 source: uws,
2877 offset: 223,
2878 length: 1,
2879 token: Token::Special(Special::Punctuation('.')),
2880 },
2881 PositionalToken {
2882 source: uws,
2883 offset: 224,
2884 length: 1,
2885 token: Token::Special(Special::Separator(Separator::Newline)),
2886 },
2887 ];
2888 let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
2889 check_results(&result, &lib_res, uws);
2890 }
2891
2892 #[test]
2893 fn general_complex() {
2894 let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
2895 let result = vec![
2896 PositionalToken {
2897 source: uws,
2898 offset: 0,
2899 length: 3,
2900 token: Token::Word(Word::Word("The".to_string())),
2901 },
2902 PositionalToken {
2903 source: uws,
2904 offset: 3,
2905 length: 1,
2906 token: Token::Special(Special::Separator(Separator::Space)),
2907 },
2908 PositionalToken {
2909 source: uws,
2910 offset: 4,
2911 length: 5,
2912 token: Token::Word(Word::Word("quick".to_string())),
2913 },
2914 PositionalToken {
2915 source: uws,
2916 offset: 9,
2917 length: 1,
2918 token: Token::Special(Special::Separator(Separator::Space)),
2919 },
2920 PositionalToken {
2921 source: uws,
2922 offset: 10,
2923 length: 1,
2924 token: Token::Special(Special::Punctuation('(')),
2925 },
2926 PositionalToken {
2927 source: uws,
2928 offset: 11,
2929 length: 1,
2930 token: Token::Special(Special::Punctuation('"')),
2931 },
2932 PositionalToken {
2933 source: uws,
2934 offset: 12,
2935 length: 5,
2936 token: Token::Word(Word::Word("brown".to_string())),
2937 },
2938 PositionalToken {
2939 source: uws,
2940 offset: 17,
2941 length: 1,
2942 token: Token::Special(Special::Punctuation('"')),
2943 },
2944 PositionalToken {
2945 source: uws,
2946 offset: 18,
2947 length: 1,
2948 token: Token::Special(Special::Punctuation(')')),
2949 },
2950 PositionalToken {
2951 source: uws,
2952 offset: 19,
2953 length: 1,
2954 token: Token::Special(Special::Separator(Separator::Space)),
2955 },
2956 PositionalToken {
2957 source: uws,
2958 offset: 20,
2959 length: 3,
2960 token: Token::Word(Word::Word("fox".to_string())),
2961 },
2962 PositionalToken {
2963 source: uws,
2964 offset: 23,
2965 length: 1,
2966 token: Token::Special(Special::Separator(Separator::Space)),
2967 },
2968 PositionalToken {
2969 source: uws,
2970 offset: 24,
2971 length: 5,
2972 token: Token::Word(Word::Word("can\'t".to_string())),
2973 },
2974 PositionalToken {
2975 source: uws,
2976 offset: 29,
2977 length: 1,
2978 token: Token::Special(Special::Separator(Separator::Space)),
2979 },
2980 PositionalToken {
2981 source: uws,
2982 offset: 30,
2983 length: 4,
2984 token: Token::Word(Word::Word("jump".to_string())),
2985 },
2986 PositionalToken {
2987 source: uws,
2988 offset: 34,
2989 length: 1,
2990 token: Token::Special(Special::Separator(Separator::Space)),
2991 },
2992 PositionalToken {
2993 source: uws,
2994 offset: 35,
2995 length: 4,
2996 token: Token::Word(Word::Number(Number::Float(32.3))),
2997 },
2998 PositionalToken {
2999 source: uws,
3000 offset: 39,
3001 length: 1,
3002 token: Token::Special(Special::Separator(Separator::Space)),
3003 },
3004 PositionalToken {
3005 source: uws,
3006 offset: 40,
3007 length: 4,
3008 token: Token::Word(Word::Word("feet".to_string())),
3009 },
3010 PositionalToken {
3011 source: uws,
3012 offset: 44,
3013 length: 1,
3014 token: Token::Special(Special::Punctuation(',')),
3015 },
3016 PositionalToken {
3017 source: uws,
3018 offset: 45,
3019 length: 1,
3020 token: Token::Special(Special::Separator(Separator::Space)),
3021 },
3022 PositionalToken {
3023 source: uws,
3024 offset: 46,
3025 length: 5,
3026 token: Token::Word(Word::Word("right".to_string())),
3027 },
3028 PositionalToken {
3029 source: uws,
3030 offset: 51,
3031 length: 1,
3032 token: Token::Special(Special::Punctuation('?')),
3033 },
3034 PositionalToken {
3035 source: uws,
3036 offset: 52,
3037 length: 1,
3038 token: Token::Special(Special::Separator(Separator::Space)),
3039 },
3040 PositionalToken {
3041 source: uws,
3042 offset: 53,
3043 length: 4,
3044 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
3045 }, PositionalToken {
3047 source: uws,
3048 offset: 57,
3049 length: 1,
3050 token: Token::Special(Special::Separator(Separator::Space)),
3051 },
3052 PositionalToken {
3053 source: uws,
3054 offset: 58,
3055 length: 3,
3056 token: Token::Word(Word::Word("etc".to_string())),
3057 },
3058 PositionalToken {
3059 source: uws,
3060 offset: 61,
3061 length: 1,
3062 token: Token::Special(Special::Punctuation('.')),
3063 },
3064 PositionalToken {
3065 source: uws,
3066 offset: 62,
3067 length: 1,
3068 token: Token::Special(Special::Separator(Separator::Space)),
3069 },
3070 PositionalToken {
3071 source: uws,
3072 offset: 63,
3073 length: 3,
3074 token: Token::Word(Word::Word("qeq".to_string())),
3075 },
3076 PositionalToken {
3077 source: uws,
3078 offset: 66,
3079 length: 1,
3080 token: Token::Special(Special::Separator(Separator::Space)),
3081 },
3082 PositionalToken {
3083 source: uws,
3084 offset: 67,
3085 length: 5,
3086 token: Token::Word(Word::Word("U.S.A".to_string())),
3087 },
3088 PositionalToken {
3089 source: uws,
3090 offset: 72,
3091 length: 2,
3092 token: Token::Special(Special::Separator(Separator::Space)),
3093 },
3094 PositionalToken {
3095 source: uws,
3096 offset: 74,
3097 length: 3,
3098 token: Token::Word(Word::Word("asd".to_string())),
3099 },
3100 PositionalToken {
3101 source: uws,
3102 offset: 77,
3103 length: 3,
3104 token: Token::Special(Special::Separator(Separator::Newline)),
3105 },
3106 PositionalToken {
3107 source: uws,
3108 offset: 80,
3109 length: 3,
3110 token: Token::Word(Word::Word("Brr".to_string())),
3111 },
3112 PositionalToken {
3113 source: uws,
3114 offset: 83,
3115 length: 1,
3116 token: Token::Special(Special::Punctuation(',')),
3117 },
3118 PositionalToken {
3119 source: uws,
3120 offset: 84,
3121 length: 1,
3122 token: Token::Special(Special::Separator(Separator::Space)),
3123 },
3124 PositionalToken {
3125 source: uws,
3126 offset: 85,
3127 length: 4,
3128 token: Token::Word(Word::Word("it\'s".to_string())),
3129 },
3130 PositionalToken {
3131 source: uws,
3132 offset: 89,
3133 length: 1,
3134 token: Token::Special(Special::Separator(Separator::Space)),
3135 },
3136 PositionalToken {
3137 source: uws,
3138 offset: 90,
3139 length: 4,
3140 token: Token::Word(Word::Number(Number::Float(29.3))),
3141 },
3142 PositionalToken {
3143 source: uws,
3144 offset: 94,
3145 length: 2,
3146 token: Token::Special(Special::Symbol('°')),
3147 },
3148 PositionalToken {
3149 source: uws,
3150 offset: 96,
3151 length: 1,
3152 token: Token::Word(Word::Word("F".to_string())),
3153 },
3154 PositionalToken {
3155 source: uws,
3156 offset: 97,
3157 length: 1,
3158 token: Token::Special(Special::Punctuation('!')),
3159 },
3160 PositionalToken {
3161 source: uws,
3162 offset: 98,
3163 length: 1,
3164 token: Token::Special(Special::Separator(Separator::Newline)),
3165 },
3166 PositionalToken {
3167 source: uws,
3168 offset: 99,
3169 length: 1,
3170 token: Token::Special(Special::Separator(Separator::Space)),
3171 },
3172 PositionalToken {
3173 source: uws,
3174 offset: 100,
3175 length: 14,
3176 token: Token::Word(Word::Word("Русское".to_string())),
3177 },
3178 PositionalToken {
3179 source: uws,
3180 offset: 114,
3181 length: 1,
3182 token: Token::Special(Special::Separator(Separator::Space)),
3183 },
3184 PositionalToken {
3185 source: uws,
3186 offset: 115,
3187 length: 22,
3188 token: Token::Word(Word::Word("предложение".to_string())),
3189 },
3190 PositionalToken {
3191 source: uws,
3192 offset: 137,
3193 length: 1,
3194 token: Token::Special(Special::Separator(Separator::Space)),
3195 },
3196 PositionalToken {
3197 source: uws,
3198 offset: 138,
3199 length: 5,
3200 token: Token::Struct(Struct::Hashtag("36.6".to_string())),
3201 },
3202 PositionalToken {
3203 source: uws,
3204 offset: 143,
3205 length: 1,
3206 token: Token::Special(Special::Separator(Separator::Space)),
3207 },
3208 PositionalToken {
3209 source: uws,
3210 offset: 144,
3211 length: 6,
3212 token: Token::Word(Word::Word("для".to_string())),
3213 },
3214 PositionalToken {
3215 source: uws,
3216 offset: 150,
3217 length: 1,
3218 token: Token::Special(Special::Separator(Separator::Space)),
3219 },
3220 PositionalToken {
3221 source: uws,
3222 offset: 151,
3223 length: 24,
3224 token: Token::Word(Word::Word("тестирования".to_string())),
3225 },
3226 PositionalToken {
3227 source: uws,
3228 offset: 175,
3229 length: 1,
3230 token: Token::Special(Special::Separator(Separator::Space)),
3231 },
3232 PositionalToken {
3233 source: uws,
3234 offset: 176,
3235 length: 14,
3236 token: Token::Word(Word::Word("деления".to_string())),
3237 },
3238 PositionalToken {
3239 source: uws,
3240 offset: 190,
3241 length: 1,
3242 token: Token::Special(Special::Separator(Separator::Space)),
3243 },
3244 PositionalToken {
3245 source: uws,
3246 offset: 191,
3247 length: 4,
3248 token: Token::Word(Word::Word("по".to_string())),
3249 },
3250 PositionalToken {
3251 source: uws,
3252 offset: 195,
3253 length: 1,
3254 token: Token::Special(Special::Separator(Separator::Space)),
3255 },
3256 PositionalToken {
3257 source: uws,
3258 offset: 196,
3259 length: 12,
3260 token: Token::Word(Word::Word("юникод".to_string())),
3261 },
3262 PositionalToken {
3263 source: uws,
3264 offset: 208,
3265 length: 1,
3266 token: Token::Special(Special::Punctuation('-')),
3267 },
3268 PositionalToken {
3269 source: uws,
3270 offset: 209,
3271 length: 12,
3272 token: Token::Word(Word::Word("словам".to_string())),
3273 },
3274 PositionalToken {
3275 source: uws,
3276 offset: 221,
3277 length: 3,
3278 token: Token::Special(Special::Punctuation('.')),
3279 },
3280 PositionalToken {
3281 source: uws,
3282 offset: 224,
3283 length: 1,
3284 token: Token::Special(Special::Separator(Separator::Newline)),
3285 },
3286 ];
3287 let lib_res = uws
3288 .into_tokenizer(TokenizerParams::complex())
3289 .collect::<Vec<_>>();
3290 check_results(&result, &lib_res, uws);
3291 }
3292
3293 #[test]
3294 fn plus_minus() {
3295 let uws = "+23 -4.5 -34 +25.7 - 2 + 5.6";
3296 let result = vec![
3297 PositionalToken {
3298 source: uws,
3299 offset: 0,
3300 length: 3,
3301 token: Token::Word(Word::Number(Number::Integer(23))),
3302 },
3303 PositionalToken {
3304 source: uws,
3305 offset: 3,
3306 length: 1,
3307 token: Token::Special(Special::Separator(Separator::Space)),
3308 },
3309 PositionalToken {
3310 source: uws,
3311 offset: 4,
3312 length: 4,
3313 token: Token::Word(Word::Number(Number::Float(-4.5))),
3314 },
3315 PositionalToken {
3316 source: uws,
3317 offset: 8,
3318 length: 1,
3319 token: Token::Special(Special::Separator(Separator::Space)),
3320 },
3321 PositionalToken {
3322 source: uws,
3323 offset: 9,
3324 length: 3,
3325 token: Token::Word(Word::Number(Number::Integer(-34))),
3326 },
3327 PositionalToken {
3328 source: uws,
3329 offset: 12,
3330 length: 1,
3331 token: Token::Special(Special::Separator(Separator::Space)),
3332 },
3333 PositionalToken {
3334 source: uws,
3335 offset: 13,
3336 length: 5,
3337 token: Token::Word(Word::Number(Number::Float(25.7))),
3338 },
3339 PositionalToken {
3340 source: uws,
3341 offset: 18,
3342 length: 1,
3343 token: Token::Special(Special::Separator(Separator::Space)),
3344 },
3345 PositionalToken {
3346 source: uws,
3347 offset: 19,
3348 length: 1,
3349 token: Token::Special(Special::Punctuation('-')),
3350 },
3351 PositionalToken {
3352 source: uws,
3353 offset: 20,
3354 length: 1,
3355 token: Token::Special(Special::Separator(Separator::Space)),
3356 },
3357 PositionalToken {
3358 source: uws,
3359 offset: 21,
3360 length: 1,
3361 token: Token::Word(Word::Number(Number::Integer(2))),
3362 },
3363 PositionalToken {
3364 source: uws,
3365 offset: 22,
3366 length: 1,
3367 token: Token::Special(Special::Separator(Separator::Space)),
3368 },
3369 PositionalToken {
3370 source: uws,
3371 offset: 23,
3372 length: 1,
3373 token: Token::Special(Special::Punctuation('+')),
3374 },
3375 PositionalToken {
3376 source: uws,
3377 offset: 24,
3378 length: 1,
3379 token: Token::Special(Special::Separator(Separator::Space)),
3380 },
3381 PositionalToken {
3382 source: uws,
3383 offset: 25,
3384 length: 3,
3385 token: Token::Word(Word::Number(Number::Float(5.6))),
3386 },
3387 ];
3388 let lib_res = uws
3389 .into_tokenizer(TokenizerParams::v1())
3390 .collect::<Vec<_>>();
3391 check(&result, &lib_res, uws);
3392 }
3394
3395 #[test]
3396 #[ignore]
3397 fn woman_bouncing_ball() {
3398 let uws = "\u{26f9}\u{200d}\u{2640}";
3399 let result = vec![PositionalToken {
3400 source: uws,
3401 offset: 0,
3402 length: 9,
3403 token: Token::Word(Word::Emoji("woman_bouncing_ball")),
3404 }];
3405 let lib_res = uws
3406 .into_tokenizer(TokenizerParams::v1())
3407 .collect::<Vec<_>>();
3408 check_results(&result, &lib_res, uws);
3409 }
3411
3412 #[test]
3413 fn emoji_and_rusabbr_default() {
3414 let uws = "🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n👱\nС.С.С.Р.\n👨👩👦👦\n🧠\n";
3415 let result = vec![
3416 PositionalToken {
3417 source: uws,
3418 offset: 0,
3419 length: 8,
3420 token: Token::Word(Word::Emoji("russia")),
3421 },
3422 PositionalToken {
3423 source: uws,
3424 offset: 8,
3425 length: 1,
3426 token: Token::Special(Special::Separator(Separator::Space)),
3427 },
3428 PositionalToken {
3429 source: uws,
3430 offset: 9,
3431 length: 8,
3432 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
3433 },
3434 PositionalToken {
3435 source: uws,
3436 offset: 17,
3437 length: 1,
3438 token: Token::Special(Special::Separator(Separator::Newline)),
3439 },
3440 PositionalToken {
3441 source: uws,
3442 offset: 18,
3443 length: 8,
3444 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
3445 },
3446 PositionalToken {
3447 source: uws,
3448 offset: 26,
3449 length: 8,
3450 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
3451 },
3452 PositionalToken {
3453 source: uws,
3454 offset: 34,
3455 length: 8,
3456 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
3457 },
3458 PositionalToken {
3459 source: uws,
3460 offset: 42,
3461 length: 1,
3462 token: Token::Special(Special::Separator(Separator::Newline)),
3463 },
3464 PositionalToken {
3465 source: uws,
3466 offset: 43,
3467 length: 4,
3468 token: Token::Word(Word::Emoji("blond_haired_person")),
3469 },
3470 PositionalToken {
3471 source: uws,
3472 offset: 47,
3473 length: 1,
3474 token: Token::Special(Special::Separator(Separator::Newline)),
3475 },
3476 PositionalToken {
3477 source: uws,
3478 offset: 48,
3479 length: 2,
3480 token: Token::Word(Word::Word("С".to_string())),
3481 },
3482 PositionalToken {
3483 source: uws,
3484 offset: 50,
3485 length: 1,
3486 token: Token::Special(Special::Punctuation('.')),
3487 },
3488 PositionalToken {
3489 source: uws,
3490 offset: 51,
3491 length: 2,
3492 token: Token::Word(Word::Word("С".to_string())),
3493 },
3494 PositionalToken {
3495 source: uws,
3496 offset: 53,
3497 length: 1,
3498 token: Token::Special(Special::Punctuation('.')),
3499 },
3500 PositionalToken {
3501 source: uws,
3502 offset: 54,
3503 length: 2,
3504 token: Token::Word(Word::Word("С".to_string())),
3505 },
3506 PositionalToken {
3507 source: uws,
3508 offset: 56,
3509 length: 1,
3510 token: Token::Special(Special::Punctuation('.')),
3511 },
3512 PositionalToken {
3513 source: uws,
3514 offset: 57,
3515 length: 2,
3516 token: Token::Word(Word::Word("Р".to_string())),
3517 },
3518 PositionalToken {
3519 source: uws,
3520 offset: 59,
3521 length: 1,
3522 token: Token::Special(Special::Punctuation('.')),
3523 },
3524 PositionalToken {
3525 source: uws,
3526 offset: 60,
3527 length: 1,
3528 token: Token::Special(Special::Separator(Separator::Newline)),
3529 },
3530 PositionalToken {
3531 source: uws,
3532 offset: 61,
3533 length: 25,
3534 token: Token::Word(Word::Emoji("family_man_woman_boy_boy")),
3535 },
3536 PositionalToken {
3537 source: uws,
3538 offset: 86,
3539 length: 1,
3540 token: Token::Special(Special::Separator(Separator::Newline)),
3541 },
3542 PositionalToken {
3543 source: uws,
3544 offset: 87,
3545 length: 4,
3546 token: Token::Word(Word::Emoji("brain")),
3547 },
3548 PositionalToken {
3549 source: uws,
3550 offset: 91,
3551 length: 1,
3552 token: Token::Special(Special::Separator(Separator::Newline)),
3553 },
3554 ];
3555
3556 let lib_res = uws
3557 .into_tokenizer(TokenizerParams::v1())
3558 .collect::<Vec<_>>();
3559 check_results(&result, &lib_res, uws);
3560 }
3562
3563 #[test]
3564 fn emoji_and_rusabbr_no_split() {
3565 let uws = "🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n👱\nС.С.С.Р.\n👨👩👦👦\n🧠\n";
3566 let result = vec![
3567 PositionalToken {
3568 source: uws,
3569 offset: 0,
3570 length: 8,
3571 token: Token::Word(Word::Emoji("russia")),
3572 },
3573 PositionalToken {
3574 source: uws,
3575 offset: 8,
3576 length: 1,
3577 token: Token::Special(Special::Separator(Separator::Space)),
3578 },
3579 PositionalToken {
3580 source: uws,
3581 offset: 9,
3582 length: 8,
3583 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
3584 },
3585 PositionalToken {
3586 source: uws,
3587 offset: 17,
3588 length: 1,
3589 token: Token::Special(Special::Separator(Separator::Newline)),
3590 },
3591 PositionalToken {
3592 source: uws,
3593 offset: 18,
3594 length: 8,
3595 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
3596 },
3597 PositionalToken {
3598 source: uws,
3599 offset: 26,
3600 length: 8,
3601 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
3602 },
3603 PositionalToken {
3604 source: uws,
3605 offset: 34,
3606 length: 8,
3607 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
3608 },
3609 PositionalToken {
3610 source: uws,
3611 offset: 42,
3612 length: 1,
3613 token: Token::Special(Special::Separator(Separator::Newline)),
3614 },
3615 PositionalToken {
3616 source: uws,
3617 offset: 43,
3618 length: 4,
3619 token: Token::Word(Word::Emoji("blond_haired_person")),
3620 },
3621 PositionalToken {
3622 source: uws,
3623 offset: 47,
3624 length: 1,
3625 token: Token::Special(Special::Separator(Separator::Newline)),
3626 },
3627 PositionalToken {
3628 source: uws,
3629 offset: 48,
3630 length: 11,
3631 token: Token::Word(Word::Word("С.С.С.Р".to_string())),
3632 },
3633 PositionalToken {
3634 source: uws,
3635 offset: 59,
3636 length: 1,
3637 token: Token::Special(Special::Punctuation('.')),
3638 },
3639 PositionalToken {
3640 source: uws,
3641 offset: 60,
3642 length: 1,
3643 token: Token::Special(Special::Separator(Separator::Newline)),
3644 },
3645 PositionalToken {
3646 source: uws,
3647 offset: 61,
3648 length: 25,
3649 token: Token::Word(Word::Emoji("family_man_woman_boy_boy")),
3650 },
3651 PositionalToken {
3652 source: uws,
3653 offset: 86,
3654 length: 1,
3655 token: Token::Special(Special::Separator(Separator::Newline)),
3656 },
3657 PositionalToken {
3658 source: uws,
3659 offset: 87,
3660 length: 4,
3661 token: Token::Word(Word::Emoji("brain")),
3662 },
3663 PositionalToken {
3664 source: uws,
3665 offset: 91,
3666 length: 1,
3667 token: Token::Special(Special::Separator(Separator::Newline)),
3668 },
3669 ];
3670
3671 let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
3672 check_results(&result, &lib_res, uws);
3673 }
3675
3676 #[test]
3900 fn html() {
3901 let uws = "<div class=\"article article_view \" id=\"article_view_-113039156_9551\" data-article-url=\"/@chaibuket-o-chem-ne-zabyt-25-noyabrya\" data-audio-context=\"article:-113039156_9551\"><h1 class=\"article_decoration_first article_decoration_last\" >День Мамы </h1><p class=\"article_decoration_first article_decoration_last\" >День, когда поздравляют мам, бабушек, сестер и жён — это всемирный праздник, называемый «День Мамы». В настоящее время его отмечают почти в каждой стране, просто везде разные даты и способы празднования. </p><h3 class=\"article_decoration_first article_decoration_last\" ><span class='article_anchor_title'>\n <span class='article_anchor_button' id='pochemu-my-ego-prazdnuem'></span>\n <span class='article_anchor_fsymbol'>П</span>\n</span>ПОЧЕМУ МЫ ЕГО ПРАЗДНУЕМ</h3><p class=\"article_decoration_first article_decoration_last article_decoration_before\" >В 1987 году комитет госдумы по делам женщин, семьи и молодежи выступил с предложением учредить «День мамы», а сам приказ был подписан уже 30 января 1988 года Борисом Ельциным. Было решено, что ежегодно в России празднество дня мамы будет выпадать на последнее воскресенье ноября. </p><figure data-type=\"101\" data-mode=\"\" class=\"article_decoration_first article_decoration_last\" >\n <div class=\"article_figure_content\" style=\"width: 1125px\">\n <div class=\"article_figure_sizer_content\"><div class=\"article_object_sizer_wrap\" data-sizes=\"[{"s":["https://pp.userapi.com/c849128/v849128704/c0ffd/pcNJaBH3NDo.jpg",75,50],"m":["https://pp.userapi.com/c849128/v849128704/c0ffe/ozCLs2kHtRY.jpg",130,87],"x":["https://pp.userapi.com/c849128/v849128704/c0fff/E4KtTNDydzE.jpg",604,403],"y":["https://pp.userapi.com/c849128/v849128704/c1000/1nLxpYKavzU.jpg",807,538],"z":["https://pp.userapi.com/c849128/v849128704/c1001/IgEODe90yEk.jpg",1125,750],"o":["https://pp.userapi.com/c849128/v849128704/c1002/01faNwVZ2_E.jpg",130,87],"p":["https://pp.userapi.com/c849128/v849128704/c1003/baDFzbdRP2s.jpg",200,133],"q":["https://pp.userapi.com/c849128/v849128704/c1004/CY4khI6KJKA.jpg",320,213],"r":["https://pp.userapi.com/c849128/v849128704/c1005/NOvAJ6-VltY.jpg",510,340]}]\">\n <img class=\"article_object_sizer_inner article_object_photo__image_blur\" src=\"https://pp.userapi.com/c849128/v849128704/c0ffd/pcNJaBH3NDo.jpg\" data-baseurl=\"\"/>\n \n</div></div>\n <div class=\"article_figure_sizer\" style=\"padding-bottom: 66.666666666667%\"></div>";
3902 let result = vec![
3903 PositionalToken {
3904 source: uws,
3905 offset: 236,
3906 length: 8,
3907 token: Token::Word(Word::Word("День".to_string())),
3908 },
3909 PositionalToken {
3910 source: uws,
3911 offset: 244,
3912 length: 1,
3913 token: Token::Special(Special::Separator(Separator::Space)),
3914 },
3915 PositionalToken {
3916 source: uws,
3917 offset: 245,
3918 length: 8,
3919 token: Token::Word(Word::Word("Мамы".to_string())),
3920 },
3921 PositionalToken {
3922 source: uws,
3923 offset: 253,
3924 length: 1,
3925 token: Token::Special(Special::Separator(Separator::Space)),
3926 },
3927 PositionalToken {
3928 source: uws,
3929 offset: 321,
3930 length: 8,
3931 token: Token::Word(Word::Word("День".to_string())),
3932 },
3933 PositionalToken {
3934 source: uws,
3935 offset: 329,
3936 length: 1,
3937 token: Token::Special(Special::Punctuation(',')),
3938 },
3939 PositionalToken {
3940 source: uws,
3941 offset: 330,
3942 length: 1,
3943 token: Token::Special(Special::Separator(Separator::Space)),
3944 },
3945 PositionalToken {
3946 source: uws,
3947 offset: 331,
3948 length: 10,
3949 token: Token::Word(Word::Word("когда".to_string())),
3950 },
3951 PositionalToken {
3952 source: uws,
3953 offset: 341,
3954 length: 1,
3955 token: Token::Special(Special::Separator(Separator::Space)),
3956 },
3957 PositionalToken {
3958 source: uws,
3959 offset: 342,
3960 length: 22,
3961 token: Token::Word(Word::Word("поздравляют".to_string())),
3962 },
3963 PositionalToken {
3964 source: uws,
3965 offset: 364,
3966 length: 1,
3967 token: Token::Special(Special::Separator(Separator::Space)),
3968 },
3969 PositionalToken {
3970 source: uws,
3971 offset: 365,
3972 length: 6,
3973 token: Token::Word(Word::Word("мам".to_string())),
3974 },
3975 PositionalToken {
3976 source: uws,
3977 offset: 371,
3978 length: 1,
3979 token: Token::Special(Special::Punctuation(',')),
3980 },
3981 PositionalToken {
3982 source: uws,
3983 offset: 372,
3984 length: 1,
3985 token: Token::Special(Special::Separator(Separator::Space)),
3986 },
3987 PositionalToken {
3988 source: uws,
3989 offset: 373,
3990 length: 14,
3991 token: Token::Word(Word::Word("бабушек".to_string())),
3992 },
3993 PositionalToken {
3994 source: uws,
3995 offset: 387,
3996 length: 1,
3997 token: Token::Special(Special::Punctuation(',')),
3998 },
3999 PositionalToken {
4000 source: uws,
4001 offset: 388,
4002 length: 1,
4003 token: Token::Special(Special::Separator(Separator::Space)),
4004 },
4005 PositionalToken {
4006 source: uws,
4007 offset: 389,
4008 length: 12,
4009 token: Token::Word(Word::Word("сестер".to_string())),
4010 },
4011 PositionalToken {
4012 source: uws,
4013 offset: 401,
4014 length: 1,
4015 token: Token::Special(Special::Separator(Separator::Space)),
4016 },
4017 PositionalToken {
4018 source: uws,
4019 offset: 402,
4020 length: 2,
4021 token: Token::Word(Word::Word("и".to_string())),
4022 },
4023 PositionalToken {
4024 source: uws,
4025 offset: 404,
4026 length: 1,
4027 token: Token::Special(Special::Separator(Separator::Space)),
4028 },
4029 PositionalToken {
4030 source: uws,
4031 offset: 405,
4032 length: 6,
4033 token: Token::Word(Word::Word("жён".to_string())),
4034 },
4035 PositionalToken {
4036 source: uws,
4037 offset: 411,
4038 length: 1,
4039 token: Token::Special(Special::Separator(Separator::Space)),
4040 },
4041 PositionalToken {
4042 source: uws,
4043 offset: 412,
4044 length: 3,
4045 token: Token::Special(Special::Punctuation('—')),
4046 },
4047 PositionalToken {
4048 source: uws,
4049 offset: 415,
4050 length: 1,
4051 token: Token::Special(Special::Separator(Separator::Space)),
4052 },
4053 PositionalToken {
4054 source: uws,
4055 offset: 416,
4056 length: 6,
4057 token: Token::Word(Word::Word("это".to_string())),
4058 },
4059 PositionalToken {
4060 source: uws,
4061 offset: 422,
4062 length: 1,
4063 token: Token::Special(Special::Separator(Separator::Space)),
4064 },
4065 PositionalToken {
4066 source: uws,
4067 offset: 423,
4068 length: 18,
4069 token: Token::Word(Word::Word("всемирный".to_string())),
4070 },
4071 PositionalToken {
4072 source: uws,
4073 offset: 441,
4074 length: 1,
4075 token: Token::Special(Special::Separator(Separator::Space)),
4076 },
4077 PositionalToken {
4078 source: uws,
4079 offset: 442,
4080 length: 16,
4081 token: Token::Word(Word::Word("праздник".to_string())),
4082 },
4083 PositionalToken {
4084 source: uws,
4085 offset: 458,
4086 length: 1,
4087 token: Token::Special(Special::Punctuation(',')),
4088 },
4089 PositionalToken {
4090 source: uws,
4091 offset: 459,
4092 length: 1,
4093 token: Token::Special(Special::Separator(Separator::Space)),
4094 },
4095 PositionalToken {
4096 source: uws,
4097 offset: 460,
4098 length: 20,
4099 token: Token::Word(Word::Word("называемый".to_string())),
4100 },
4101 PositionalToken {
4102 source: uws,
4103 offset: 480,
4104 length: 1,
4105 token: Token::Special(Special::Separator(Separator::Space)),
4106 },
4107 PositionalToken {
4108 source: uws,
4109 offset: 481,
4110 length: 2,
4111 token: Token::Special(Special::Punctuation('«')),
4112 },
4113 PositionalToken {
4114 source: uws,
4115 offset: 483,
4116 length: 8,
4117 token: Token::Word(Word::Word("День".to_string())),
4118 },
4119 PositionalToken {
4120 source: uws,
4121 offset: 491,
4122 length: 1,
4123 token: Token::Special(Special::Separator(Separator::Space)),
4124 },
4125 PositionalToken {
4126 source: uws,
4127 offset: 492,
4128 length: 8,
4129 token: Token::Word(Word::Word("Мамы".to_string())),
4130 },
4131 PositionalToken {
4132 source: uws,
4133 offset: 500,
4134 length: 2,
4135 token: Token::Special(Special::Punctuation('»')),
4136 },
4137 PositionalToken {
4138 source: uws,
4139 offset: 502,
4140 length: 1,
4141 token: Token::Special(Special::Punctuation('.')),
4142 },
4143 PositionalToken {
4144 source: uws,
4145 offset: 503,
4146 length: 1,
4147 token: Token::Special(Special::Separator(Separator::Space)),
4148 },
4149 PositionalToken {
4150 source: uws,
4151 offset: 504,
4152 length: 2,
4153 token: Token::Word(Word::Word("В".to_string())),
4154 },
4155 PositionalToken {
4156 source: uws,
4157 offset: 506,
4158 length: 1,
4159 token: Token::Special(Special::Separator(Separator::Space)),
4160 },
4161 PositionalToken {
4162 source: uws,
4163 offset: 507,
4164 length: 18,
4165 token: Token::Word(Word::Word("настоящее".to_string())),
4166 },
4167 PositionalToken {
4168 source: uws,
4169 offset: 525,
4170 length: 1,
4171 token: Token::Special(Special::Separator(Separator::Space)),
4172 },
4173 PositionalToken {
4174 source: uws,
4175 offset: 526,
4176 length: 10,
4177 token: Token::Word(Word::Word("время".to_string())),
4178 },
4179 PositionalToken {
4180 source: uws,
4181 offset: 536,
4182 length: 1,
4183 token: Token::Special(Special::Separator(Separator::Space)),
4184 },
4185 PositionalToken {
4186 source: uws,
4187 offset: 537,
4188 length: 6,
4189 token: Token::Word(Word::Word("его".to_string())),
4190 },
4191 PositionalToken {
4192 source: uws,
4193 offset: 543,
4194 length: 1,
4195 token: Token::Special(Special::Separator(Separator::Space)),
4196 },
4197 PositionalToken {
4198 source: uws,
4199 offset: 544,
4200 length: 16,
4201 token: Token::Word(Word::Word("отмечают".to_string())),
4202 },
4203 PositionalToken {
4204 source: uws,
4205 offset: 560,
4206 length: 1,
4207 token: Token::Special(Special::Separator(Separator::Space)),
4208 },
4209 PositionalToken {
4210 source: uws,
4211 offset: 561,
4212 length: 10,
4213 token: Token::Word(Word::Word("почти".to_string())),
4214 },
4215 PositionalToken {
4216 source: uws,
4217 offset: 571,
4218 length: 1,
4219 token: Token::Special(Special::Separator(Separator::Space)),
4220 },
4221 PositionalToken {
4222 source: uws,
4223 offset: 572,
4224 length: 2,
4225 token: Token::Word(Word::Word("в".to_string())),
4226 },
4227 PositionalToken {
4228 source: uws,
4229 offset: 574,
4230 length: 1,
4231 token: Token::Special(Special::Separator(Separator::Space)),
4232 },
4233 PositionalToken {
4234 source: uws,
4235 offset: 575,
4236 length: 12,
4237 token: Token::Word(Word::Word("каждой".to_string())),
4238 },
4239 PositionalToken {
4240 source: uws,
4241 offset: 587,
4242 length: 1,
4243 token: Token::Special(Special::Separator(Separator::Space)),
4244 },
4245 PositionalToken {
4246 source: uws,
4247 offset: 588,
4248 length: 12,
4249 token: Token::Word(Word::Word("стране".to_string())),
4250 },
4251 PositionalToken {
4252 source: uws,
4253 offset: 600,
4254 length: 1,
4255 token: Token::Special(Special::Punctuation(',')),
4256 },
4257 PositionalToken {
4258 source: uws,
4259 offset: 601,
4260 length: 1,
4261 token: Token::Special(Special::Separator(Separator::Space)),
4262 },
4263 PositionalToken {
4264 source: uws,
4265 offset: 602,
4266 length: 12,
4267 token: Token::Word(Word::Word("просто".to_string())),
4268 },
4269 PositionalToken {
4270 source: uws,
4271 offset: 614,
4272 length: 1,
4273 token: Token::Special(Special::Separator(Separator::Space)),
4274 },
4275 PositionalToken {
4276 source: uws,
4277 offset: 615,
4278 length: 10,
4279 token: Token::Word(Word::Word("везде".to_string())),
4280 },
4281 PositionalToken {
4282 source: uws,
4283 offset: 625,
4284 length: 1,
4285 token: Token::Special(Special::Separator(Separator::Space)),
4286 },
4287 PositionalToken {
4288 source: uws,
4289 offset: 626,
4290 length: 12,
4291 token: Token::Word(Word::Word("разные".to_string())),
4292 },
4293 PositionalToken {
4294 source: uws,
4295 offset: 638,
4296 length: 1,
4297 token: Token::Special(Special::Separator(Separator::Space)),
4298 },
4299 PositionalToken {
4300 source: uws,
4301 offset: 639,
4302 length: 8,
4303 token: Token::Word(Word::Word("даты".to_string())),
4304 },
4305 PositionalToken {
4306 source: uws,
4307 offset: 647,
4308 length: 1,
4309 token: Token::Special(Special::Separator(Separator::Space)),
4310 },
4311 PositionalToken {
4312 source: uws,
4313 offset: 648,
4314 length: 2,
4315 token: Token::Word(Word::Word("и".to_string())),
4316 },
4317 PositionalToken {
4318 source: uws,
4319 offset: 650,
4320 length: 1,
4321 token: Token::Special(Special::Separator(Separator::Space)),
4322 },
4323 PositionalToken {
4324 source: uws,
4325 offset: 651,
4326 length: 14,
4327 token: Token::Word(Word::Word("способы".to_string())),
4328 },
4329 PositionalToken {
4330 source: uws,
4331 offset: 665,
4332 length: 1,
4333 token: Token::Special(Special::Separator(Separator::Space)),
4334 },
4335 PositionalToken {
4336 source: uws,
4337 offset: 666,
4338 length: 24,
4339 token: Token::Word(Word::Word("празднования".to_string())),
4340 },
4341 PositionalToken {
4342 source: uws,
4343 offset: 690,
4344 length: 1,
4345 token: Token::Special(Special::Punctuation('.')),
4346 },
4347 PositionalToken {
4348 source: uws,
4349 offset: 691,
4350 length: 1,
4351 token: Token::Special(Special::Separator(Separator::Space)),
4352 },
4353 PositionalToken {
4354 source: uws,
4355 offset: 794,
4356 length: 1,
4357 token: Token::Special(Special::Separator(Separator::Newline)),
4358 },
4359 PositionalToken {
4360 source: uws,
4361 offset: 795,
4362 length: 2,
4363 token: Token::Special(Special::Separator(Separator::Space)),
4364 },
4365 PositionalToken {
4366 source: uws,
4367 offset: 870,
4368 length: 1,
4369 token: Token::Special(Special::Separator(Separator::Newline)),
4370 },
4371 PositionalToken {
4372 source: uws,
4373 offset: 871,
4374 length: 2,
4375 token: Token::Special(Special::Separator(Separator::Space)),
4376 },
4377 PositionalToken {
4378 source: uws,
4379 offset: 910,
4380 length: 2,
4381 token: Token::Word(Word::Word("П".to_string())),
4382 },
4383 PositionalToken {
4384 source: uws,
4385 offset: 919,
4386 length: 1,
4387 token: Token::Special(Special::Separator(Separator::Newline)),
4388 },
4389 PositionalToken {
4390 source: uws,
4391 offset: 927,
4392 length: 12,
4393 token: Token::Word(Word::Word("ПОЧЕМУ".to_string())),
4394 },
4395 PositionalToken {
4396 source: uws,
4397 offset: 939,
4398 length: 1,
4399 token: Token::Special(Special::Separator(Separator::Space)),
4400 },
4401 PositionalToken {
4402 source: uws,
4403 offset: 940,
4404 length: 4,
4405 token: Token::Word(Word::Word("МЫ".to_string())),
4406 },
4407 PositionalToken {
4408 source: uws,
4409 offset: 944,
4410 length: 1,
4411 token: Token::Special(Special::Separator(Separator::Space)),
4412 },
4413 PositionalToken {
4414 source: uws,
4415 offset: 945,
4416 length: 6,
4417 token: Token::Word(Word::Word("ЕГО".to_string())),
4418 },
4419 PositionalToken {
4420 source: uws,
4421 offset: 951,
4422 length: 1,
4423 token: Token::Special(Special::Separator(Separator::Space)),
4424 },
4425 PositionalToken {
4426 source: uws,
4427 offset: 952,
4428 length: 18,
4429 token: Token::Word(Word::Word("ПРАЗДНУЕМ".to_string())),
4430 },
4431 PositionalToken {
4432 source: uws,
4433 offset: 1063,
4434 length: 2,
4435 token: Token::Word(Word::Word("В".to_string())),
4436 },
4437 PositionalToken {
4438 source: uws,
4439 offset: 1065,
4440 length: 1,
4441 token: Token::Special(Special::Separator(Separator::Space)),
4442 },
4443 PositionalToken {
4444 source: uws,
4445 offset: 1066,
4446 length: 4,
4447 token: Token::Word(Word::Number(Number::Integer(1987))),
4448 },
4449 PositionalToken {
4450 source: uws,
4451 offset: 1070,
4452 length: 1,
4453 token: Token::Special(Special::Separator(Separator::Space)),
4454 },
4455 PositionalToken {
4456 source: uws,
4457 offset: 1071,
4458 length: 8,
4459 token: Token::Word(Word::Word("году".to_string())),
4460 },
4461 PositionalToken {
4462 source: uws,
4463 offset: 1079,
4464 length: 1,
4465 token: Token::Special(Special::Separator(Separator::Space)),
4466 },
4467 PositionalToken {
4468 source: uws,
4469 offset: 1080,
4470 length: 14,
4471 token: Token::Word(Word::Word("комитет".to_string())),
4472 },
4473 PositionalToken {
4474 source: uws,
4475 offset: 1094,
4476 length: 1,
4477 token: Token::Special(Special::Separator(Separator::Space)),
4478 },
4479 PositionalToken {
4480 source: uws,
4481 offset: 1095,
4482 length: 14,
4483 token: Token::Word(Word::Word("госдумы".to_string())),
4484 },
4485 PositionalToken {
4486 source: uws,
4487 offset: 1109,
4488 length: 1,
4489 token: Token::Special(Special::Separator(Separator::Space)),
4490 },
4491 PositionalToken {
4492 source: uws,
4493 offset: 1110,
4494 length: 4,
4495 token: Token::Word(Word::Word("по".to_string())),
4496 },
4497 PositionalToken {
4498 source: uws,
4499 offset: 1114,
4500 length: 1,
4501 token: Token::Special(Special::Separator(Separator::Space)),
4502 },
4503 PositionalToken {
4504 source: uws,
4505 offset: 1115,
4506 length: 10,
4507 token: Token::Word(Word::Word("делам".to_string())),
4508 },
4509 PositionalToken {
4510 source: uws,
4511 offset: 1125,
4512 length: 1,
4513 token: Token::Special(Special::Separator(Separator::Space)),
4514 },
4515 PositionalToken {
4516 source: uws,
4517 offset: 1126,
4518 length: 12,
4519 token: Token::Word(Word::Word("женщин".to_string())),
4520 },
4521 PositionalToken {
4522 source: uws,
4523 offset: 1138,
4524 length: 1,
4525 token: Token::Special(Special::Punctuation(',')),
4526 },
4527 PositionalToken {
4528 source: uws,
4529 offset: 1139,
4530 length: 1,
4531 token: Token::Special(Special::Separator(Separator::Space)),
4532 },
4533 PositionalToken {
4534 source: uws,
4535 offset: 1140,
4536 length: 10,
4537 token: Token::Word(Word::Word("семьи".to_string())),
4538 },
4539 PositionalToken {
4540 source: uws,
4541 offset: 1150,
4542 length: 1,
4543 token: Token::Special(Special::Separator(Separator::Space)),
4544 },
4545 PositionalToken {
4546 source: uws,
4547 offset: 1151,
4548 length: 2,
4549 token: Token::Word(Word::Word("и".to_string())),
4550 },
4551 PositionalToken {
4552 source: uws,
4553 offset: 1153,
4554 length: 1,
4555 token: Token::Special(Special::Separator(Separator::Space)),
4556 },
4557 PositionalToken {
4558 source: uws,
4559 offset: 1154,
4560 length: 16,
4561 token: Token::Word(Word::Word("молодежи".to_string())),
4562 },
4563 PositionalToken {
4564 source: uws,
4565 offset: 1170,
4566 length: 1,
4567 token: Token::Special(Special::Separator(Separator::Space)),
4568 },
4569 PositionalToken {
4570 source: uws,
4571 offset: 1171,
4572 length: 16,
4573 token: Token::Word(Word::Word("выступил".to_string())),
4574 },
4575 PositionalToken {
4576 source: uws,
4577 offset: 1187,
4578 length: 1,
4579 token: Token::Special(Special::Separator(Separator::Space)),
4580 },
4581 PositionalToken {
4582 source: uws,
4583 offset: 1188,
4584 length: 2,
4585 token: Token::Word(Word::Word("с".to_string())),
4586 },
4587 PositionalToken {
4588 source: uws,
4589 offset: 1190,
4590 length: 1,
4591 token: Token::Special(Special::Separator(Separator::Space)),
4592 },
4593 PositionalToken {
4594 source: uws,
4595 offset: 1191,
4596 length: 24,
4597 token: Token::Word(Word::Word("предложением".to_string())),
4598 },
4599 PositionalToken {
4600 source: uws,
4601 offset: 1215,
4602 length: 1,
4603 token: Token::Special(Special::Separator(Separator::Space)),
4604 },
4605 PositionalToken {
4606 source: uws,
4607 offset: 1216,
4608 length: 16,
4609 token: Token::Word(Word::Word("учредить".to_string())),
4610 },
4611 PositionalToken {
4612 source: uws,
4613 offset: 1232,
4614 length: 1,
4615 token: Token::Special(Special::Separator(Separator::Space)),
4616 },
4617 PositionalToken {
4618 source: uws,
4619 offset: 1233,
4620 length: 2,
4621 token: Token::Special(Special::Punctuation('«')),
4622 },
4623 PositionalToken {
4624 source: uws,
4625 offset: 1235,
4626 length: 8,
4627 token: Token::Word(Word::Word("День".to_string())),
4628 },
4629 PositionalToken {
4630 source: uws,
4631 offset: 1243,
4632 length: 1,
4633 token: Token::Special(Special::Separator(Separator::Space)),
4634 },
4635 PositionalToken {
4636 source: uws,
4637 offset: 1244,
4638 length: 8,
4639 token: Token::Word(Word::Word("мамы".to_string())),
4640 },
4641 PositionalToken {
4642 source: uws,
4643 offset: 1252,
4644 length: 2,
4645 token: Token::Special(Special::Punctuation('»')),
4646 },
4647 PositionalToken {
4648 source: uws,
4649 offset: 1254,
4650 length: 1,
4651 token: Token::Special(Special::Punctuation(',')),
4652 },
4653 PositionalToken {
4654 source: uws,
4655 offset: 1255,
4656 length: 1,
4657 token: Token::Special(Special::Separator(Separator::Space)),
4658 },
4659 PositionalToken {
4660 source: uws,
4661 offset: 1256,
4662 length: 2,
4663 token: Token::Word(Word::Word("а".to_string())),
4664 },
4665 PositionalToken {
4666 source: uws,
4667 offset: 1258,
4668 length: 1,
4669 token: Token::Special(Special::Separator(Separator::Space)),
4670 },
4671 PositionalToken {
4672 source: uws,
4673 offset: 1259,
4674 length: 6,
4675 token: Token::Word(Word::Word("сам".to_string())),
4676 },
4677 PositionalToken {
4678 source: uws,
4679 offset: 1265,
4680 length: 1,
4681 token: Token::Special(Special::Separator(Separator::Space)),
4682 },
4683 PositionalToken {
4684 source: uws,
4685 offset: 1266,
4686 length: 12,
4687 token: Token::Word(Word::Word("приказ".to_string())),
4688 },
4689 PositionalToken {
4690 source: uws,
4691 offset: 1278,
4692 length: 1,
4693 token: Token::Special(Special::Separator(Separator::Space)),
4694 },
4695 PositionalToken {
4696 source: uws,
4697 offset: 1279,
4698 length: 6,
4699 token: Token::Word(Word::Word("был".to_string())),
4700 },
4701 PositionalToken {
4702 source: uws,
4703 offset: 1285,
4704 length: 1,
4705 token: Token::Special(Special::Separator(Separator::Space)),
4706 },
4707 PositionalToken {
4708 source: uws,
4709 offset: 1286,
4710 length: 16,
4711 token: Token::Word(Word::Word("подписан".to_string())),
4712 },
4713 PositionalToken {
4714 source: uws,
4715 offset: 1302,
4716 length: 1,
4717 token: Token::Special(Special::Separator(Separator::Space)),
4718 },
4719 PositionalToken {
4720 source: uws,
4721 offset: 1303,
4722 length: 6,
4723 token: Token::Word(Word::Word("уже".to_string())),
4724 },
4725 PositionalToken {
4726 source: uws,
4727 offset: 1309,
4728 length: 1,
4729 token: Token::Special(Special::Separator(Separator::Space)),
4730 },
4731 PositionalToken {
4732 source: uws,
4733 offset: 1310,
4734 length: 2,
4735 token: Token::Word(Word::Number(Number::Integer(30))),
4736 },
4737 PositionalToken {
4738 source: uws,
4739 offset: 1312,
4740 length: 1,
4741 token: Token::Special(Special::Separator(Separator::Space)),
4742 },
4743 PositionalToken {
4744 source: uws,
4745 offset: 1313,
4746 length: 12,
4747 token: Token::Word(Word::Word("января".to_string())),
4748 },
4749 PositionalToken {
4750 source: uws,
4751 offset: 1325,
4752 length: 1,
4753 token: Token::Special(Special::Separator(Separator::Space)),
4754 },
4755 PositionalToken {
4756 source: uws,
4757 offset: 1326,
4758 length: 4,
4759 token: Token::Word(Word::Number(Number::Integer(1988))),
4760 },
4761 PositionalToken {
4762 source: uws,
4763 offset: 1330,
4764 length: 1,
4765 token: Token::Special(Special::Separator(Separator::Space)),
4766 },
4767 PositionalToken {
4768 source: uws,
4769 offset: 1331,
4770 length: 8,
4771 token: Token::Word(Word::Word("года".to_string())),
4772 },
4773 PositionalToken {
4774 source: uws,
4775 offset: 1339,
4776 length: 1,
4777 token: Token::Special(Special::Separator(Separator::Space)),
4778 },
4779 PositionalToken {
4780 source: uws,
4781 offset: 1340,
4782 length: 14,
4783 token: Token::Word(Word::Word("Борисом".to_string())),
4784 },
4785 PositionalToken {
4786 source: uws,
4787 offset: 1354,
4788 length: 1,
4789 token: Token::Special(Special::Separator(Separator::Space)),
4790 },
4791 PositionalToken {
4792 source: uws,
4793 offset: 1355,
4794 length: 16,
4795 token: Token::Word(Word::Word("Ельциным".to_string())),
4796 },
4797 PositionalToken {
4798 source: uws,
4799 offset: 1371,
4800 length: 1,
4801 token: Token::Special(Special::Punctuation('.')),
4802 },
4803 PositionalToken {
4804 source: uws,
4805 offset: 1372,
4806 length: 1,
4807 token: Token::Special(Special::Separator(Separator::Space)),
4808 },
4809 PositionalToken {
4810 source: uws,
4811 offset: 1373,
4812 length: 8,
4813 token: Token::Word(Word::Word("Было".to_string())),
4814 },
4815 PositionalToken {
4816 source: uws,
4817 offset: 1381,
4818 length: 1,
4819 token: Token::Special(Special::Separator(Separator::Space)),
4820 },
4821 PositionalToken {
4822 source: uws,
4823 offset: 1382,
4824 length: 12,
4825 token: Token::Word(Word::Word("решено".to_string())),
4826 },
4827 PositionalToken {
4828 source: uws,
4829 offset: 1394,
4830 length: 1,
4831 token: Token::Special(Special::Punctuation(',')),
4832 },
4833 PositionalToken {
4834 source: uws,
4835 offset: 1395,
4836 length: 1,
4837 token: Token::Special(Special::Separator(Separator::Space)),
4838 },
4839 PositionalToken {
4840 source: uws,
4841 offset: 1396,
4842 length: 6,
4843 token: Token::Word(Word::Word("что".to_string())),
4844 },
4845 PositionalToken {
4846 source: uws,
4847 offset: 1402,
4848 length: 1,
4849 token: Token::Special(Special::Separator(Separator::Space)),
4850 },
4851 PositionalToken {
4852 source: uws,
4853 offset: 1403,
4854 length: 16,
4855 token: Token::Word(Word::Word("ежегодно".to_string())),
4856 },
4857 PositionalToken {
4858 source: uws,
4859 offset: 1419,
4860 length: 1,
4861 token: Token::Special(Special::Separator(Separator::Space)),
4862 },
4863 PositionalToken {
4864 source: uws,
4865 offset: 1420,
4866 length: 2,
4867 token: Token::Word(Word::Word("в".to_string())),
4868 },
4869 PositionalToken {
4870 source: uws,
4871 offset: 1422,
4872 length: 1,
4873 token: Token::Special(Special::Separator(Separator::Space)),
4874 },
4875 PositionalToken {
4876 source: uws,
4877 offset: 1423,
4878 length: 12,
4879 token: Token::Word(Word::Word("России".to_string())),
4880 },
4881 PositionalToken {
4882 source: uws,
4883 offset: 1435,
4884 length: 1,
4885 token: Token::Special(Special::Separator(Separator::Space)),
4886 },
4887 PositionalToken {
4888 source: uws,
4889 offset: 1436,
4890 length: 22,
4891 token: Token::Word(Word::Word("празднество".to_string())),
4892 },
4893 PositionalToken {
4894 source: uws,
4895 offset: 1458,
4896 length: 1,
4897 token: Token::Special(Special::Separator(Separator::Space)),
4898 },
4899 PositionalToken {
4900 source: uws,
4901 offset: 1459,
4902 length: 6,
4903 token: Token::Word(Word::Word("дня".to_string())),
4904 },
4905 PositionalToken {
4906 source: uws,
4907 offset: 1465,
4908 length: 1,
4909 token: Token::Special(Special::Separator(Separator::Space)),
4910 },
4911 PositionalToken {
4912 source: uws,
4913 offset: 1466,
4914 length: 8,
4915 token: Token::Word(Word::Word("мамы".to_string())),
4916 },
4917 PositionalToken {
4918 source: uws,
4919 offset: 1474,
4920 length: 1,
4921 token: Token::Special(Special::Separator(Separator::Space)),
4922 },
4923 PositionalToken {
4924 source: uws,
4925 offset: 1475,
4926 length: 10,
4927 token: Token::Word(Word::Word("будет".to_string())),
4928 },
4929 PositionalToken {
4930 source: uws,
4931 offset: 1485,
4932 length: 1,
4933 token: Token::Special(Special::Separator(Separator::Space)),
4934 },
4935 PositionalToken {
4936 source: uws,
4937 offset: 1486,
4938 length: 16,
4939 token: Token::Word(Word::Word("выпадать".to_string())),
4940 },
4941 PositionalToken {
4942 source: uws,
4943 offset: 1502,
4944 length: 1,
4945 token: Token::Special(Special::Separator(Separator::Space)),
4946 },
4947 PositionalToken {
4948 source: uws,
4949 offset: 1503,
4950 length: 4,
4951 token: Token::Word(Word::Word("на".to_string())),
4952 },
4953 PositionalToken {
4954 source: uws,
4955 offset: 1507,
4956 length: 1,
4957 token: Token::Special(Special::Separator(Separator::Space)),
4958 },
4959 PositionalToken {
4960 source: uws,
4961 offset: 1508,
4962 length: 18,
4963 token: Token::Word(Word::Word("последнее".to_string())),
4964 },
4965 PositionalToken {
4966 source: uws,
4967 offset: 1526,
4968 length: 1,
4969 token: Token::Special(Special::Separator(Separator::Space)),
4970 },
4971 PositionalToken {
4972 source: uws,
4973 offset: 1527,
4974 length: 22,
4975 token: Token::Word(Word::Word("воскресенье".to_string())),
4976 },
4977 PositionalToken {
4978 source: uws,
4979 offset: 1549,
4980 length: 1,
4981 token: Token::Special(Special::Separator(Separator::Space)),
4982 },
4983 PositionalToken {
4984 source: uws,
4985 offset: 1550,
4986 length: 12,
4987 token: Token::Word(Word::Word("ноября".to_string())),
4988 },
4989 PositionalToken {
4990 source: uws,
4991 offset: 1562,
4992 length: 1,
4993 token: Token::Special(Special::Punctuation('.')),
4994 },
4995 PositionalToken {
4996 source: uws,
4997 offset: 1563,
4998 length: 1,
4999 token: Token::Special(Special::Separator(Separator::Space)),
5000 },
5001 PositionalToken {
5002 source: uws,
5003 offset: 1664,
5004 length: 1,
5005 token: Token::Special(Special::Separator(Separator::Newline)),
5006 },
5007 PositionalToken {
5008 source: uws,
5009 offset: 1665,
5010 length: 2,
5011 token: Token::Special(Special::Separator(Separator::Space)),
5012 },
5013 PositionalToken {
5014 source: uws,
5015 offset: 1725,
5016 length: 1,
5017 token: Token::Special(Special::Separator(Separator::Newline)),
5018 },
5019 PositionalToken {
5020 source: uws,
5021 offset: 1726,
5022 length: 4,
5023 token: Token::Special(Special::Separator(Separator::Space)),
5024 },
5025 PositionalToken {
5026 source: uws,
5027 offset: 2725,
5028 length: 1,
5029 token: Token::Special(Special::Separator(Separator::Newline)),
5030 },
5031 PositionalToken {
5032 source: uws,
5033 offset: 2726,
5034 length: 2,
5035 token: Token::Special(Special::Separator(Separator::Space)),
5036 },
5037 PositionalToken {
5038 source: uws,
5039 offset: 2888,
5040 length: 1,
5041 token: Token::Special(Special::Separator(Separator::Newline)),
5042 },
5043 PositionalToken {
5044 source: uws,
5045 offset: 2889,
5046 length: 2,
5047 token: Token::Special(Special::Separator(Separator::Space)),
5048 },
5049 PositionalToken {
5050 source: uws,
5051 offset: 2891,
5052 length: 1,
5053 token: Token::Special(Special::Separator(Separator::Newline)),
5054 },
5055 PositionalToken {
5056 source: uws,
5057 offset: 2904,
5058 length: 1,
5059 token: Token::Special(Special::Separator(Separator::Newline)),
5060 },
5061 PositionalToken {
5062 source: uws,
5063 offset: 2905,
5064 length: 4,
5065 token: Token::Special(Special::Separator(Separator::Space)),
5066 },
5067 ];
5068
5069 let text = Text::new({
5070 uws.into_source()
5071 .pipe(tagger::Builder::new().create().into_breaker())
5072 .pipe(entities::Builder::new().create().into_piped())
5073 .into_separator()
5074 })
5075 .unwrap();
5076
5077 let lib_res = text
5078 .into_tokenizer(TokenizerParams::v1())
5079 .filter_map(|tt| tt.into_original_token_1())
5080 .collect::<Vec<_>>();
5081
5082 check_results(&result, &lib_res, uws);
5083 }
5084
5085 #[test]
5136 fn numerical_no_split() {
5137 let uws = "12.02.18 31.28.34 23.11.2018 123.568.365.234.578 127.0.0.1 1st 1кг 123123афываыв 12321фвафыов234выалфо 12_123_343.4234_4234";
5138 let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
5139 let result = vec![
5141 PositionalToken {
5142 source: uws,
5143 offset: 0,
5144 length: 8,
5145 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
5146 "12.02.18".to_string(),
5147 ))),
5148 },
5149 PositionalToken {
5150 source: uws,
5151 offset: 8,
5152 length: 1,
5153 token: Token::Special(Special::Separator(Separator::Space)),
5154 },
5155 PositionalToken {
5156 source: uws,
5157 offset: 9,
5158 length: 8,
5159 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
5160 "31.28.34".to_string(),
5161 ))),
5162 },
5163 PositionalToken {
5164 source: uws,
5165 offset: 17,
5166 length: 1,
5167 token: Token::Special(Special::Separator(Separator::Space)),
5168 },
5169 PositionalToken {
5170 source: uws,
5171 offset: 18,
5172 length: 10,
5173 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
5174 "23.11.2018".to_string(),
5175 ))),
5176 },
5177 PositionalToken {
5178 source: uws,
5179 offset: 28,
5180 length: 1,
5181 token: Token::Special(Special::Separator(Separator::Space)),
5182 },
5183 PositionalToken {
5184 source: uws,
5185 offset: 29,
5186 length: 19,
5187 token: Token::Word(Word::Number(Number::Integer(123568365234578))),
5189 },
5190 PositionalToken {
5191 source: uws,
5192 offset: 48,
5193 length: 1,
5194 token: Token::Special(Special::Separator(Separator::Space)),
5195 },
5196 PositionalToken {
5197 source: uws,
5198 offset: 49,
5199 length: 9,
5200 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
5201 "127.0.0.1".to_string(),
5202 ))),
5203 },
5204 PositionalToken {
5205 source: uws,
5206 offset: 58,
5207 length: 1,
5208 token: Token::Special(Special::Separator(Separator::Space)),
5209 },
5210 PositionalToken {
5211 source: uws,
5212 offset: 59,
5213 length: 3,
5214 token: Token::Word(Word::Numerical(Numerical::Measures("1st".to_string()))),
5215 },
5216 PositionalToken {
5217 source: uws,
5218 offset: 62,
5219 length: 1,
5220 token: Token::Special(Special::Separator(Separator::Space)),
5221 },
5222 PositionalToken {
5223 source: uws,
5224 offset: 63,
5225 length: 5,
5226 token: Token::Word(Word::Numerical(Numerical::Measures("1кг".to_string()))),
5227 },
5228 PositionalToken {
5229 source: uws,
5230 offset: 68,
5231 length: 1,
5232 token: Token::Special(Special::Separator(Separator::Space)),
5233 },
5234 PositionalToken {
5235 source: uws,
5236 offset: 69,
5237 length: 20,
5238 token: Token::Word(Word::Numerical(Numerical::Measures(
5239 "123123афываыв".to_string(),
5240 ))),
5241 },
5242 PositionalToken {
5243 source: uws,
5244 offset: 89,
5245 length: 1,
5246 token: Token::Special(Special::Separator(Separator::Space)),
5247 },
5248 PositionalToken {
5249 source: uws,
5250 offset: 90,
5251 length: 34,
5252 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
5253 "12321фвафыов234выалфо".to_string(),
5254 ))),
5255 },
5256 PositionalToken {
5257 source: uws,
5258 offset: 124,
5259 length: 1,
5260 token: Token::Special(Special::Separator(Separator::Space)),
5261 },
5262 PositionalToken {
5263 source: uws,
5264 offset: 125,
5265 length: 20,
5266 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
5267 "12_123_343.4234_4234".to_string(),
5268 ))),
5269 },
5270 ];
5271 check_results(&result, &lib_res, uws);
5272 }
5273
5274 #[test]
5275 fn numerical_default() {
5276 let uws = "12.02.18 31.28.34 23.11.2018 123.568.365.234.578 127.0.0.1 1st 1кг 123123афываыв 12321фвафыов234выалфо 12_123_343.4234_4234";
5277 let lib_res = uws
5278 .into_tokenizer(TokenizerParams::v1())
5279 .collect::<Vec<_>>();
5280 let result = vec![
5283 PositionalToken {
5284 source: uws,
5285 offset: 0,
5286 length: 2,
5287 token: Token::Word(Word::Number(Number::Integer(12))),
5288 },
5289 PositionalToken {
5290 source: uws,
5291 offset: 2,
5292 length: 1,
5293 token: Token::Special(Special::Punctuation('.')),
5294 },
5295 PositionalToken {
5296 source: uws,
5297 offset: 3,
5298 length: 2,
5299 token: Token::Word(Word::Number(Number::ZeroInteger {
5300 i: 2,
5301 s: "02".to_string(),
5302 })),
5303 },
5304 PositionalToken {
5305 source: uws,
5306 offset: 5,
5307 length: 1,
5308 token: Token::Special(Special::Punctuation('.')),
5309 },
5310 PositionalToken {
5311 source: uws,
5312 offset: 6,
5313 length: 2,
5314 token: Token::Word(Word::Number(Number::Integer(18))),
5315 },
5316 PositionalToken {
5317 source: uws,
5318 offset: 8,
5319 length: 1,
5320 token: Token::Special(Special::Separator(Separator::Space)),
5321 },
5322 PositionalToken {
5323 source: uws,
5324 offset: 9,
5325 length: 2,
5326 token: Token::Word(Word::Number(Number::Integer(31))),
5327 },
5328 PositionalToken {
5329 source: uws,
5330 offset: 11,
5331 length: 1,
5332 token: Token::Special(Special::Punctuation('.')),
5333 },
5334 PositionalToken {
5335 source: uws,
5336 offset: 12,
5337 length: 2,
5338 token: Token::Word(Word::Number(Number::Integer(28))),
5339 },
5340 PositionalToken {
5341 source: uws,
5342 offset: 14,
5343 length: 1,
5344 token: Token::Special(Special::Punctuation('.')),
5345 },
5346 PositionalToken {
5347 source: uws,
5348 offset: 15,
5349 length: 2,
5350 token: Token::Word(Word::Number(Number::Integer(34))),
5351 },
5352 PositionalToken {
5353 source: uws,
5354 offset: 17,
5355 length: 1,
5356 token: Token::Special(Special::Separator(Separator::Space)),
5357 },
5358 PositionalToken {
5359 source: uws,
5360 offset: 18,
5361 length: 2,
5362 token: Token::Word(Word::Number(Number::Integer(23))),
5363 },
5364 PositionalToken {
5365 source: uws,
5366 offset: 20,
5367 length: 1,
5368 token: Token::Special(Special::Punctuation('.')),
5369 },
5370 PositionalToken {
5371 source: uws,
5372 offset: 21,
5373 length: 2,
5374 token: Token::Word(Word::Number(Number::Integer(11))),
5375 },
5376 PositionalToken {
5377 source: uws,
5378 offset: 23,
5379 length: 1,
5380 token: Token::Special(Special::Punctuation('.')),
5381 },
5382 PositionalToken {
5383 source: uws,
5384 offset: 24,
5385 length: 4,
5386 token: Token::Word(Word::Number(Number::Integer(2018))),
5387 },
5388 PositionalToken {
5389 source: uws,
5390 offset: 28,
5391 length: 1,
5392 token: Token::Special(Special::Separator(Separator::Space)),
5393 },
5394 PositionalToken {
5395 source: uws,
5396 offset: 29,
5397 length: 19,
5398 token: Token::Word(Word::Number(Number::Integer(123568365234578))),
5400 },
5401 PositionalToken {
5456 source: uws,
5457 offset: 48,
5458 length: 1,
5459 token: Token::Special(Special::Separator(Separator::Space)),
5460 },
5461 PositionalToken {
5462 source: uws,
5463 offset: 49,
5464 length: 3,
5465 token: Token::Word(Word::Number(Number::Integer(127))),
5466 },
5467 PositionalToken {
5468 source: uws,
5469 offset: 52,
5470 length: 1,
5471 token: Token::Special(Special::Punctuation('.')),
5472 },
5473 PositionalToken {
5474 source: uws,
5475 offset: 53,
5476 length: 1,
5477 token: Token::Word(Word::Number(Number::ZeroInteger {
5478 i: 0,
5479 s: "0".to_string(),
5480 })),
5481 },
5482 PositionalToken {
5483 source: uws,
5484 offset: 54,
5485 length: 1,
5486 token: Token::Special(Special::Punctuation('.')),
5487 },
5488 PositionalToken {
5489 source: uws,
5490 offset: 55,
5491 length: 1,
5492 token: Token::Word(Word::Number(Number::ZeroInteger {
5493 i: 0,
5494 s: "0".to_string(),
5495 })),
5496 },
5497 PositionalToken {
5498 source: uws,
5499 offset: 56,
5500 length: 1,
5501 token: Token::Special(Special::Punctuation('.')),
5502 },
5503 PositionalToken {
5504 source: uws,
5505 offset: 57,
5506 length: 1,
5507 token: Token::Word(Word::Number(Number::Integer(1))),
5508 },
5509 PositionalToken {
5510 source: uws,
5511 offset: 58,
5512 length: 1,
5513 token: Token::Special(Special::Separator(Separator::Space)),
5514 },
5515 PositionalToken {
5516 source: uws,
5517 offset: 59,
5518 length: 3,
5519 token: Token::Word(Word::Numerical(Numerical::Measures("1st".to_string()))),
5520 },
5521 PositionalToken {
5522 source: uws,
5523 offset: 62,
5524 length: 1,
5525 token: Token::Special(Special::Separator(Separator::Space)),
5526 },
5527 PositionalToken {
5528 source: uws,
5529 offset: 63,
5530 length: 5,
5531 token: Token::Word(Word::Numerical(Numerical::Measures("1кг".to_string()))),
5532 },
5533 PositionalToken {
5534 source: uws,
5535 offset: 68,
5536 length: 1,
5537 token: Token::Special(Special::Separator(Separator::Space)),
5538 },
5539 PositionalToken {
5540 source: uws,
5541 offset: 69,
5542 length: 20,
5543 token: Token::Word(Word::Numerical(Numerical::Measures(
5544 "123123афываыв".to_string(),
5545 ))),
5546 },
5547 PositionalToken {
5548 source: uws,
5549 offset: 89,
5550 length: 1,
5551 token: Token::Special(Special::Separator(Separator::Space)),
5552 },
5553 PositionalToken {
5554 source: uws,
5555 offset: 90,
5556 length: 34,
5557 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
5558 "12321фвафыов234выалфо".to_string(),
5559 ))),
5560 },
5561 PositionalToken {
5562 source: uws,
5563 offset: 124,
5564 length: 1,
5565 token: Token::Special(Special::Separator(Separator::Space)),
5566 },
5567 PositionalToken {
5568 source: uws,
5569 offset: 125,
5570 length: 2,
5571 token: Token::Word(Word::Number(Number::Integer(12))),
5572 },
5573 PositionalToken {
5574 source: uws,
5575 offset: 127,
5576 length: 1,
5577 token: Token::Special(Special::Punctuation('_')),
5578 },
5579 PositionalToken {
5580 source: uws,
5581 offset: 128,
5582 length: 3,
5583 token: Token::Word(Word::Number(Number::Integer(123))),
5584 },
5585 PositionalToken {
5586 source: uws,
5587 offset: 131,
5588 length: 1,
5589 token: Token::Special(Special::Punctuation('_')),
5590 },
5591 PositionalToken {
5592 source: uws,
5593 offset: 132,
5594 length: 3,
5595 token: Token::Word(Word::Number(Number::Integer(343))),
5596 },
5597 PositionalToken {
5598 source: uws,
5599 offset: 135,
5600 length: 1,
5601 token: Token::Special(Special::Punctuation('.')),
5602 },
5603 PositionalToken {
5604 source: uws,
5605 offset: 136,
5606 length: 4,
5607 token: Token::Word(Word::Number(Number::Integer(4234))),
5608 },
5609 PositionalToken {
5610 source: uws,
5611 offset: 140,
5612 length: 1,
5613 token: Token::Special(Special::Punctuation('_')),
5614 },
5615 PositionalToken {
5616 source: uws,
5617 offset: 141,
5618 length: 4,
5619 token: Token::Word(Word::Number(Number::Integer(4234))),
5620 },
5621 ];
5622 check_results(&result, &lib_res, uws);
5623 }
5624
5625 enum Lang {
5638 Zho,
5639 Jpn,
5640 Kor,
5641 Ara,
5642 Ell,
5643 }
5644
5645 #[test]
5646 fn test_lang_zho() {
5647 let (uws, result) = get_lang_test(Lang::Zho);
5648 let lib_res = uws
5649 .into_tokenizer(TokenizerParams::v1())
5650 .collect::<Vec<_>>();
5651 check_results(&result, &lib_res, &uws);
5652 }
5653
5654 #[test]
5655 fn test_lang_jpn() {
5656 let (uws, result) = get_lang_test(Lang::Jpn);
5657 let lib_res = uws
5658 .into_tokenizer(TokenizerParams::v1())
5659 .collect::<Vec<_>>();
5660 check_results(&result, &lib_res, &uws);
5661 }
5662
5663 #[test]
5664 fn test_lang_kor() {
5665 let (uws, result) = get_lang_test(Lang::Kor);
5666 let lib_res = uws
5667 .into_tokenizer(TokenizerParams::v1())
5668 .collect::<Vec<_>>();
5669 check_results(&result, &lib_res, &uws);
5670 }
5671
5672 #[test]
5673 fn test_lang_ara() {
5674 let (uws, result) = get_lang_test(Lang::Ara);
5675 let lib_res = uws
5676 .into_tokenizer(TokenizerParams::v1())
5677 .collect::<Vec<_>>();
5678 check_results(&result, &lib_res, &uws);
5679 }
5680
5681 #[test]
5682 fn test_lang_ell() {
5683 let (uws, result) = get_lang_test(Lang::Ell);
5684 let lib_res = uws
5685 .into_tokenizer(TokenizerParams::v1())
5686 .collect::<Vec<_>>();
5687 check_results(&result, &lib_res, &uws);
5688 }
5689
5690 fn get_lang_test(lng: Lang) -> (String, Vec<PositionalToken>) {
5691 let uws = match lng {
5692 Lang::Zho => {
5693 "美国电视连续剧《超人前传》的第一集《试播集》于2001年10月16日在電視網首播,剧集主创人阿尔弗雷德·高夫和迈尔斯·米勒編劇,大卫·努特尔执导。这一试播首次向观众引荐了克拉克·肯特一角,他是位拥有超能力的外星孤儿,与家人和朋友一起在堪薩斯州虚构小镇斯莫维尔生活。在这一集里,肯特首度得知自己的来历,同时还需要阻止一位学生试图杀死镇上高中多名学生的报复之举。本集节目里引入了多个之后将贯穿全季甚至整部剧集的主题元素,例如几位主要角色之间的三角恋情。电视剧在加拿大溫哥華取景,旨在选用其“美国中产阶级”景观,主创人花了5个月的时间专门用于为主角物色合适的演员。试播集在所有演员选好4天后正式开拍。由于时间上的限制,剧组无法搭建好实体外景,因此只能使用计算机绘图技术将数字化的外景插入到镜头中。节目一经上映就打破了电视网的多项收视纪录,并且获得了评论员的普遍好评和多个奖项提名,并在其中两项上胜出"
5694 }
5695 Lang::Kor => {
5696 "플레이스테이션 은 소니 컴퓨터 엔터테인먼트가 개발한 세 번째 가정용 게임기이다. 마이크로소프트의 엑스박스 360, 닌텐도의 Wii와 경쟁하고 있다. 이전 제품에서 온라인 플레이 기능을 비디오 게임 개발사에 전적으로 의존하던 것과 달리 통합 온라인 게임 서비스인 플레이스테이션 네트워크 서비스를 발매와 함께 시작해 제공하고 있으며, 탄탄한 멀티미디어 재생 기능, 플레이스테이션 포터블과의 연결, 고화질 광학 디스크 포맷인 블루레이 디스크 재생 기능 등의 기능을 갖추고 있다. 2006년 11월 11일에 일본에서 처음으로 출시했으며, 11월 17일에는 북미 지역, 2007년 3월 23일에는 유럽과 오세아니아 지역에서, 대한민국의 경우 6월 5일부터 일주일간 예약판매를 실시해, 매일 준비한 수량이 동이 나는 등 많은 관심을 받았으며 6월 16일에 정식 출시 행사를 열었다"
5697 }
5698 Lang::Jpn => {
5699 "熊野三山本願所は、15世紀末以降における熊野三山(熊野本宮、熊野新宮、熊野那智)の造営・修造のための勧進を担った組織の総称。 熊野三山を含めて、日本における古代から中世前半にかけての寺社の造営は、寺社領経営のような恒常的財源、幕府や朝廷などからの一時的な造営料所の寄進、あるいは公権力からの臨時の保護によって行われていた。しかしながら、熊野三山では、これらの財源はすべて15世紀半ばまでに実効性を失った"
5700 }
5701 Lang::Ara => {
5702 "لشکرکشیهای روسهای وارنگی به دریای خزر مجموعهای از حملات نظامی در بین سالهای ۸۶۴ تا ۱۰۴۱ میلادی به سواحل دریای خزر بودهاست. روسهای وارنگی ابتدا در قرن نهم میلادی به عنوان بازرگانان پوست، عسل و برده در سرزمینهای اسلامی(سرکلند) ظاهر شدند. این بازرگانان در مسیر تجاری ولگا به خرید و فروش میپرداختند. نخستین حملهٔ آنان در فاصله سالهای ۸۶۴ تا ۸۸۴ میلادی در مقیاسی کوچک علیه علویان طبرستان رخ داد. نخستین یورش بزرگ روسها در سال ۹۱۳ رخ داد و آنان با ۵۰۰ فروند درازکشتی شهر گرگان و اطراف آن را غارت کردند. آنها در این حمله مقداری کالا و برده را به تاراج بردند و در راه بازگشتن به سمت شمال، در دلتای ولگا، مورد حملهٔ خزرهای مسلمان قرار گرفتند و بعضی از آنان موفق به فرار شدند، ولی در میانهٔ ولگا به قتل رسیدند. دومین هجوم بزرگ روسها به دریای خزر در سال ۹۴۳ به وقوع پیوست. در این دوره ایگور یکم، حاکم روس کیف، رهبری روسها را در دست داشت. روسها پس از توافق با دولت خزرها برای عبور امن از منطقه، تا رود کورا و اعماق قفقاز پیش رفتند و در سال ۹۴۳ موفق شدند بندر بردعه، پایتخت اران (جمهوری آذربایجان کنونی)، را تصرف کنند. روسها در آنجا به مدت چند ماه ماندند و بسیاری از ساکنان شهر را کشتند و از راه غارتگری اموالی را به تاراج بردند. تنها دلیل بازگشت آنان "
5703 }
5704 Lang::Ell => {
5705 "Το Πρόγραμμα υλοποιείται εξ ολοκλήρου από απόσταση και μπορεί να συμμετέχει κάθε εμπλεκόμενος στη ή/και ενδιαφερόμενος για τη διδασκαλία της Ελληνικής ως δεύτερης/ξένης γλώσσας στην Ελλάδα και στο εξωτερικό, αρκεί να είναι απόφοιτος ελληνικής φιλολογίας, ξένων φιλολογιών, παιδαγωγικών τμημάτων, θεολογικών σχολών ή άλλων πανεπιστημιακών τμημάτων ελληνικών ή ισότιμων ξένων πανεπιστημίων. Υπό όρους γίνονται δεκτοί υποψήφιοι που δεν έχουν ολοκληρώσει σπουδές τριτοβάθμιας εκπαίδευσης."
5706 }
5707 };
5708 let tokens = match lng {
5709 Lang::Zho => vec![
5710 PositionalToken {
5711 source: uws,
5712 offset: 0,
5713 length: 3,
5714 token: Token::Word(Word::Word("美".to_string())),
5715 },
5716 PositionalToken {
5717 source: uws,
5718 offset: 3,
5719 length: 3,
5720 token: Token::Word(Word::Word("国".to_string())),
5721 },
5722 PositionalToken {
5723 source: uws,
5724 offset: 6,
5725 length: 3,
5726 token: Token::Word(Word::Word("电".to_string())),
5727 },
5728 PositionalToken {
5729 source: uws,
5730 offset: 9,
5731 length: 3,
5732 token: Token::Word(Word::Word("视".to_string())),
5733 },
5734 PositionalToken {
5735 source: uws,
5736 offset: 12,
5737 length: 3,
5738 token: Token::Word(Word::Word("连".to_string())),
5739 },
5740 PositionalToken {
5741 source: uws,
5742 offset: 15,
5743 length: 3,
5744 token: Token::Word(Word::Word("续".to_string())),
5745 },
5746 PositionalToken {
5747 source: uws,
5748 offset: 18,
5749 length: 3,
5750 token: Token::Word(Word::Word("剧".to_string())),
5751 },
5752 PositionalToken {
5753 source: uws,
5754 offset: 21,
5755 length: 3,
5756 token: Token::Special(Special::Punctuation('《')),
5757 },
5758 PositionalToken {
5759 source: uws,
5760 offset: 24,
5761 length: 3,
5762 token: Token::Word(Word::Word("超".to_string())),
5763 },
5764 PositionalToken {
5765 source: uws,
5766 offset: 27,
5767 length: 3,
5768 token: Token::Word(Word::Word("人".to_string())),
5769 },
5770 PositionalToken {
5771 source: uws,
5772 offset: 30,
5773 length: 3,
5774 token: Token::Word(Word::Word("前".to_string())),
5775 },
5776 PositionalToken {
5777 source: uws,
5778 offset: 33,
5779 length: 3,
5780 token: Token::Word(Word::Word("传".to_string())),
5781 },
5782 PositionalToken {
5783 source: uws,
5784 offset: 36,
5785 length: 3,
5786 token: Token::Special(Special::Punctuation('》')),
5787 },
5788 PositionalToken {
5789 source: uws,
5790 offset: 39,
5791 length: 3,
5792 token: Token::Word(Word::Word("的".to_string())),
5793 },
5794 PositionalToken {
5795 source: uws,
5796 offset: 42,
5797 length: 3,
5798 token: Token::Word(Word::Word("第".to_string())),
5799 },
5800 PositionalToken {
5801 source: uws,
5802 offset: 45,
5803 length: 3,
5804 token: Token::Word(Word::Word("一".to_string())),
5805 },
5806 PositionalToken {
5807 source: uws,
5808 offset: 48,
5809 length: 3,
5810 token: Token::Word(Word::Word("集".to_string())),
5811 },
5812 PositionalToken {
5813 source: uws,
5814 offset: 51,
5815 length: 3,
5816 token: Token::Special(Special::Punctuation('《')),
5817 },
5818 PositionalToken {
5819 source: uws,
5820 offset: 54,
5821 length: 3,
5822 token: Token::Word(Word::Word("试".to_string())),
5823 },
5824 PositionalToken {
5825 source: uws,
5826 offset: 57,
5827 length: 3,
5828 token: Token::Word(Word::Word("播".to_string())),
5829 },
5830 PositionalToken {
5831 source: uws,
5832 offset: 60,
5833 length: 3,
5834 token: Token::Word(Word::Word("集".to_string())),
5835 },
5836 PositionalToken {
5837 source: uws,
5838 offset: 63,
5839 length: 3,
5840 token: Token::Special(Special::Punctuation('》')),
5841 },
5842 PositionalToken {
5843 source: uws,
5844 offset: 66,
5845 length: 3,
5846 token: Token::Word(Word::Word("于".to_string())),
5847 },
5848 PositionalToken {
5849 source: uws,
5850 offset: 69,
5851 length: 4,
5852 token: Token::Word(Word::Number(Number::Integer(2001))),
5853 },
5854 PositionalToken {
5855 source: uws,
5856 offset: 73,
5857 length: 3,
5858 token: Token::Word(Word::Word("年".to_string())),
5859 },
5860 PositionalToken {
5861 source: uws,
5862 offset: 76,
5863 length: 2,
5864 token: Token::Word(Word::Number(Number::Integer(10))),
5865 },
5866 PositionalToken {
5867 source: uws,
5868 offset: 78,
5869 length: 3,
5870 token: Token::Word(Word::Word("月".to_string())),
5871 },
5872 PositionalToken {
5873 source: uws,
5874 offset: 81,
5875 length: 2,
5876 token: Token::Word(Word::Number(Number::Integer(16))),
5877 },
5878 PositionalToken {
5879 source: uws,
5880 offset: 83,
5881 length: 3,
5882 token: Token::Word(Word::Word("日".to_string())),
5883 },
5884 PositionalToken {
5885 source: uws,
5886 offset: 86,
5887 length: 3,
5888 token: Token::Word(Word::Word("在".to_string())),
5889 },
5890 PositionalToken {
5891 source: uws,
5892 offset: 89,
5893 length: 3,
5894 token: Token::Word(Word::Word("電".to_string())),
5895 },
5896 PositionalToken {
5897 source: uws,
5898 offset: 92,
5899 length: 3,
5900 token: Token::Word(Word::Word("視".to_string())),
5901 },
5902 PositionalToken {
5903 source: uws,
5904 offset: 95,
5905 length: 3,
5906 token: Token::Word(Word::Word("網".to_string())),
5907 },
5908 PositionalToken {
5909 source: uws,
5910 offset: 98,
5911 length: 3,
5912 token: Token::Word(Word::Word("首".to_string())),
5913 },
5914 PositionalToken {
5915 source: uws,
5916 offset: 101,
5917 length: 3,
5918 token: Token::Word(Word::Word("播".to_string())),
5919 },
5920 PositionalToken {
5921 source: uws,
5922 offset: 104,
5923 length: 3,
5924 token: Token::Special(Special::Punctuation(',')),
5925 },
5926 PositionalToken {
5927 source: uws,
5928 offset: 107,
5929 length: 3,
5930 token: Token::Word(Word::Word("剧".to_string())),
5931 },
5932 PositionalToken {
5933 source: uws,
5934 offset: 110,
5935 length: 3,
5936 token: Token::Word(Word::Word("集".to_string())),
5937 },
5938 PositionalToken {
5939 source: uws,
5940 offset: 113,
5941 length: 3,
5942 token: Token::Word(Word::Word("主".to_string())),
5943 },
5944 PositionalToken {
5945 source: uws,
5946 offset: 116,
5947 length: 3,
5948 token: Token::Word(Word::Word("创".to_string())),
5949 },
5950 PositionalToken {
5951 source: uws,
5952 offset: 119,
5953 length: 3,
5954 token: Token::Word(Word::Word("人".to_string())),
5955 },
5956 PositionalToken {
5957 source: uws,
5958 offset: 122,
5959 length: 3,
5960 token: Token::Word(Word::Word("阿".to_string())),
5961 },
5962 PositionalToken {
5963 source: uws,
5964 offset: 125,
5965 length: 3,
5966 token: Token::Word(Word::Word("尔".to_string())),
5967 },
5968 PositionalToken {
5969 source: uws,
5970 offset: 128,
5971 length: 3,
5972 token: Token::Word(Word::Word("弗".to_string())),
5973 },
5974 PositionalToken {
5975 source: uws,
5976 offset: 131,
5977 length: 3,
5978 token: Token::Word(Word::Word("雷".to_string())),
5979 },
5980 PositionalToken {
5981 source: uws,
5982 offset: 134,
5983 length: 3,
5984 token: Token::Word(Word::Word("德".to_string())),
5985 },
5986 PositionalToken {
5987 source: uws,
5988 offset: 137,
5989 length: 2,
5990 token: Token::Special(Special::Punctuation('·')),
5991 },
5992 PositionalToken {
5993 source: uws,
5994 offset: 139,
5995 length: 3,
5996 token: Token::Word(Word::Word("高".to_string())),
5997 },
5998 PositionalToken {
5999 source: uws,
6000 offset: 142,
6001 length: 3,
6002 token: Token::Word(Word::Word("夫".to_string())),
6003 },
6004 PositionalToken {
6005 source: uws,
6006 offset: 145,
6007 length: 3,
6008 token: Token::Word(Word::Word("和".to_string())),
6009 },
6010 PositionalToken {
6011 source: uws,
6012 offset: 148,
6013 length: 3,
6014 token: Token::Word(Word::Word("迈".to_string())),
6015 },
6016 PositionalToken {
6017 source: uws,
6018 offset: 151,
6019 length: 3,
6020 token: Token::Word(Word::Word("尔".to_string())),
6021 },
6022 PositionalToken {
6023 source: uws,
6024 offset: 154,
6025 length: 3,
6026 token: Token::Word(Word::Word("斯".to_string())),
6027 },
6028 PositionalToken {
6029 source: uws,
6030 offset: 157,
6031 length: 2,
6032 token: Token::Special(Special::Punctuation('·')),
6033 },
6034 PositionalToken {
6035 source: uws,
6036 offset: 159,
6037 length: 3,
6038 token: Token::Word(Word::Word("米".to_string())),
6039 },
6040 PositionalToken {
6041 source: uws,
6042 offset: 162,
6043 length: 3,
6044 token: Token::Word(Word::Word("勒".to_string())),
6045 },
6046 PositionalToken {
6047 source: uws,
6048 offset: 165,
6049 length: 3,
6050 token: Token::Word(Word::Word("編".to_string())),
6051 },
6052 PositionalToken {
6053 source: uws,
6054 offset: 168,
6055 length: 3,
6056 token: Token::Word(Word::Word("劇".to_string())),
6057 },
6058 PositionalToken {
6059 source: uws,
6060 offset: 171,
6061 length: 3,
6062 token: Token::Special(Special::Punctuation(',')),
6063 },
6064 PositionalToken {
6065 source: uws,
6066 offset: 174,
6067 length: 3,
6068 token: Token::Word(Word::Word("大".to_string())),
6069 },
6070 PositionalToken {
6071 source: uws,
6072 offset: 177,
6073 length: 3,
6074 token: Token::Word(Word::Word("卫".to_string())),
6075 },
6076 PositionalToken {
6077 source: uws,
6078 offset: 180,
6079 length: 2,
6080 token: Token::Special(Special::Punctuation('·')),
6081 },
6082 PositionalToken {
6083 source: uws,
6084 offset: 182,
6085 length: 3,
6086 token: Token::Word(Word::Word("努".to_string())),
6087 },
6088 PositionalToken {
6089 source: uws,
6090 offset: 185,
6091 length: 3,
6092 token: Token::Word(Word::Word("特".to_string())),
6093 },
6094 PositionalToken {
6095 source: uws,
6096 offset: 188,
6097 length: 3,
6098 token: Token::Word(Word::Word("尔".to_string())),
6099 },
6100 PositionalToken {
6101 source: uws,
6102 offset: 191,
6103 length: 3,
6104 token: Token::Word(Word::Word("执".to_string())),
6105 },
6106 PositionalToken {
6107 source: uws,
6108 offset: 194,
6109 length: 3,
6110 token: Token::Word(Word::Word("导".to_string())),
6111 },
6112 PositionalToken {
6113 source: uws,
6114 offset: 197,
6115 length: 3,
6116 token: Token::Special(Special::Punctuation('。')),
6117 },
6118 PositionalToken {
6119 source: uws,
6120 offset: 200,
6121 length: 3,
6122 token: Token::Word(Word::Word("这".to_string())),
6123 },
6124 PositionalToken {
6125 source: uws,
6126 offset: 203,
6127 length: 3,
6128 token: Token::Word(Word::Word("一".to_string())),
6129 },
6130 PositionalToken {
6131 source: uws,
6132 offset: 206,
6133 length: 3,
6134 token: Token::Word(Word::Word("试".to_string())),
6135 },
6136 PositionalToken {
6137 source: uws,
6138 offset: 209,
6139 length: 3,
6140 token: Token::Word(Word::Word("播".to_string())),
6141 },
6142 PositionalToken {
6143 source: uws,
6144 offset: 212,
6145 length: 3,
6146 token: Token::Word(Word::Word("首".to_string())),
6147 },
6148 PositionalToken {
6149 source: uws,
6150 offset: 215,
6151 length: 3,
6152 token: Token::Word(Word::Word("次".to_string())),
6153 },
6154 PositionalToken {
6155 source: uws,
6156 offset: 218,
6157 length: 3,
6158 token: Token::Word(Word::Word("向".to_string())),
6159 },
6160 PositionalToken {
6161 source: uws,
6162 offset: 221,
6163 length: 3,
6164 token: Token::Word(Word::Word("观".to_string())),
6165 },
6166 PositionalToken {
6167 source: uws,
6168 offset: 224,
6169 length: 3,
6170 token: Token::Word(Word::Word("众".to_string())),
6171 },
6172 PositionalToken {
6173 source: uws,
6174 offset: 227,
6175 length: 3,
6176 token: Token::Word(Word::Word("引".to_string())),
6177 },
6178 PositionalToken {
6179 source: uws,
6180 offset: 230,
6181 length: 3,
6182 token: Token::Word(Word::Word("荐".to_string())),
6183 },
6184 PositionalToken {
6185 source: uws,
6186 offset: 233,
6187 length: 3,
6188 token: Token::Word(Word::Word("了".to_string())),
6189 },
6190 PositionalToken {
6191 source: uws,
6192 offset: 236,
6193 length: 3,
6194 token: Token::Word(Word::Word("克".to_string())),
6195 },
6196 PositionalToken {
6197 source: uws,
6198 offset: 239,
6199 length: 3,
6200 token: Token::Word(Word::Word("拉".to_string())),
6201 },
6202 PositionalToken {
6203 source: uws,
6204 offset: 242,
6205 length: 3,
6206 token: Token::Word(Word::Word("克".to_string())),
6207 },
6208 PositionalToken {
6209 source: uws,
6210 offset: 245,
6211 length: 2,
6212 token: Token::Special(Special::Punctuation('·')),
6213 },
6214 PositionalToken {
6215 source: uws,
6216 offset: 247,
6217 length: 3,
6218 token: Token::Word(Word::Word("肯".to_string())),
6219 },
6220 PositionalToken {
6221 source: uws,
6222 offset: 250,
6223 length: 3,
6224 token: Token::Word(Word::Word("特".to_string())),
6225 },
6226 PositionalToken {
6227 source: uws,
6228 offset: 253,
6229 length: 3,
6230 token: Token::Word(Word::Word("一".to_string())),
6231 },
6232 PositionalToken {
6233 source: uws,
6234 offset: 256,
6235 length: 3,
6236 token: Token::Word(Word::Word("角".to_string())),
6237 },
6238 PositionalToken {
6239 source: uws,
6240 offset: 259,
6241 length: 3,
6242 token: Token::Special(Special::Punctuation(',')),
6243 },
6244 PositionalToken {
6245 source: uws,
6246 offset: 262,
6247 length: 3,
6248 token: Token::Word(Word::Word("他".to_string())),
6249 },
6250 PositionalToken {
6251 source: uws,
6252 offset: 265,
6253 length: 3,
6254 token: Token::Word(Word::Word("是".to_string())),
6255 },
6256 PositionalToken {
6257 source: uws,
6258 offset: 268,
6259 length: 3,
6260 token: Token::Word(Word::Word("位".to_string())),
6261 },
6262 PositionalToken {
6263 source: uws,
6264 offset: 271,
6265 length: 3,
6266 token: Token::Word(Word::Word("拥".to_string())),
6267 },
6268 PositionalToken {
6269 source: uws,
6270 offset: 274,
6271 length: 3,
6272 token: Token::Word(Word::Word("有".to_string())),
6273 },
6274 PositionalToken {
6275 source: uws,
6276 offset: 277,
6277 length: 3,
6278 token: Token::Word(Word::Word("超".to_string())),
6279 },
6280 ],
6281 Lang::Jpn => vec![
6282 PositionalToken {
6283 source: uws,
6284 offset: 0,
6285 length: 3,
6286 token: Token::Word(Word::Word("熊".to_string())),
6287 },
6288 PositionalToken {
6289 source: uws,
6290 offset: 3,
6291 length: 3,
6292 token: Token::Word(Word::Word("野".to_string())),
6293 },
6294 PositionalToken {
6295 source: uws,
6296 offset: 6,
6297 length: 3,
6298 token: Token::Word(Word::Word("三".to_string())),
6299 },
6300 PositionalToken {
6301 source: uws,
6302 offset: 9,
6303 length: 3,
6304 token: Token::Word(Word::Word("山".to_string())),
6305 },
6306 PositionalToken {
6307 source: uws,
6308 offset: 12,
6309 length: 3,
6310 token: Token::Word(Word::Word("本".to_string())),
6311 },
6312 PositionalToken {
6313 source: uws,
6314 offset: 15,
6315 length: 3,
6316 token: Token::Word(Word::Word("願".to_string())),
6317 },
6318 PositionalToken {
6319 source: uws,
6320 offset: 18,
6321 length: 3,
6322 token: Token::Word(Word::Word("所".to_string())),
6323 },
6324 PositionalToken {
6325 source: uws,
6326 offset: 21,
6327 length: 3,
6328 token: Token::Word(Word::Word("は".to_string())),
6329 },
6330 PositionalToken {
6331 source: uws,
6332 offset: 24,
6333 length: 3,
6334 token: Token::Special(Special::Punctuation('、')),
6335 },
6336 PositionalToken {
6337 source: uws,
6338 offset: 27,
6339 length: 2,
6340 token: Token::Word(Word::Number(Number::Integer(15))),
6341 },
6342 PositionalToken {
6343 source: uws,
6344 offset: 29,
6345 length: 3,
6346 token: Token::Word(Word::Word("世".to_string())),
6347 },
6348 PositionalToken {
6349 source: uws,
6350 offset: 32,
6351 length: 3,
6352 token: Token::Word(Word::Word("紀".to_string())),
6353 },
6354 PositionalToken {
6355 source: uws,
6356 offset: 35,
6357 length: 3,
6358 token: Token::Word(Word::Word("末".to_string())),
6359 },
6360 PositionalToken {
6361 source: uws,
6362 offset: 38,
6363 length: 3,
6364 token: Token::Word(Word::Word("以".to_string())),
6365 },
6366 PositionalToken {
6367 source: uws,
6368 offset: 41,
6369 length: 3,
6370 token: Token::Word(Word::Word("降".to_string())),
6371 },
6372 PositionalToken {
6373 source: uws,
6374 offset: 44,
6375 length: 3,
6376 token: Token::Word(Word::Word("に".to_string())),
6377 },
6378 PositionalToken {
6379 source: uws,
6380 offset: 47,
6381 length: 3,
6382 token: Token::Word(Word::Word("お".to_string())),
6383 },
6384 PositionalToken {
6385 source: uws,
6386 offset: 50,
6387 length: 3,
6388 token: Token::Word(Word::Word("け".to_string())),
6389 },
6390 PositionalToken {
6391 source: uws,
6392 offset: 53,
6393 length: 3,
6394 token: Token::Word(Word::Word("る".to_string())),
6395 },
6396 PositionalToken {
6397 source: uws,
6398 offset: 56,
6399 length: 3,
6400 token: Token::Word(Word::Word("熊".to_string())),
6401 },
6402 PositionalToken {
6403 source: uws,
6404 offset: 59,
6405 length: 3,
6406 token: Token::Word(Word::Word("野".to_string())),
6407 },
6408 PositionalToken {
6409 source: uws,
6410 offset: 62,
6411 length: 3,
6412 token: Token::Word(Word::Word("三".to_string())),
6413 },
6414 PositionalToken {
6415 source: uws,
6416 offset: 65,
6417 length: 3,
6418 token: Token::Word(Word::Word("山".to_string())),
6419 },
6420 PositionalToken {
6421 source: uws,
6422 offset: 68,
6423 length: 3,
6424 token: Token::Special(Special::Punctuation('(')),
6425 },
6426 PositionalToken {
6427 source: uws,
6428 offset: 71,
6429 length: 3,
6430 token: Token::Word(Word::Word("熊".to_string())),
6431 },
6432 PositionalToken {
6433 source: uws,
6434 offset: 74,
6435 length: 3,
6436 token: Token::Word(Word::Word("野".to_string())),
6437 },
6438 PositionalToken {
6439 source: uws,
6440 offset: 77,
6441 length: 3,
6442 token: Token::Word(Word::Word("本".to_string())),
6443 },
6444 PositionalToken {
6445 source: uws,
6446 offset: 80,
6447 length: 3,
6448 token: Token::Word(Word::Word("宮".to_string())),
6449 },
6450 PositionalToken {
6451 source: uws,
6452 offset: 83,
6453 length: 3,
6454 token: Token::Special(Special::Punctuation('、')),
6455 },
6456 PositionalToken {
6457 source: uws,
6458 offset: 86,
6459 length: 3,
6460 token: Token::Word(Word::Word("熊".to_string())),
6461 },
6462 PositionalToken {
6463 source: uws,
6464 offset: 89,
6465 length: 3,
6466 token: Token::Word(Word::Word("野".to_string())),
6467 },
6468 PositionalToken {
6469 source: uws,
6470 offset: 92,
6471 length: 3,
6472 token: Token::Word(Word::Word("新".to_string())),
6473 },
6474 PositionalToken {
6475 source: uws,
6476 offset: 95,
6477 length: 3,
6478 token: Token::Word(Word::Word("宮".to_string())),
6479 },
6480 PositionalToken {
6481 source: uws,
6482 offset: 98,
6483 length: 3,
6484 token: Token::Special(Special::Punctuation('、')),
6485 },
6486 PositionalToken {
6487 source: uws,
6488 offset: 101,
6489 length: 3,
6490 token: Token::Word(Word::Word("熊".to_string())),
6491 },
6492 PositionalToken {
6493 source: uws,
6494 offset: 104,
6495 length: 3,
6496 token: Token::Word(Word::Word("野".to_string())),
6497 },
6498 PositionalToken {
6499 source: uws,
6500 offset: 107,
6501 length: 3,
6502 token: Token::Word(Word::Word("那".to_string())),
6503 },
6504 PositionalToken {
6505 source: uws,
6506 offset: 110,
6507 length: 3,
6508 token: Token::Word(Word::Word("智".to_string())),
6509 },
6510 PositionalToken {
6511 source: uws,
6512 offset: 113,
6513 length: 3,
6514 token: Token::Special(Special::Punctuation(')')),
6515 },
6516 PositionalToken {
6517 source: uws,
6518 offset: 116,
6519 length: 3,
6520 token: Token::Word(Word::Word("の".to_string())),
6521 },
6522 PositionalToken {
6523 source: uws,
6524 offset: 119,
6525 length: 3,
6526 token: Token::Word(Word::Word("造".to_string())),
6527 },
6528 PositionalToken {
6529 source: uws,
6530 offset: 122,
6531 length: 3,
6532 token: Token::Word(Word::Word("営".to_string())),
6533 },
6534 PositionalToken {
6535 source: uws,
6536 offset: 125,
6537 length: 3,
6538 token: Token::Special(Special::Punctuation('・')),
6539 },
6540 PositionalToken {
6541 source: uws,
6542 offset: 128,
6543 length: 3,
6544 token: Token::Word(Word::Word("修".to_string())),
6545 },
6546 PositionalToken {
6547 source: uws,
6548 offset: 131,
6549 length: 3,
6550 token: Token::Word(Word::Word("造".to_string())),
6551 },
6552 PositionalToken {
6553 source: uws,
6554 offset: 134,
6555 length: 3,
6556 token: Token::Word(Word::Word("の".to_string())),
6557 },
6558 PositionalToken {
6559 source: uws,
6560 offset: 137,
6561 length: 3,
6562 token: Token::Word(Word::Word("た".to_string())),
6563 },
6564 PositionalToken {
6565 source: uws,
6566 offset: 140,
6567 length: 3,
6568 token: Token::Word(Word::Word("め".to_string())),
6569 },
6570 PositionalToken {
6571 source: uws,
6572 offset: 143,
6573 length: 3,
6574 token: Token::Word(Word::Word("の".to_string())),
6575 },
6576 PositionalToken {
6577 source: uws,
6578 offset: 146,
6579 length: 3,
6580 token: Token::Word(Word::Word("勧".to_string())),
6581 },
6582 PositionalToken {
6583 source: uws,
6584 offset: 149,
6585 length: 3,
6586 token: Token::Word(Word::Word("進".to_string())),
6587 },
6588 PositionalToken {
6589 source: uws,
6590 offset: 152,
6591 length: 3,
6592 token: Token::Word(Word::Word("を".to_string())),
6593 },
6594 PositionalToken {
6595 source: uws,
6596 offset: 155,
6597 length: 3,
6598 token: Token::Word(Word::Word("担".to_string())),
6599 },
6600 PositionalToken {
6601 source: uws,
6602 offset: 158,
6603 length: 3,
6604 token: Token::Word(Word::Word("っ".to_string())),
6605 },
6606 PositionalToken {
6607 source: uws,
6608 offset: 161,
6609 length: 3,
6610 token: Token::Word(Word::Word("た".to_string())),
6611 },
6612 PositionalToken {
6613 source: uws,
6614 offset: 164,
6615 length: 3,
6616 token: Token::Word(Word::Word("組".to_string())),
6617 },
6618 PositionalToken {
6619 source: uws,
6620 offset: 167,
6621 length: 3,
6622 token: Token::Word(Word::Word("織".to_string())),
6623 },
6624 PositionalToken {
6625 source: uws,
6626 offset: 170,
6627 length: 3,
6628 token: Token::Word(Word::Word("の".to_string())),
6629 },
6630 PositionalToken {
6631 source: uws,
6632 offset: 173,
6633 length: 3,
6634 token: Token::Word(Word::Word("総".to_string())),
6635 },
6636 PositionalToken {
6637 source: uws,
6638 offset: 176,
6639 length: 3,
6640 token: Token::Word(Word::Word("称".to_string())),
6641 },
6642 PositionalToken {
6643 source: uws,
6644 offset: 179,
6645 length: 3,
6646 token: Token::Special(Special::Punctuation('。')),
6647 },
6648 PositionalToken {
6649 source: uws,
6650 offset: 182,
6651 length: 1,
6652 token: Token::Special(Special::Separator(Separator::Space)),
6653 },
6654 PositionalToken {
6655 source: uws,
6656 offset: 183,
6657 length: 3,
6658 token: Token::Word(Word::Word("熊".to_string())),
6659 },
6660 PositionalToken {
6661 source: uws,
6662 offset: 186,
6663 length: 3,
6664 token: Token::Word(Word::Word("野".to_string())),
6665 },
6666 PositionalToken {
6667 source: uws,
6668 offset: 189,
6669 length: 3,
6670 token: Token::Word(Word::Word("三".to_string())),
6671 },
6672 PositionalToken {
6673 source: uws,
6674 offset: 192,
6675 length: 3,
6676 token: Token::Word(Word::Word("山".to_string())),
6677 },
6678 PositionalToken {
6679 source: uws,
6680 offset: 195,
6681 length: 3,
6682 token: Token::Word(Word::Word("を".to_string())),
6683 },
6684 PositionalToken {
6685 source: uws,
6686 offset: 198,
6687 length: 3,
6688 token: Token::Word(Word::Word("含".to_string())),
6689 },
6690 PositionalToken {
6691 source: uws,
6692 offset: 201,
6693 length: 3,
6694 token: Token::Word(Word::Word("め".to_string())),
6695 },
6696 PositionalToken {
6697 source: uws,
6698 offset: 204,
6699 length: 3,
6700 token: Token::Word(Word::Word("て".to_string())),
6701 },
6702 PositionalToken {
6703 source: uws,
6704 offset: 207,
6705 length: 3,
6706 token: Token::Special(Special::Punctuation('、')),
6707 },
6708 PositionalToken {
6709 source: uws,
6710 offset: 210,
6711 length: 3,
6712 token: Token::Word(Word::Word("日".to_string())),
6713 },
6714 PositionalToken {
6715 source: uws,
6716 offset: 213,
6717 length: 3,
6718 token: Token::Word(Word::Word("本".to_string())),
6719 },
6720 PositionalToken {
6721 source: uws,
6722 offset: 216,
6723 length: 3,
6724 token: Token::Word(Word::Word("に".to_string())),
6725 },
6726 PositionalToken {
6727 source: uws,
6728 offset: 219,
6729 length: 3,
6730 token: Token::Word(Word::Word("お".to_string())),
6731 },
6732 PositionalToken {
6733 source: uws,
6734 offset: 222,
6735 length: 3,
6736 token: Token::Word(Word::Word("け".to_string())),
6737 },
6738 PositionalToken {
6739 source: uws,
6740 offset: 225,
6741 length: 3,
6742 token: Token::Word(Word::Word("る".to_string())),
6743 },
6744 PositionalToken {
6745 source: uws,
6746 offset: 228,
6747 length: 3,
6748 token: Token::Word(Word::Word("古".to_string())),
6749 },
6750 PositionalToken {
6751 source: uws,
6752 offset: 231,
6753 length: 3,
6754 token: Token::Word(Word::Word("代".to_string())),
6755 },
6756 PositionalToken {
6757 source: uws,
6758 offset: 234,
6759 length: 3,
6760 token: Token::Word(Word::Word("か".to_string())),
6761 },
6762 PositionalToken {
6763 source: uws,
6764 offset: 237,
6765 length: 3,
6766 token: Token::Word(Word::Word("ら".to_string())),
6767 },
6768 PositionalToken {
6769 source: uws,
6770 offset: 240,
6771 length: 3,
6772 token: Token::Word(Word::Word("中".to_string())),
6773 },
6774 PositionalToken {
6775 source: uws,
6776 offset: 243,
6777 length: 3,
6778 token: Token::Word(Word::Word("世".to_string())),
6779 },
6780 PositionalToken {
6781 source: uws,
6782 offset: 246,
6783 length: 3,
6784 token: Token::Word(Word::Word("前".to_string())),
6785 },
6786 PositionalToken {
6787 source: uws,
6788 offset: 249,
6789 length: 3,
6790 token: Token::Word(Word::Word("半".to_string())),
6791 },
6792 PositionalToken {
6793 source: uws,
6794 offset: 252,
6795 length: 3,
6796 token: Token::Word(Word::Word("に".to_string())),
6797 },
6798 PositionalToken {
6799 source: uws,
6800 offset: 255,
6801 length: 3,
6802 token: Token::Word(Word::Word("か".to_string())),
6803 },
6804 PositionalToken {
6805 source: uws,
6806 offset: 258,
6807 length: 3,
6808 token: Token::Word(Word::Word("け".to_string())),
6809 },
6810 PositionalToken {
6811 source: uws,
6812 offset: 261,
6813 length: 3,
6814 token: Token::Word(Word::Word("て".to_string())),
6815 },
6816 PositionalToken {
6817 source: uws,
6818 offset: 264,
6819 length: 3,
6820 token: Token::Word(Word::Word("の".to_string())),
6821 },
6822 PositionalToken {
6823 source: uws,
6824 offset: 267,
6825 length: 3,
6826 token: Token::Word(Word::Word("寺".to_string())),
6827 },
6828 PositionalToken {
6829 source: uws,
6830 offset: 270,
6831 length: 3,
6832 token: Token::Word(Word::Word("社".to_string())),
6833 },
6834 PositionalToken {
6835 source: uws,
6836 offset: 273,
6837 length: 3,
6838 token: Token::Word(Word::Word("の".to_string())),
6839 },
6840 PositionalToken {
6841 source: uws,
6842 offset: 276,
6843 length: 3,
6844 token: Token::Word(Word::Word("造".to_string())),
6845 },
6846 PositionalToken {
6847 source: uws,
6848 offset: 279,
6849 length: 3,
6850 token: Token::Word(Word::Word("営".to_string())),
6851 },
6852 PositionalToken {
6853 source: uws,
6854 offset: 282,
6855 length: 3,
6856 token: Token::Word(Word::Word("は".to_string())),
6857 },
6858 PositionalToken {
6859 source: uws,
6860 offset: 285,
6861 length: 3,
6862 token: Token::Special(Special::Punctuation('、')),
6863 },
6864 PositionalToken {
6865 source: uws,
6866 offset: 288,
6867 length: 3,
6868 token: Token::Word(Word::Word("寺".to_string())),
6869 },
6870 PositionalToken {
6871 source: uws,
6872 offset: 291,
6873 length: 3,
6874 token: Token::Word(Word::Word("社".to_string())),
6875 },
6876 ],
6877 Lang::Kor => vec![
6878 PositionalToken {
6879 source: uws,
6880 offset: 0,
6881 length: 21,
6882 token: Token::Word(Word::Word("플레이스테이션".to_string())),
6883 },
6884 PositionalToken {
6885 source: uws,
6886 offset: 21,
6887 length: 1,
6888 token: Token::Special(Special::Separator(Separator::Space)),
6889 },
6890 PositionalToken {
6891 source: uws,
6892 offset: 22,
6893 length: 3,
6894 token: Token::Word(Word::Word("은".to_string())),
6895 },
6896 PositionalToken {
6897 source: uws,
6898 offset: 25,
6899 length: 1,
6900 token: Token::Special(Special::Separator(Separator::Space)),
6901 },
6902 PositionalToken {
6903 source: uws,
6904 offset: 26,
6905 length: 6,
6906 token: Token::Word(Word::Word("소니".to_string())),
6907 },
6908 PositionalToken {
6909 source: uws,
6910 offset: 32,
6911 length: 1,
6912 token: Token::Special(Special::Separator(Separator::Space)),
6913 },
6914 PositionalToken {
6915 source: uws,
6916 offset: 33,
6917 length: 9,
6918 token: Token::Word(Word::Word("컴퓨터".to_string())),
6919 },
6920 PositionalToken {
6921 source: uws,
6922 offset: 42,
6923 length: 1,
6924 token: Token::Special(Special::Separator(Separator::Space)),
6925 },
6926 PositionalToken {
6927 source: uws,
6928 offset: 43,
6929 length: 21,
6930 token: Token::Word(Word::Word("엔터테인먼트가".to_string())),
6931 },
6932 PositionalToken {
6933 source: uws,
6934 offset: 64,
6935 length: 1,
6936 token: Token::Special(Special::Separator(Separator::Space)),
6937 },
6938 PositionalToken {
6939 source: uws,
6940 offset: 65,
6941 length: 9,
6942 token: Token::Word(Word::Word("개발한".to_string())),
6943 },
6944 PositionalToken {
6945 source: uws,
6946 offset: 74,
6947 length: 1,
6948 token: Token::Special(Special::Separator(Separator::Space)),
6949 },
6950 PositionalToken {
6951 source: uws,
6952 offset: 75,
6953 length: 3,
6954 token: Token::Word(Word::Word("세".to_string())),
6955 },
6956 PositionalToken {
6957 source: uws,
6958 offset: 78,
6959 length: 1,
6960 token: Token::Special(Special::Separator(Separator::Space)),
6961 },
6962 PositionalToken {
6963 source: uws,
6964 offset: 79,
6965 length: 6,
6966 token: Token::Word(Word::Word("번째".to_string())),
6967 },
6968 PositionalToken {
6969 source: uws,
6970 offset: 85,
6971 length: 1,
6972 token: Token::Special(Special::Separator(Separator::Space)),
6973 },
6974 PositionalToken {
6975 source: uws,
6976 offset: 86,
6977 length: 9,
6978 token: Token::Word(Word::Word("가정용".to_string())),
6979 },
6980 PositionalToken {
6981 source: uws,
6982 offset: 95,
6983 length: 1,
6984 token: Token::Special(Special::Separator(Separator::Space)),
6985 },
6986 PositionalToken {
6987 source: uws,
6988 offset: 96,
6989 length: 15,
6990 token: Token::Word(Word::Word("게임기이다".to_string())),
6991 },
6992 PositionalToken {
6993 source: uws,
6994 offset: 111,
6995 length: 1,
6996 token: Token::Special(Special::Punctuation('.')),
6997 },
6998 PositionalToken {
6999 source: uws,
7000 offset: 112,
7001 length: 1,
7002 token: Token::Special(Special::Separator(Separator::Space)),
7003 },
7004 PositionalToken {
7005 source: uws,
7006 offset: 113,
7007 length: 24,
7008 token: Token::Word(Word::Word("마이크로소프트의".to_string())),
7009 },
7010 PositionalToken {
7011 source: uws,
7012 offset: 137,
7013 length: 1,
7014 token: Token::Special(Special::Separator(Separator::Space)),
7015 },
7016 PositionalToken {
7017 source: uws,
7018 offset: 138,
7019 length: 12,
7020 token: Token::Word(Word::Word("엑스박스".to_string())),
7021 },
7022 PositionalToken {
7023 source: uws,
7024 offset: 150,
7025 length: 1,
7026 token: Token::Special(Special::Separator(Separator::Space)),
7027 },
7028 PositionalToken {
7029 source: uws,
7030 offset: 151,
7031 length: 3,
7032 token: Token::Word(Word::Number(Number::Integer(360))),
7033 },
7034 PositionalToken {
7035 source: uws,
7036 offset: 154,
7037 length: 1,
7038 token: Token::Special(Special::Punctuation(',')),
7039 },
7040 PositionalToken {
7041 source: uws,
7042 offset: 155,
7043 length: 1,
7044 token: Token::Special(Special::Separator(Separator::Space)),
7045 },
7046 PositionalToken {
7047 source: uws,
7048 offset: 156,
7049 length: 12,
7050 token: Token::Word(Word::Word("닌텐도의".to_string())),
7051 },
7052 PositionalToken {
7053 source: uws,
7054 offset: 168,
7055 length: 1,
7056 token: Token::Special(Special::Separator(Separator::Space)),
7057 },
7058 PositionalToken {
7059 source: uws,
7060 offset: 169,
7061 length: 6,
7062 token: Token::Word(Word::Word("Wii와".to_string())),
7063 },
7064 PositionalToken {
7065 source: uws,
7066 offset: 175,
7067 length: 1,
7068 token: Token::Special(Special::Separator(Separator::Space)),
7069 },
7070 PositionalToken {
7071 source: uws,
7072 offset: 176,
7073 length: 12,
7074 token: Token::Word(Word::Word("경쟁하고".to_string())),
7075 },
7076 PositionalToken {
7077 source: uws,
7078 offset: 188,
7079 length: 1,
7080 token: Token::Special(Special::Separator(Separator::Space)),
7081 },
7082 PositionalToken {
7083 source: uws,
7084 offset: 189,
7085 length: 6,
7086 token: Token::Word(Word::Word("있다".to_string())),
7087 },
7088 PositionalToken {
7089 source: uws,
7090 offset: 195,
7091 length: 1,
7092 token: Token::Special(Special::Punctuation('.')),
7093 },
7094 PositionalToken {
7095 source: uws,
7096 offset: 196,
7097 length: 1,
7098 token: Token::Special(Special::Separator(Separator::Space)),
7099 },
7100 PositionalToken {
7101 source: uws,
7102 offset: 197,
7103 length: 6,
7104 token: Token::Word(Word::Word("이전".to_string())),
7105 },
7106 PositionalToken {
7107 source: uws,
7108 offset: 203,
7109 length: 1,
7110 token: Token::Special(Special::Separator(Separator::Space)),
7111 },
7112 PositionalToken {
7113 source: uws,
7114 offset: 204,
7115 length: 12,
7116 token: Token::Word(Word::Word("제품에서".to_string())),
7117 },
7118 PositionalToken {
7119 source: uws,
7120 offset: 216,
7121 length: 1,
7122 token: Token::Special(Special::Separator(Separator::Space)),
7123 },
7124 PositionalToken {
7125 source: uws,
7126 offset: 217,
7127 length: 9,
7128 token: Token::Word(Word::Word("온라인".to_string())),
7129 },
7130 PositionalToken {
7131 source: uws,
7132 offset: 226,
7133 length: 1,
7134 token: Token::Special(Special::Separator(Separator::Space)),
7135 },
7136 PositionalToken {
7137 source: uws,
7138 offset: 227,
7139 length: 9,
7140 token: Token::Word(Word::Word("플레이".to_string())),
7141 },
7142 PositionalToken {
7143 source: uws,
7144 offset: 236,
7145 length: 1,
7146 token: Token::Special(Special::Separator(Separator::Space)),
7147 },
7148 PositionalToken {
7149 source: uws,
7150 offset: 237,
7151 length: 3,
7152 token: Token::Word(Word::Word("기".to_string())),
7153 },
7154 ],
7155 Lang::Ara => vec![
7156 PositionalToken {
7157 source: uws,
7158 offset: 0,
7159 length: 14,
7160 token: Token::Word(Word::Word("لشکرکشی".to_string())),
7161 },
7162 PositionalToken {
7163 source: uws,
7164 offset: 14,
7165 length: 3,
7166 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
7167 },
7168 PositionalToken {
7169 source: uws,
7170 offset: 17,
7171 length: 6,
7172 token: Token::Word(Word::Word("های".to_string())),
7173 },
7174 PositionalToken {
7175 source: uws,
7176 offset: 23,
7177 length: 1,
7178 token: Token::Special(Special::Separator(Separator::Space)),
7179 },
7180 PositionalToken {
7181 source: uws,
7182 offset: 24,
7183 length: 6,
7184 token: Token::Word(Word::Word("روس".to_string())),
7185 },
7186 PositionalToken {
7187 source: uws,
7188 offset: 30,
7189 length: 3,
7190 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
7191 },
7192 PositionalToken {
7193 source: uws,
7194 offset: 33,
7195 length: 6,
7196 token: Token::Word(Word::Word("های".to_string())),
7197 },
7198 PositionalToken {
7199 source: uws,
7200 offset: 39,
7201 length: 1,
7202 token: Token::Special(Special::Separator(Separator::Space)),
7203 },
7204 PositionalToken {
7205 source: uws,
7206 offset: 40,
7207 length: 12,
7208 token: Token::Word(Word::Word("وارنگی".to_string())),
7209 },
7210 PositionalToken {
7211 source: uws,
7212 offset: 52,
7213 length: 1,
7214 token: Token::Special(Special::Separator(Separator::Space)),
7215 },
7216 PositionalToken {
7217 source: uws,
7218 offset: 53,
7219 length: 4,
7220 token: Token::Word(Word::Word("به".to_string())),
7221 },
7222 PositionalToken {
7223 source: uws,
7224 offset: 57,
7225 length: 1,
7226 token: Token::Special(Special::Separator(Separator::Space)),
7227 },
7228 PositionalToken {
7229 source: uws,
7230 offset: 58,
7231 length: 10,
7232 token: Token::Word(Word::Word("دریای".to_string())),
7233 },
7234 PositionalToken {
7235 source: uws,
7236 offset: 68,
7237 length: 1,
7238 token: Token::Special(Special::Separator(Separator::Space)),
7239 },
7240 PositionalToken {
7241 source: uws,
7242 offset: 69,
7243 length: 6,
7244 token: Token::Word(Word::Word("خزر".to_string())),
7245 },
7246 PositionalToken {
7247 source: uws,
7248 offset: 75,
7249 length: 1,
7250 token: Token::Special(Special::Separator(Separator::Space)),
7251 },
7252 PositionalToken {
7253 source: uws,
7254 offset: 76,
7255 length: 12,
7256 token: Token::Word(Word::Word("مجموعه".to_string())),
7257 },
7258 PositionalToken {
7259 source: uws,
7260 offset: 88,
7261 length: 3,
7262 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
7263 },
7264 PositionalToken {
7265 source: uws,
7266 offset: 91,
7267 length: 4,
7268 token: Token::Word(Word::Word("ای".to_string())),
7269 },
7270 PositionalToken {
7271 source: uws,
7272 offset: 95,
7273 length: 1,
7274 token: Token::Special(Special::Separator(Separator::Space)),
7275 },
7276 PositionalToken {
7277 source: uws,
7278 offset: 96,
7279 length: 4,
7280 token: Token::Word(Word::Word("از".to_string())),
7281 },
7282 PositionalToken {
7283 source: uws,
7284 offset: 100,
7285 length: 1,
7286 token: Token::Special(Special::Separator(Separator::Space)),
7287 },
7288 PositionalToken {
7289 source: uws,
7290 offset: 101,
7291 length: 10,
7292 token: Token::Word(Word::Word("حملات".to_string())),
7293 },
7294 PositionalToken {
7295 source: uws,
7296 offset: 111,
7297 length: 1,
7298 token: Token::Special(Special::Separator(Separator::Space)),
7299 },
7300 PositionalToken {
7301 source: uws,
7302 offset: 112,
7303 length: 10,
7304 token: Token::Word(Word::Word("نظامی".to_string())),
7305 },
7306 PositionalToken {
7307 source: uws,
7308 offset: 122,
7309 length: 1,
7310 token: Token::Special(Special::Separator(Separator::Space)),
7311 },
7312 PositionalToken {
7313 source: uws,
7314 offset: 123,
7315 length: 4,
7316 token: Token::Word(Word::Word("در".to_string())),
7317 },
7318 PositionalToken {
7319 source: uws,
7320 offset: 127,
7321 length: 1,
7322 token: Token::Special(Special::Separator(Separator::Space)),
7323 },
7324 PositionalToken {
7325 source: uws,
7326 offset: 128,
7327 length: 6,
7328 token: Token::Word(Word::Word("بین".to_string())),
7329 },
7330 PositionalToken {
7331 source: uws,
7332 offset: 134,
7333 length: 1,
7334 token: Token::Special(Special::Separator(Separator::Space)),
7335 },
7336 PositionalToken {
7337 source: uws,
7338 offset: 135,
7339 length: 6,
7340 token: Token::Word(Word::Word("سال".to_string())),
7341 },
7342 PositionalToken {
7343 source: uws,
7344 offset: 141,
7345 length: 3,
7346 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
7347 },
7348 PositionalToken {
7349 source: uws,
7350 offset: 144,
7351 length: 6,
7352 token: Token::Word(Word::Word("های".to_string())),
7353 },
7354 PositionalToken {
7355 source: uws,
7356 offset: 150,
7357 length: 1,
7358 token: Token::Special(Special::Separator(Separator::Space)),
7359 },
7360 PositionalToken {
7361 source: uws,
7362 offset: 151,
7363 length: 6,
7364 token: Token::Word(Word::StrangeWord("۸۶۴".to_string())),
7365 },
7366 PositionalToken {
7367 source: uws,
7368 offset: 157,
7369 length: 1,
7370 token: Token::Special(Special::Separator(Separator::Space)),
7371 },
7372 PositionalToken {
7373 source: uws,
7374 offset: 158,
7375 length: 4,
7376 token: Token::Word(Word::Word("تا".to_string())),
7377 },
7378 PositionalToken {
7379 source: uws,
7380 offset: 162,
7381 length: 1,
7382 token: Token::Special(Special::Separator(Separator::Space)),
7383 },
7384 PositionalToken {
7385 source: uws,
7386 offset: 163,
7387 length: 8,
7388 token: Token::Word(Word::StrangeWord("۱۰۴۱".to_string())),
7389 },
7390 PositionalToken {
7391 source: uws,
7392 offset: 171,
7393 length: 1,
7394 token: Token::Special(Special::Separator(Separator::Space)),
7395 },
7396 PositionalToken {
7397 source: uws,
7398 offset: 172,
7399 length: 12,
7400 token: Token::Word(Word::Word("میلادی".to_string())),
7401 },
7402 PositionalToken {
7403 source: uws,
7404 offset: 184,
7405 length: 1,
7406 token: Token::Special(Special::Separator(Separator::Space)),
7407 },
7408 PositionalToken {
7409 source: uws,
7410 offset: 185,
7411 length: 2,
7412 token: Token::Word(Word::Word("ب".to_string())),
7413 },
7414 ],
7415 Lang::Ell => vec![
7416 PositionalToken {
7417 source: uws,
7418 offset: 0,
7419 length: 4,
7420 token: Token::Word(Word::Word("Το".to_string())),
7421 },
7422 PositionalToken {
7423 source: uws,
7424 offset: 4,
7425 length: 1,
7426 token: Token::Special(Special::Separator(Separator::Space)),
7427 },
7428 PositionalToken {
7429 source: uws,
7430 offset: 5,
7431 length: 18,
7432 token: Token::Word(Word::Word("Πρόγραμμα".to_string())),
7433 },
7434 PositionalToken {
7435 source: uws,
7436 offset: 23,
7437 length: 1,
7438 token: Token::Special(Special::Separator(Separator::Space)),
7439 },
7440 PositionalToken {
7441 source: uws,
7442 offset: 24,
7443 length: 22,
7444 token: Token::Word(Word::Word("υλοποιείται".to_string())),
7445 },
7446 PositionalToken {
7447 source: uws,
7448 offset: 46,
7449 length: 1,
7450 token: Token::Special(Special::Separator(Separator::Space)),
7451 },
7452 PositionalToken {
7453 source: uws,
7454 offset: 47,
7455 length: 4,
7456 token: Token::Word(Word::Word("εξ".to_string())),
7457 },
7458 PositionalToken {
7459 source: uws,
7460 offset: 51,
7461 length: 1,
7462 token: Token::Special(Special::Separator(Separator::Space)),
7463 },
7464 PositionalToken {
7465 source: uws,
7466 offset: 52,
7467 length: 18,
7468 token: Token::Word(Word::Word("ολοκλήρου".to_string())),
7469 },
7470 PositionalToken {
7471 source: uws,
7472 offset: 70,
7473 length: 1,
7474 token: Token::Special(Special::Separator(Separator::Space)),
7475 },
7476 PositionalToken {
7477 source: uws,
7478 offset: 71,
7479 length: 6,
7480 token: Token::Word(Word::Word("από".to_string())),
7481 },
7482 PositionalToken {
7483 source: uws,
7484 offset: 77,
7485 length: 1,
7486 token: Token::Special(Special::Separator(Separator::Space)),
7487 },
7488 PositionalToken {
7489 source: uws,
7490 offset: 78,
7491 length: 16,
7492 token: Token::Word(Word::Word("απόσταση".to_string())),
7493 },
7494 PositionalToken {
7495 source: uws,
7496 offset: 94,
7497 length: 1,
7498 token: Token::Special(Special::Separator(Separator::Space)),
7499 },
7500 PositionalToken {
7501 source: uws,
7502 offset: 95,
7503 length: 6,
7504 token: Token::Word(Word::Word("και".to_string())),
7505 },
7506 PositionalToken {
7507 source: uws,
7508 offset: 101,
7509 length: 1,
7510 token: Token::Special(Special::Separator(Separator::Space)),
7511 },
7512 PositionalToken {
7513 source: uws,
7514 offset: 102,
7515 length: 12,
7516 token: Token::Word(Word::Word("μπορεί".to_string())),
7517 },
7518 PositionalToken {
7519 source: uws,
7520 offset: 114,
7521 length: 1,
7522 token: Token::Special(Special::Separator(Separator::Space)),
7523 },
7524 PositionalToken {
7525 source: uws,
7526 offset: 115,
7527 length: 4,
7528 token: Token::Word(Word::Word("να".to_string())),
7529 },
7530 PositionalToken {
7531 source: uws,
7532 offset: 119,
7533 length: 1,
7534 token: Token::Special(Special::Separator(Separator::Space)),
7535 },
7536 PositionalToken {
7537 source: uws,
7538 offset: 120,
7539 length: 20,
7540 token: Token::Word(Word::Word("συμμετέχει".to_string())),
7541 },
7542 PositionalToken {
7543 source: uws,
7544 offset: 140,
7545 length: 1,
7546 token: Token::Special(Special::Separator(Separator::Space)),
7547 },
7548 PositionalToken {
7549 source: uws,
7550 offset: 141,
7551 length: 8,
7552 token: Token::Word(Word::Word("κάθε".to_string())),
7553 },
7554 PositionalToken {
7555 source: uws,
7556 offset: 149,
7557 length: 1,
7558 token: Token::Special(Special::Separator(Separator::Space)),
7559 },
7560 PositionalToken {
7561 source: uws,
7562 offset: 150,
7563 length: 24,
7564 token: Token::Word(Word::Word("εμπλεκόμενος".to_string())),
7565 },
7566 PositionalToken {
7567 source: uws,
7568 offset: 174,
7569 length: 1,
7570 token: Token::Special(Special::Separator(Separator::Space)),
7571 },
7572 PositionalToken {
7573 source: uws,
7574 offset: 175,
7575 length: 6,
7576 token: Token::Word(Word::Word("στη".to_string())),
7577 },
7578 PositionalToken {
7579 source: uws,
7580 offset: 181,
7581 length: 1,
7582 token: Token::Special(Special::Separator(Separator::Space)),
7583 },
7584 PositionalToken {
7585 source: uws,
7586 offset: 182,
7587 length: 2,
7588 token: Token::Word(Word::Word("ή".to_string())),
7589 },
7590 PositionalToken {
7591 source: uws,
7592 offset: 184,
7593 length: 1,
7594 token: Token::Special(Special::Punctuation('/')),
7595 },
7596 ],
7597 };
7598 (
7599 uws.chars()
7600 .take(100)
7601 .fold(String::new(), |acc, c| acc + &format!("{}", c)),
7602 tokens,
7603 )
7604 }
7605}