1use std::{fmt, sync::Arc};
2use text_parsing::{Breaker, IntoSource, Local, Localize, Snip, Source, SourceEvent};
3
4mod emoji;
5pub use emoji::EMOJIMAP;
6
7mod breakers;
8pub use breakers::{SentenceBreaker, UnicodeSentenceBreaker};
9
10mod numbers;
11pub use numbers::NumberNotation;
12
13mod wordbreaker;
14
15mod options;
16pub use options::{IntoTokenizer, TokenizerOptions, TokenizerParams};
17
18mod tokens;
19pub use tokens::Tokens;
20
21mod text_tokens;
22use text_tokens::InnerBound;
23pub use text_tokens::TextTokens;
24
25#[cfg(test)]
26mod test {
27 mod numbers_ru_en;
28}
29
30#[derive(Debug)]
31pub enum Error {
32 TextParser(text_parsing::Error),
33}
34
35pub const EPS: f64 = 1e-8;
36
37#[cfg(feature = "strings")]
38#[derive(Debug, Clone, PartialEq, PartialOrd)]
39pub enum Number {
40 Integer(i64),
41 Float(f64),
42 ZeroInteger { i: i64, s: String },
44}
45
46#[cfg(not(feature = "strings"))]
47#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
48pub enum Number {
49 Integer(i64),
50 Float(f64),
51 ZeroInteger { i: i64 },
52}
53
54impl Number {
55 pub fn as_f64(&self) -> f64 {
56 match self {
57 Number::Integer(i) => *i as f64,
58 Number::Float(f) => *f,
59 Number::ZeroInteger { i, .. } => *i as f64,
60 }
61 }
62}
63impl Ord for Number {
64 fn cmp(&self, other: &Number) -> std::cmp::Ordering {
65 let s = self.as_f64();
66 let o = other.as_f64();
67 let d = s - o;
68 match d.abs() < EPS {
69 true => std::cmp::Ordering::Equal,
70 false => {
71 if d > 0.0 {
72 return std::cmp::Ordering::Greater;
73 }
74 if d < 0.0 {
75 return std::cmp::Ordering::Less;
76 }
77 std::cmp::Ordering::Equal
78 }
79 }
80 }
81}
82impl Eq for Number {}
83
84#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
85pub enum Separator {
86 Space,
87 Tab,
88 Newline,
89 Char(char),
90}
91
92#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
93pub enum Formatter {
94 Char(char),
95 Joiner, }
97
98#[derive(Debug, Clone, Copy, PartialEq, Eq, Ord, PartialOrd)]
99pub enum Special {
100 Currency(char),
101 Punctuation(char),
102 Symbol(char),
103 Separator(Separator),
104}
105
106#[cfg(feature = "strings")]
107#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
108pub enum Word {
109 Word(String),
110 StrangeWord(String),
111 Numerical(Numerical),
112 Number(Number),
113 Emoji(&'static str),
114}
115
116#[cfg(feature = "strings")]
117#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
118pub enum Numerical {
119 DotSeparated(String),
123 Measures(String),
124 Alphanumeric(String),
125}
126
127#[cfg(feature = "strings")]
128#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
129pub enum Struct {
130 Hashtag(String),
131 Mention(String),
132 }
134
135#[cfg(feature = "strings")]
136#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
137pub enum Unicode {
138 String(String),
139 Formatter(Formatter),
140}
141
142#[cfg(not(feature = "strings"))]
143#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
144pub enum Word {
145 Word,
146 StrangeWord,
147 Numerical(Numerical),
148 Number(Number),
149 Emoji(&'static str),
150}
151
152#[cfg(not(feature = "strings"))]
153#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
154pub enum Numerical {
155 DotSeparated,
159 Measures,
160 Alphanumeric,
161}
162
163#[cfg(not(feature = "strings"))]
164#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
165pub enum Struct {
166 Hashtag,
167 Mention,
168 }
170
171#[cfg(not(feature = "strings"))]
172#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
173pub enum Unicode {
174 String,
175 Formatter(Formatter),
176}
177
178#[cfg(feature = "strings")]
179#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
180pub enum Token {
181 Word(Word),
182 Struct(Struct),
183 Special(Special),
184 Unicode(Unicode),
185}
186
187#[cfg(not(feature = "strings"))]
188#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
189pub enum Token {
190 Word(Word),
191 Struct(Struct),
192 Special(Special),
193 Unicode(Unicode),
194}
195
196#[derive(Debug)]
210pub struct TextStr<'s> {
211 buffer: &'s str,
212 localities: Arc<Vec<TextLocality>>,
213 breakers: Arc<Vec<InnerBound>>,
214}
215impl<'s> TextStr<'s> {
216 pub fn new<'a>(s: &'a str) -> Result<TextStr<'a>, Error> {
217 let text = inner_new(s.into_source(), false)?;
218 Ok(TextStr {
219 buffer: s,
220 localities: text.localities,
221 breakers: text.breakers,
222 })
223 }
224}
225
226fn inner_new<S: Source>(mut source: S, with_buffer: bool) -> Result<Text, Error> {
227 let mut buffer = String::new();
228 let mut localities = Vec::new();
229 let mut breakers = Vec::new();
230 let mut buffer_len = 0;
231
232 while let Some(local_se) = source.next_char().map_err(Error::TextParser)? {
233 let (local, se) = local_se.into_inner();
234 let c = match se {
235 SourceEvent::Char(c) => match c {
236 '\u{0060}' => '\u{0027}',
237 _ => c,
238 },
239 SourceEvent::Breaker(b) => {
240 let (c, opt_b) = match b {
241 Breaker::None => continue,
242 Breaker::Space => (' ', None),
243 Breaker::Line => ('\n', None),
244 Breaker::Word => ('\u{200B}', Some(b)), Breaker::Sentence | Breaker::Paragraph | Breaker::Section => ('\n', Some(b)),
246 };
247 if let Some(b) = opt_b {
248 let br = InnerBound {
249 bytes: Snip {
250 offset: buffer_len,
251 length: c.len_utf8(),
252 },
253 chars: Snip {
254 offset: localities.len(),
255 length: 1,
256 },
257 breaker: b,
258 original: Some(local),
259 };
260 breakers.push(br);
262 }
263 c
264 }
265 };
266
267 let buf_local = ().localize(
268 Snip {
269 offset: localities.len(),
271 length: 1,
272 },
273 Snip {
274 offset: buffer_len,
276 length: c.len_utf8(),
277 },
278 );
279 if with_buffer {
280 buffer.push(c);
281 }
282 buffer_len += c.len_utf8();
283 localities.push(TextLocality {
284 buffer: buf_local,
285 original: local,
286 });
287 }
288 Ok(Text {
289 buffer: Arc::new(buffer),
290 localities: Arc::new(localities),
291 breakers: Arc::new(breakers),
292 })
293}
294
295#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
296pub struct TextLocality {
297 pub buffer: Local<()>,
298 pub original: Local<()>,
299}
300
301#[derive(Debug)]
302pub struct Text {
303 buffer: Arc<String>,
304 localities: Arc<Vec<TextLocality>>,
305 breakers: Arc<Vec<InnerBound>>,
306}
307impl Text {
308 pub fn new<S: Source>(source: S) -> Result<Text, Error> {
309 inner_new(source, true)
310 }
311 pub fn token_text<'s>(&'s self, token: &TextToken) -> &'s str {
312 let Snip {
313 offset: begin,
314 length: len,
315 } = token.locality.bytes();
316 let end = begin + len;
317 &self.buffer[begin..end]
318 }
319 pub fn text(&self) -> &str {
320 self.buffer.as_ref()
321 }
322 pub fn original_locality(&self, idx: usize) -> Option<Local<()>> {
323 self.localities.get(idx).map(|tl| tl.original)
324 }
325 pub fn localities(&self) -> &Vec<TextLocality> {
326 self.localities.as_ref()
327 }
328 pub fn shared_text(&self) -> Text {
329 Text {
330 buffer: self.buffer.clone(),
331 localities: self.localities.clone(),
332 breakers: self.breakers.clone(),
333 }
334 }
335}
336
337impl TryFrom<String> for Text {
338 type Error = Error;
339
340 fn try_from(s: String) -> Result<Text, Error> {
341 let mut text = inner_new((&s).into_source(), false)?;
342 text.buffer = Arc::new(s);
343 Ok(text)
344 }
345}
346
347impl TryFrom<&str> for Text {
348 type Error = Error;
349
350 fn try_from(s: &str) -> Result<Text, Error> {
351 Text::new(s.into_source())
352 }
353}
354
355#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
356pub enum Bound {
357 Sentence,
358 Paragraph,
359 Section,
360}
361
362#[cfg(feature = "strings")]
363#[derive(Clone, PartialEq, PartialOrd, Eq, Ord)]
364pub struct TextToken {
365 locality: Local<()>,
366 original: Option<Local<()>>,
367 pub token: Token2,
368}
369
370#[cfg(not(feature = "strings"))]
371#[derive(Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
372pub struct TextToken {
373 locality: Local<()>,
374 original: Option<Local<()>>,
375 pub token: Token2,
376}
377
378impl fmt::Debug for TextToken {
379 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
380 write!(
381 f,
382 "TextToken {{ local: {:?} [{:?}] }}, ",
383 self.locality.bytes(),
384 self.locality.chars()
385 )?;
386 match &self.original {
387 Some(orig) => write!(f, "orig: {:?} [{:?}], ", orig.bytes(), orig.chars())?,
388 None => {}
389 }
390 write!(f, "token: {:?} }}", self.token)
391 }
392}
393
394#[cfg(test)]
395impl TextToken {
396 fn into_original_token_1(self) -> Option<Local<Token>> {
397 match self.original {
398 Some(original) => self.token.into_token().map(|t| original.local(t)),
399 None => None,
400 }
401 }
402}
403
404impl TextToken {
405 pub fn local(&self) -> Local<()> {
406 self.locality
407 }
408 pub fn original(&self) -> Option<Local<()>> {
409 self.original
410 }
411 pub fn into_position(mut self) -> TextToken {
412 self.locality = self.locality.into_position();
413 self.original = self.original.map(|or| or.into_position());
414 self
415 }
416 pub fn try_as_token(&self) -> Result<Token, Bound> {
417 self.token.try_as_token()
418 }
419 pub fn as_original_token(&self) -> Option<Local<&Token2>> {
420 self.original.map(|original| original.local(&self.token))
421 }
422 pub fn into_original_token(self) -> Option<Local<Token2>> {
423 self.original.map(|original| original.local(self.token))
424 }
425 pub fn original_str<'s>(&self, original: &'s str) -> Result<&'s str, OriginalError> {
426 match self.original {
427 Some(local) => {
428 let Snip {
429 offset: begin,
430 length: len,
431 } = local.bytes();
432 let end = begin + len;
433 match original.get(begin..end) {
434 Some(s) => Ok(s),
435 None => Err(OriginalError::InvalidSnip),
436 }
437 }
438 None => Err(OriginalError::NoOriginal),
439 }
440 }
441
442 #[cfg(feature = "strings")]
443 fn token_clone(&self) -> Token2 {
444 self.token.clone()
445 }
446
447 #[cfg(not(feature = "strings"))]
448 fn token_clone(&self) -> Token2 {
449 self.token
450 }
451
452 pub fn merge_tokens(
453 &self,
454 other: &TextToken,
455 new_token: Option<Token2>,
456 ) -> Result<TextToken, TextToken> {
457 let (local, left_lb, left_lc) = add_local(&self.locality, &other.locality);
458 let must_be_left = left_lb;
459 let mut ok = must_be_left == left_lc;
460 let orig = match (&self.original, &other.original) {
461 (None, None) => None,
462 (Some(o), None) | (None, Some(o)) => Some(*o),
463 (Some(s), Some(o)) => {
464 let (orig, lb, lc) = add_local(s, o);
465 ok &= must_be_left == lb;
466 ok &= must_be_left == lc;
467 Some(orig)
468 }
469 };
470 let token = TextToken {
471 locality: local,
472 original: orig,
473 token: match new_token {
474 Some(t) => t,
475 None => self.token_clone(),
476 },
477 };
478 match ok {
479 true => Ok(token),
480 false => Err(token),
481 }
482 }
483}
484
485fn add_local(slf: &Local<()>, other: &Local<()>) -> (Local<()>, bool, bool) {
486 let b1 = slf.bytes();
488 let b2 = other.bytes();
489 let c1 = slf.chars();
490 let c2 = other.chars();
491 let (bytes, slf_is_left_by_bytes) = match b1.offset < b2.offset {
492 true => (
493 Snip {
494 offset: b1.offset,
495 length: (b2.offset + b2.length) - b1.offset,
496 },
497 true,
498 ),
499 false => (
500 Snip {
501 offset: b2.offset,
502 length: (b1.offset + b1.length) - b2.offset,
503 },
504 false,
505 ),
506 };
507 let (chars, slf_is_left_by_chars) = match c1.offset < c2.offset {
508 true => (
509 Snip {
510 offset: c1.offset,
511 length: (c2.offset + c2.length) - c1.offset,
512 },
513 true,
514 ),
515 false => (
516 Snip {
517 offset: c2.offset,
518 length: (c1.offset + c1.length) - c2.offset,
519 },
520 false,
521 ),
522 };
523 (
524 ().localize(chars, bytes),
525 slf_is_left_by_bytes,
526 slf_is_left_by_chars,
527 )
528}
529
530impl TextToken {
531 pub fn test_token(lt: Local<Token2>) -> TextToken {
532 let (local, token) = lt.into_inner();
533 TextToken {
534 locality: local,
535 original: Some(local.local(())),
536 token,
537 }
538 }
539 pub fn test_new(token: Token2, local: Local<()>, original: Option<Local<()>>) -> TextToken {
540 TextToken {
541 locality: local,
542 original,
543 token,
544 }
545 }
546}
547
548#[derive(Debug)]
575pub enum OriginalError {
576 NoOriginal,
577 InvalidSnip,
578}
579
580#[cfg(feature = "strings")]
588#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
589pub enum Token2 {
590 Word(Word),
591 Struct(Struct),
592 Special(Special),
593 Unicode(Unicode),
594
595 Bound(Bound),
596}
597#[cfg(not(feature = "strings"))]
598#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
599pub enum Token2 {
600 Word(Word),
601 Struct(Struct),
602 Special(Special),
603 Unicode(Unicode),
604
605 Bound(Bound),
606}
607impl From<Token> for Token2 {
608 fn from(t: Token) -> Token2 {
609 match t {
610 Token::Word(w) => Token2::Word(w),
611 Token::Struct(s) => Token2::Struct(s),
612 Token::Special(s) => Token2::Special(s),
613 Token::Unicode(u) => Token2::Unicode(u),
614 }
615 }
616}
617impl Token2 {
618 #[cfg(not(feature = "strings"))]
619 fn try_as_token(&self) -> Result<Token, Bound> {
620 (*self).try_into_token()
621 }
622
623 #[cfg(feature = "strings")]
624 fn try_as_token(&self) -> Result<Token, Bound> {
625 self.clone().try_into_token()
626 }
627
628 fn try_into_token(self) -> Result<Token, Bound> {
629 match self {
630 Token2::Word(w) => Ok(Token::Word(w)),
631 Token2::Struct(s) => Ok(Token::Struct(s)),
632 Token2::Special(s) => Ok(Token::Special(s)),
633 Token2::Unicode(u) => Ok(Token::Unicode(u)),
634 Token2::Bound(b) => Err(b),
635 }
636 }
637}
638#[cfg(test)]
639impl Token2 {
640 fn into_token(self) -> Option<Token> {
641 match self {
642 Token2::Word(w) => Some(Token::Word(w)),
643 Token2::Struct(s) => Some(Token::Struct(s)),
644 Token2::Special(s) => Some(Token::Special(s)),
645 Token2::Unicode(u) => Some(Token::Unicode(u)),
646 Token2::Bound(_) => None,
647 }
648 }
649}
650
651#[cfg(test)]
652#[cfg(not(feature = "strings"))]
653mod test_no_strings {
654 use super::*;
655 use text_parsing::{
656 IntoPipeParser, IntoSource, Localize, ParserExt, SourceExt, entities, tagger,
657 };
658
659 fn check_results(result: &Vec<Local<Token>>, lib_res: &Vec<Local<Token>>, _uws: &str) {
660 assert_eq!(result.len(), lib_res.len());
661 for i in 0..result.len() {
662 let res: Local<Token> = result[i].clone().into();
663 assert_eq!(res, lib_res[i]);
664 }
665 }
666
667 fn symbols() {
669 let uws = "Сибирь Арене 17 30 от 2560₽ 😀";
670 let lib_res = uws
673 .into_tokenizer(TokenizerParams::v1())
674 .collect::<Vec<_>>();
675 for t in lib_res {
677 println!("{:?}", t);
678 }
679 panic!()
680 }
681}
682
683#[cfg(test)]
684mod test_v0_5 {
685 use super::*;
686 use text_parsing::{IntoPipeParser, IntoSource, ParserExt, SourceExt, entities, tagger};
687
688 fn basic() {
690 let uws = "<p>Oxana Putan shared the quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc.</p><p> qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово</p>";
705 let text = Text::new({
706 uws.into_source()
707 .pipe(tagger::Builder::new().create().into_breaker())
708 .pipe(entities::Builder::new().create().into_piped())
709 .into_separator()
710 })
711 .unwrap();
712 let lib_res = text
713 .into_tokenizer({
714 TokenizerParams::default()
715 .add_option(TokenizerOptions::SplitDot)
716 .add_option(TokenizerOptions::SplitUnderscore)
717 .add_option(TokenizerOptions::SplitColon)
718 .with_default_sentences()
719 })
720 .collect::<Vec<_>>();
721
722 for tok in lib_res {
723 println!(
724 "C{:?}, B{:?}, {:?} -> {:?}",
725 tok.original.map(|loc| loc.chars()),
726 tok.original.map(|loc| loc.bytes()),
727 tok.token,
728 tok.original_str(uws)
729 );
730 }
731
732 panic!()
733 }
734}
735
736#[cfg(test)]
737#[cfg(feature = "strings")]
738mod test_strings {
739 use super::*;
740 use text_parsing::{
741 IntoPipeParser, IntoSource, Localize, ParserExt, SourceExt, entities, tagger,
742 };
743
744 #[allow(dead_code)]
763 fn print_result(lib_res: &Vec<Local<Token>>) {
764 for lt in lib_res {
769 println!("{:?}", lt);
770 }
771 }
772 #[derive(Debug, Clone)]
801 struct CharToken {
802 byte_offset: usize,
803 byte_length: usize,
804 char_offset: usize,
805 char_length: usize,
806 token: Token,
807 }
808 impl Into<Local<Token>> for CharToken {
809 fn into(self) -> Local<Token> {
810 self.token.localize(
811 Snip {
812 offset: self.char_offset,
813 length: self.char_length,
814 },
815 Snip {
816 offset: self.byte_offset,
817 length: self.byte_length,
818 },
819 )
820 }
821 }
822
823 #[derive(Debug, Clone)]
824 struct PositionalToken {
825 source: &'static str,
826 offset: usize,
827 length: usize,
828 token: Token,
829 }
830 impl Into<Local<Token>> for PositionalToken {
831 fn into(self) -> Local<Token> {
832 self.token.localize(
833 Snip {
834 offset: self.source[..self.offset].chars().count(),
835 length: self.source[self.offset..self.offset + self.length]
836 .chars()
837 .count(),
838 },
839 Snip {
840 offset: self.offset,
841 length: self.length,
842 },
843 )
844 }
845 }
846
847 fn check_results(result: &Vec<PositionalToken>, lib_res: &Vec<Local<Token>>, _uws: &str) {
848 assert_eq!(result.len(), lib_res.len());
849 for i in 0..result.len() {
850 let res: Local<Token> = result[i].clone().into();
851 assert_eq!(res, lib_res[i]);
852 }
853 }
854
855 fn check_cresults(result: &Vec<CharToken>, lib_res: &Vec<Local<Token>>, _uws: &str) {
856 assert_eq!(result.len(), lib_res.len());
857 for i in 0..result.len() {
858 let res: Local<Token> = result[i].clone().into();
859 assert_eq!(res, lib_res[i]);
860 }
861 }
862
863 fn check<T: Clone + std::fmt::Debug + Into<Local<Token>>>(
864 res: &Vec<T>,
865 lib: &Vec<Local<Token>>,
866 _uws: &str,
867 ) {
868 let mut lib = lib.iter();
869 let mut res = res.iter().map(|r| {
870 let res: Local<Token> = r.clone().into();
871 res
872 });
873 let mut diff = Vec::new();
874 loop {
875 match (lib.next(), res.next()) {
876 (Some(lw), Some(rw)) => {
877 if *lw != rw {
878 diff.push(format!("LIB: {:?}", lw));
879 diff.push(format!("TEST: {:?}", rw));
880 diff.push("".to_string())
881 }
882 }
883 (Some(lw), None) => {
884 diff.push(format!("LIB: {:?}", lw));
885 diff.push("TEST: ----".to_string());
886 diff.push("".to_string())
887 }
888 (None, Some(rw)) => {
889 diff.push("LIB: ----".to_string());
890 diff.push(format!("TEST: {:?}", rw));
891 diff.push("".to_string())
892 }
893 (None, None) => break,
894 }
895 }
896 if diff.len() > 0 {
897 for ln in &diff {
898 println!("{}", ln);
899 }
900 panic!("Diff count: {}", diff.len() / 3);
901 }
902 }
903
904 #[test]
905 #[rustfmt::skip]
906 fn custom_numbers() {
907 let uws = "115,7 123,398,398 2,123.45 0,05%";
908 let result = vec![
909 PositionalToken { source: uws, offset: 0, length: 5, token: Token::Word(Word::Number(Number::Float(115.7))) },
910 PositionalToken { source: uws, offset: 5, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
911 PositionalToken { source: uws, offset: 6, length: 11, token: Token::Word(Word::Number(Number::Integer(123398398))) },
912 PositionalToken { source: uws, offset: 17, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
913 PositionalToken { source: uws, offset: 18, length: 8, token: Token::Word(Word::Number(Number::Float(2123.45))) },
914 PositionalToken { source: uws, offset: 26, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
915 PositionalToken { source: uws, offset: 27, length: 4, token: Token::Word(Word::Number(Number::Float(0.05))) },
916 PositionalToken { source: uws, offset: 31, length: 1, token: Token::Special(Special::Punctuation('%')) },
917 ];
918 let lib_res = uws
919 .into_tokenizer(TokenizerParams::v1())
920 .collect::<Vec<_>>();
921 check_results(&result, &lib_res, uws);
923 }
924
925 #[test]
926 #[rustfmt::skip]
927 fn custom_numbers_ftoi() {
928 let uws = "1.1 10.0000";
929 let result = vec![
930 PositionalToken { source: uws, offset: 0, length: 3, token: Token::Word(Word::Number(Number::Float(1.1))) },
931 PositionalToken { source: uws, offset: 3, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
932 PositionalToken { source: uws, offset: 4, length: 7, token: Token::Word(Word::Number(Number::Integer(10))) },
933 ];
934 let lib_res = uws
935 .into_tokenizer(TokenizerParams::v1())
936 .collect::<Vec<_>>();
937 check_results(&result, &lib_res, uws);
939 }
940
941 #[test]
942 #[rustfmt::skip]
943 fn custom_numbers_en_1() {
944 let uws = "1.1 10,000";
945 let result = vec![
946 PositionalToken { source: uws, offset: 0, length: 3, token: Token::Word(Word::Number(Number::Float(1.1))) },
947 PositionalToken { source: uws, offset: 3, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
948 PositionalToken { source: uws, offset: 4, length: 6, token: Token::Word(Word::Number(Number::Integer(10000))) },
949 ];
950 let lib_res = uws
951 .into_tokenizer(TokenizerParams::v1())
952 .collect::<Vec<_>>();
953 check_results(&result, &lib_res, uws);
955 }
956
957 #[test]
958 #[rustfmt::skip]
959 fn custom_numbers_en_2() {
960 let uws = "1,000.1 10,000";
961 let result = vec![
962 PositionalToken { source: uws, offset: 0, length: 7, token: Token::Word(Word::Number(Number::Float(1000.1))) },
963 PositionalToken { source: uws, offset: 7, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
964 PositionalToken { source: uws, offset: 8, length: 6, token: Token::Word(Word::Number(Number::Integer(10000))) },
965 ];
966 let lib_res = uws
967 .into_tokenizer(TokenizerParams::v1())
968 .collect::<Vec<_>>();
969 check_results(&result, &lib_res, uws);
971 }
972
973 #[test]
974 #[rustfmt::skip]
975 fn custom_numbers_ru_1() {
976 let uws = "1.1 10,001";
977 let result = vec![
978 PositionalToken { source: uws, offset: 0, length: 3, token: Token::Word(Word::Number(Number::Float(1.1))) },
979 PositionalToken { source: uws, offset: 3, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
980 PositionalToken { source: uws, offset: 4, length: 6, token: Token::Word(Word::Number(Number::Integer(10001))) }, ];
982 let lib_res = uws
983 .into_tokenizer(TokenizerParams::v1().add_option(TokenizerOptions::NumberUnknownComaAsDot))
984 .collect::<Vec<_>>();
985 check_results(&result, &lib_res, uws);
987 }
988
989 #[test]
990 #[rustfmt::skip]
991 fn custom_numbers_ru_2() {
992 let uws = "1,1 10,001";
993 let result = vec![
994 PositionalToken { source: uws, offset: 0, length: 3, token: Token::Word(Word::Number(Number::Float(1.1))) },
995 PositionalToken { source: uws, offset: 3, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
996 PositionalToken { source: uws, offset: 4, length: 6, token: Token::Word(Word::Number(Number::Integer(10001))) }, ];
998 let lib_res = uws
999 .into_tokenizer(TokenizerParams::v1())
1000 .collect::<Vec<_>>();
1001 check_results(&result, &lib_res, uws);
1003 }
1004
1005 #[test]
1006 #[rustfmt::skip]
1007 fn custom_numbers_ru_3() {
1008 let uws = "10000,1 10,001";
1009 let result = vec![
1010 PositionalToken { source: uws, offset: 0, length: 7, token: Token::Word(Word::Number(Number::Float(10000.1))) },
1011 PositionalToken { source: uws, offset: 7, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
1012 PositionalToken { source: uws, offset: 8, length: 6, token: Token::Word(Word::Number(Number::Integer(10001))) }, ];
1014 let lib_res = uws
1015 .into_tokenizer(TokenizerParams::v1())
1016 .collect::<Vec<_>>();
1017 check_results(&result, &lib_res, uws);
1019 }
1020
1021 #[test]
1022 #[rustfmt::skip]
1023 fn currency() {
1024 let uws = "$ ₽ € ¥";
1025 let result = vec![
1026 PositionalToken { source: uws, offset: 0, length: 1, token: Token::Special(Special::Currency('$')) },
1027 PositionalToken { source: uws, offset: 1, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
1028 PositionalToken { source: uws, offset: 2, length: 3, token: Token::Special(Special::Currency('₽')) },
1029 PositionalToken { source: uws, offset: 5, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
1030 PositionalToken { source: uws, offset: 6, length: 3, token: Token::Special(Special::Currency('€')) },
1031 PositionalToken { source: uws, offset: 9, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
1032 PositionalToken { source: uws, offset: 10, length: 2, token: Token::Special(Special::Currency('¥')) },
1033 ];
1034 let lib_res = uws
1035 .into_tokenizer(TokenizerParams::v1())
1036 .collect::<Vec<_>>();
1037 check_results(&result, &lib_res, uws);
1039 }
1041
1042 #[test]
1043 fn spaces() {
1044 let uws = " spaces too many apces ";
1045 let result = vec![
1046 PositionalToken {
1047 source: uws,
1048 offset: 0,
1049 length: 4,
1050 token: Token::Special(Special::Separator(Separator::Space)),
1051 },
1052 PositionalToken {
1053 source: uws,
1054 offset: 4,
1055 length: 6,
1056 token: Token::Word(Word::Word("spaces".to_string())),
1057 },
1058 PositionalToken {
1059 source: uws,
1060 offset: 10,
1061 length: 4,
1062 token: Token::Special(Special::Separator(Separator::Space)),
1063 },
1064 PositionalToken {
1065 source: uws,
1066 offset: 14,
1067 length: 3,
1068 token: Token::Word(Word::Word("too".to_string())),
1069 },
1070 PositionalToken {
1071 source: uws,
1072 offset: 17,
1073 length: 3,
1074 token: Token::Special(Special::Separator(Separator::Space)),
1075 },
1076 PositionalToken {
1077 source: uws,
1078 offset: 20,
1079 length: 4,
1080 token: Token::Word(Word::Word("many".to_string())),
1081 },
1082 PositionalToken {
1083 source: uws,
1084 offset: 24,
1085 length: 3,
1086 token: Token::Special(Special::Separator(Separator::Space)),
1087 },
1088 PositionalToken {
1089 source: uws,
1090 offset: 27,
1091 length: 5,
1092 token: Token::Word(Word::Word("apces".to_string())),
1093 },
1094 PositionalToken {
1095 source: uws,
1096 offset: 32,
1097 length: 3,
1098 token: Token::Special(Special::Separator(Separator::Space)),
1099 },
1100 ];
1101 let lib_res = uws
1102 .into_tokenizer(TokenizerParams::v1())
1103 .collect::<Vec<_>>();
1104 check_results(&result, &lib_res, uws);
1105 }
1107
1108 #[test]
1109 fn numbers() {
1110 let uws = "(() -2\n() -2";
1111 let result = vec![
1112 PositionalToken {
1113 source: uws,
1114 offset: 0,
1115 length: 1,
1116 token: Token::Special(Special::Punctuation('(')),
1117 },
1118 PositionalToken {
1119 source: uws,
1120 offset: 1,
1121 length: 1,
1122 token: Token::Special(Special::Punctuation('(')),
1123 },
1124 PositionalToken {
1125 source: uws,
1126 offset: 2,
1127 length: 1,
1128 token: Token::Special(Special::Punctuation(')')),
1129 },
1130 PositionalToken {
1131 source: uws,
1132 offset: 3,
1133 length: 1,
1134 token: Token::Special(Special::Separator(Separator::Space)),
1135 },
1136 PositionalToken {
1137 source: uws,
1138 offset: 4,
1139 length: 2,
1140 token: Token::Word(Word::Number(Number::Integer(-2))),
1141 },
1142 PositionalToken {
1143 source: uws,
1144 offset: 6,
1145 length: 1,
1146 token: Token::Special(Special::Separator(Separator::Newline)),
1147 },
1148 PositionalToken {
1149 source: uws,
1150 offset: 7,
1151 length: 1,
1152 token: Token::Special(Special::Punctuation('(')),
1153 },
1154 PositionalToken {
1155 source: uws,
1156 offset: 8,
1157 length: 1,
1158 token: Token::Special(Special::Punctuation(')')),
1159 },
1160 PositionalToken {
1161 source: uws,
1162 offset: 9,
1163 length: 2,
1164 token: Token::Special(Special::Separator(Separator::Space)),
1165 },
1166 PositionalToken {
1167 source: uws,
1168 offset: 11,
1169 length: 2,
1170 token: Token::Word(Word::Number(Number::Integer(-2))),
1171 },
1172 ];
1173 let lib_res = uws
1174 .into_tokenizer({
1175 TokenizerParams::default()
1176 .add_option(TokenizerOptions::SplitDot)
1177 .add_option(TokenizerOptions::SplitUnderscore)
1178 .add_option(TokenizerOptions::SplitColon)
1179 .add_option(TokenizerOptions::MergeWhites)
1180 })
1181 .collect::<Vec<_>>();
1182 check_results(&result, &lib_res, uws);
1183 }
1184
1185 #[test]
1186 fn word_with_inner_hyphens() {
1187 let uws = "Опросы показывают";
1188 let result = vec![
1189 PositionalToken {
1190 source: uws,
1191 offset: 0,
1192 length: 14,
1193 token: Token::Word(Word::StrangeWord("Опросы".to_string())),
1194 },
1195 PositionalToken {
1196 source: uws,
1197 offset: 14,
1198 length: 1,
1199 token: Token::Special(Special::Separator(Separator::Space)),
1200 },
1201 PositionalToken {
1202 source: uws,
1203 offset: 15,
1204 length: 28,
1205 token: Token::Word(Word::StrangeWord("показывают".to_string())),
1206 },
1207 ];
1208 let lib_res = uws
1209 .into_tokenizer(TokenizerParams::v1())
1210 .collect::<Vec<_>>();
1211 check_results(&result, &lib_res, uws);
1212 }
1213
1214 #[test]
1215 fn mixed_but_word() {
1216 let uws = "L’Oreal";
1217 let result = vec![PositionalToken {
1218 source: uws,
1219 offset: 0,
1220 length: 9,
1221 token: Token::Word(Word::StrangeWord("L’Oreal".to_string())),
1222 }];
1223 let lib_res = uws
1224 .into_tokenizer(TokenizerParams::v1())
1225 .collect::<Vec<_>>();
1226 check_results(&result, &lib_res, uws);
1227 }
1228
1229 #[test]
1230 fn hashtags() {
1231 let uws = "#hashtag#hashtag2";
1232 let result = vec![
1233 PositionalToken {
1234 source: uws,
1235 offset: 0,
1236 length: 8,
1237 token: Token::Struct(Struct::Hashtag("hashtag".to_string())),
1238 },
1239 PositionalToken {
1240 source: uws,
1241 offset: 8,
1242 length: 9,
1243 token: Token::Struct(Struct::Hashtag("hashtag2".to_string())),
1244 },
1245 ];
1272 let lib_res = uws
1273 .into_tokenizer(TokenizerParams::v1())
1274 .collect::<Vec<_>>();
1275 check_results(&result, &lib_res, uws);
1276 }
1277
1278 #[test]
1279 fn hashtags2() {
1280 let uws = "#hashtag#hashtag2 #hash_tag";
1281 let result = vec![
1282 PositionalToken {
1283 source: uws,
1284 offset: 0,
1285 length: 8,
1286 token: Token::Struct(Struct::Hashtag("hashtag".to_string())),
1287 },
1288 PositionalToken {
1289 source: uws,
1290 offset: 8,
1291 length: 9,
1292 token: Token::Struct(Struct::Hashtag("hashtag2".to_string())),
1293 },
1294 PositionalToken {
1295 source: uws,
1296 offset: 17,
1297 length: 1,
1298 token: Token::Special(Special::Separator(Separator::Space)),
1299 },
1300 PositionalToken {
1301 source: uws,
1302 offset: 18,
1303 length: 9,
1304 token: Token::Struct(Struct::Hashtag("hash_tag".to_string())),
1305 },
1306 ];
1307 let lib_res = uws
1308 .into_tokenizer(TokenizerParams::v1().add_option(TokenizerOptions::StructTokens))
1309 .collect::<Vec<_>>();
1310 check_results(&result, &lib_res, uws);
1311 }
1312
1313 #[test]
1314 fn mention2() {
1315 let uws = "@hashtag@hashtag2 @hash_tag";
1316 let result = vec![
1317 PositionalToken {
1318 source: uws,
1319 offset: 0,
1320 length: 8,
1321 token: Token::Struct(Struct::Mention("hashtag".to_string())),
1322 },
1323 PositionalToken {
1324 source: uws,
1325 offset: 8,
1326 length: 9,
1327 token: Token::Struct(Struct::Mention("hashtag2".to_string())),
1328 },
1329 PositionalToken {
1330 source: uws,
1331 offset: 17,
1332 length: 1,
1333 token: Token::Special(Special::Separator(Separator::Space)),
1334 },
1335 PositionalToken {
1336 source: uws,
1337 offset: 18,
1338 length: 9,
1339 token: Token::Struct(Struct::Mention("hash_tag".to_string())),
1340 },
1341 ];
1342 let lib_res = uws
1343 .into_tokenizer(TokenizerParams::v1().add_option(TokenizerOptions::StructTokens))
1344 .collect::<Vec<_>>();
1345 check_results(&result, &lib_res, uws);
1346 }
1347
1348 #[test]
1349 fn apostrophe() {
1350 let uws = "l'oreal; l\u{0060}oreal";
1351 let result = vec![
1352 PositionalToken {
1353 source: uws,
1354 offset: 0,
1355 length: 7,
1356 token: Token::Word(Word::Word("l'oreal".to_string())),
1357 },
1358 PositionalToken {
1359 source: uws,
1360 offset: 7,
1361 length: 1,
1362 token: Token::Special(Special::Punctuation(';')),
1363 },
1364 PositionalToken {
1365 source: uws,
1366 offset: 8,
1367 length: 1,
1368 token: Token::Special(Special::Separator(Separator::Space)),
1369 },
1370 PositionalToken {
1371 source: uws,
1372 offset: 9,
1373 length: 7,
1374 token: Token::Word(Word::Word("l'oreal".to_string())),
1375 },
1376 ];
1377 let text = Text::new(uws.into_source()).unwrap();
1378 let lib_res = text
1379 .into_tokenizer(TokenizerParams::v1())
1380 .filter_map(|tt| tt.into_original_token_1())
1381 .collect::<Vec<_>>();
1382 check_results(&result, &lib_res, uws);
1383 }
1384
1385 #[test]
1386 fn char_tokens() {
1387 let uws = "[Oxana Putan|1712640565] shared the quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово";
1388 let result = vec![
1389 CharToken {
1390 byte_offset: 0,
1391 byte_length: 1,
1392 char_offset: 0,
1393 char_length: 1,
1394 token: Token::Special(Special::Punctuation('[')),
1395 },
1396 CharToken {
1397 byte_offset: 1,
1398 byte_length: 5,
1399 char_offset: 1,
1400 char_length: 5,
1401 token: Token::Word(Word::Word("Oxana".to_string())),
1402 },
1403 CharToken {
1404 byte_offset: 6,
1405 byte_length: 1,
1406 char_offset: 6,
1407 char_length: 1,
1408 token: Token::Special(Special::Separator(Separator::Space)),
1409 },
1410 CharToken {
1411 byte_offset: 7,
1412 byte_length: 5,
1413 char_offset: 7,
1414 char_length: 5,
1415 token: Token::Word(Word::Word("Putan".to_string())),
1416 },
1417 CharToken {
1418 byte_offset: 12,
1419 byte_length: 1,
1420 char_offset: 12,
1421 char_length: 1,
1422 token: Token::Special(Special::Punctuation('|')),
1423 },
1424 CharToken {
1425 byte_offset: 13,
1426 byte_length: 10,
1427 char_offset: 13,
1428 char_length: 10,
1429 token: Token::Word(Word::Number(Number::Integer(1712640565))),
1430 },
1431 CharToken {
1432 byte_offset: 23,
1433 byte_length: 1,
1434 char_offset: 23,
1435 char_length: 1,
1436 token: Token::Special(Special::Punctuation(']')),
1437 },
1438 CharToken {
1446 byte_offset: 24,
1447 byte_length: 1,
1448 char_offset: 24,
1449 char_length: 1,
1450 token: Token::Special(Special::Separator(Separator::Space)),
1451 },
1452 CharToken {
1453 byte_offset: 25,
1454 byte_length: 6,
1455 char_offset: 25,
1456 char_length: 6,
1457 token: Token::Word(Word::Word("shared".to_string())),
1458 },
1459 CharToken {
1460 byte_offset: 31,
1461 byte_length: 1,
1462 char_offset: 31,
1463 char_length: 1,
1464 token: Token::Special(Special::Separator(Separator::Space)),
1465 },
1466 CharToken {
1467 byte_offset: 32,
1468 byte_length: 3,
1469 char_offset: 32,
1470 char_length: 3,
1471 token: Token::Word(Word::Word("the".to_string())),
1472 },
1473 CharToken {
1474 byte_offset: 35,
1475 byte_length: 1,
1476 char_offset: 35,
1477 char_length: 1,
1478 token: Token::Special(Special::Separator(Separator::Space)),
1479 },
1480 CharToken {
1481 byte_offset: 36,
1482 byte_length: 5,
1483 char_offset: 36,
1484 char_length: 5,
1485 token: Token::Word(Word::Word("quick".to_string())),
1486 },
1487 CharToken {
1488 byte_offset: 41,
1489 byte_length: 1,
1490 char_offset: 41,
1491 char_length: 1,
1492 token: Token::Special(Special::Separator(Separator::Space)),
1493 },
1494 CharToken {
1495 byte_offset: 42,
1496 byte_length: 1,
1497 char_offset: 42,
1498 char_length: 1,
1499 token: Token::Special(Special::Punctuation('(')),
1500 },
1501 CharToken {
1502 byte_offset: 43,
1503 byte_length: 1,
1504 char_offset: 43,
1505 char_length: 1,
1506 token: Token::Special(Special::Punctuation('"')),
1507 },
1508 CharToken {
1509 byte_offset: 44,
1510 byte_length: 5,
1511 char_offset: 44,
1512 char_length: 5,
1513 token: Token::Word(Word::Word("brown".to_string())),
1514 },
1515 CharToken {
1516 byte_offset: 49,
1517 byte_length: 1,
1518 char_offset: 49,
1519 char_length: 1,
1520 token: Token::Special(Special::Punctuation('"')),
1521 },
1522 CharToken {
1523 byte_offset: 50,
1524 byte_length: 1,
1525 char_offset: 50,
1526 char_length: 1,
1527 token: Token::Special(Special::Punctuation(')')),
1528 },
1529 CharToken {
1530 byte_offset: 51,
1531 byte_length: 1,
1532 char_offset: 51,
1533 char_length: 1,
1534 token: Token::Special(Special::Separator(Separator::Space)),
1535 },
1536 CharToken {
1537 byte_offset: 52,
1538 byte_length: 3,
1539 char_offset: 52,
1540 char_length: 3,
1541 token: Token::Word(Word::Word("fox".to_string())),
1542 },
1543 CharToken {
1544 byte_offset: 55,
1545 byte_length: 1,
1546 char_offset: 55,
1547 char_length: 1,
1548 token: Token::Special(Special::Separator(Separator::Space)),
1549 },
1550 CharToken {
1551 byte_offset: 56,
1552 byte_length: 5,
1553 char_offset: 56,
1554 char_length: 5,
1555 token: Token::Word(Word::Word("can\'t".to_string())),
1556 },
1557 CharToken {
1558 byte_offset: 61,
1559 byte_length: 1,
1560 char_offset: 61,
1561 char_length: 1,
1562 token: Token::Special(Special::Separator(Separator::Space)),
1563 },
1564 CharToken {
1565 byte_offset: 62,
1566 byte_length: 4,
1567 char_offset: 62,
1568 char_length: 4,
1569 token: Token::Word(Word::Word("jump".to_string())),
1570 },
1571 CharToken {
1572 byte_offset: 66,
1573 byte_length: 1,
1574 char_offset: 66,
1575 char_length: 1,
1576 token: Token::Special(Special::Separator(Separator::Space)),
1577 },
1578 CharToken {
1579 byte_offset: 67,
1580 byte_length: 4,
1581 char_offset: 67,
1582 char_length: 4,
1583 token: Token::Word(Word::Number(Number::Float(32.3))),
1584 },
1585 CharToken {
1586 byte_offset: 71,
1587 byte_length: 1,
1588 char_offset: 71,
1589 char_length: 1,
1590 token: Token::Special(Special::Separator(Separator::Space)),
1591 },
1592 CharToken {
1593 byte_offset: 72,
1594 byte_length: 4,
1595 char_offset: 72,
1596 char_length: 4,
1597 token: Token::Word(Word::Word("feet".to_string())),
1598 },
1599 CharToken {
1600 byte_offset: 76,
1601 byte_length: 1,
1602 char_offset: 76,
1603 char_length: 1,
1604 token: Token::Special(Special::Punctuation(',')),
1605 },
1606 CharToken {
1607 byte_offset: 77,
1608 byte_length: 1,
1609 char_offset: 77,
1610 char_length: 1,
1611 token: Token::Special(Special::Separator(Separator::Space)),
1612 },
1613 CharToken {
1614 byte_offset: 78,
1615 byte_length: 5,
1616 char_offset: 78,
1617 char_length: 5,
1618 token: Token::Word(Word::Word("right".to_string())),
1619 },
1620 CharToken {
1621 byte_offset: 83,
1622 byte_length: 1,
1623 char_offset: 83,
1624 char_length: 1,
1625 token: Token::Special(Special::Punctuation('?')),
1626 },
1627 CharToken {
1628 byte_offset: 84,
1629 byte_length: 1,
1630 char_offset: 84,
1631 char_length: 1,
1632 token: Token::Special(Special::Separator(Separator::Space)),
1633 },
1634 CharToken {
1635 byte_offset: 85,
1636 byte_length: 4,
1637 char_offset: 85,
1638 char_length: 4,
1639 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
1640 },
1641 CharToken {
1642 byte_offset: 89,
1643 byte_length: 1,
1644 char_offset: 89,
1645 char_length: 1,
1646 token: Token::Special(Special::Separator(Separator::Space)),
1647 },
1648 CharToken {
1649 byte_offset: 90,
1650 byte_length: 3,
1651 char_offset: 90,
1652 char_length: 3,
1653 token: Token::Word(Word::Word("etc".to_string())),
1654 },
1655 CharToken {
1656 byte_offset: 93,
1657 byte_length: 1,
1658 char_offset: 93,
1659 char_length: 1,
1660 token: Token::Special(Special::Punctuation('.')),
1661 },
1662 CharToken {
1663 byte_offset: 94,
1664 byte_length: 1,
1665 char_offset: 94,
1666 char_length: 1,
1667 token: Token::Special(Special::Separator(Separator::Space)),
1668 },
1669 CharToken {
1670 byte_offset: 95,
1671 byte_length: 3,
1672 char_offset: 95,
1673 char_length: 3,
1674 token: Token::Word(Word::Word("qeq".to_string())),
1675 },
1676 CharToken {
1677 byte_offset: 98,
1678 byte_length: 1,
1679 char_offset: 98,
1680 char_length: 1,
1681 token: Token::Special(Special::Separator(Separator::Space)),
1682 },
1683 CharToken {
1684 byte_offset: 99,
1685 byte_length: 5,
1686 char_offset: 99,
1687 char_length: 5,
1688 token: Token::Word(Word::Word("U.S.A".to_string())),
1689 },
1690 CharToken {
1691 byte_offset: 104,
1692 byte_length: 2,
1693 char_offset: 104,
1694 char_length: 2,
1695 token: Token::Special(Special::Separator(Separator::Space)),
1696 },
1697 CharToken {
1698 byte_offset: 106,
1699 byte_length: 3,
1700 char_offset: 106,
1701 char_length: 3,
1702 token: Token::Word(Word::Word("asd".to_string())),
1703 },
1704 CharToken {
1705 byte_offset: 109,
1706 byte_length: 3,
1707 char_offset: 109,
1708 char_length: 3,
1709 token: Token::Special(Special::Separator(Separator::Newline)),
1710 },
1711 CharToken {
1712 byte_offset: 112,
1713 byte_length: 3,
1714 char_offset: 112,
1715 char_length: 3,
1716 token: Token::Word(Word::Word("Brr".to_string())),
1717 },
1718 CharToken {
1719 byte_offset: 115,
1720 byte_length: 1,
1721 char_offset: 115,
1722 char_length: 1,
1723 token: Token::Special(Special::Punctuation(',')),
1724 },
1725 CharToken {
1726 byte_offset: 116,
1727 byte_length: 1,
1728 char_offset: 116,
1729 char_length: 1,
1730 token: Token::Special(Special::Separator(Separator::Space)),
1731 },
1732 CharToken {
1733 byte_offset: 117,
1734 byte_length: 4,
1735 char_offset: 117,
1736 char_length: 4,
1737 token: Token::Word(Word::Word("it\'s".to_string())),
1738 },
1739 CharToken {
1740 byte_offset: 121,
1741 byte_length: 1,
1742 char_offset: 121,
1743 char_length: 1,
1744 token: Token::Special(Special::Separator(Separator::Space)),
1745 },
1746 CharToken {
1747 byte_offset: 122,
1748 byte_length: 4,
1749 char_offset: 122,
1750 char_length: 4,
1751 token: Token::Word(Word::Number(Number::Float(29.3))),
1752 },
1753 CharToken {
1754 byte_offset: 126,
1755 byte_length: 2,
1756 char_offset: 126,
1757 char_length: 1,
1758 token: Token::Special(Special::Symbol('°')),
1759 },
1760 CharToken {
1761 byte_offset: 128,
1762 byte_length: 1,
1763 char_offset: 127,
1764 char_length: 1,
1765 token: Token::Word(Word::Word("F".to_string())),
1766 },
1767 CharToken {
1768 byte_offset: 129,
1769 byte_length: 1,
1770 char_offset: 128,
1771 char_length: 1,
1772 token: Token::Special(Special::Punctuation('!')),
1773 },
1774 CharToken {
1775 byte_offset: 130,
1776 byte_length: 1,
1777 char_offset: 129,
1778 char_length: 1,
1779 token: Token::Special(Special::Separator(Separator::Newline)),
1780 },
1781 CharToken {
1782 byte_offset: 131,
1783 byte_length: 1,
1784 char_offset: 130,
1785 char_length: 1,
1786 token: Token::Special(Special::Separator(Separator::Space)),
1787 },
1788 CharToken {
1789 byte_offset: 132,
1790 byte_length: 14,
1791 char_offset: 131,
1792 char_length: 7,
1793 token: Token::Word(Word::Word("Русское".to_string())),
1794 },
1795 CharToken {
1796 byte_offset: 146,
1797 byte_length: 1,
1798 char_offset: 138,
1799 char_length: 1,
1800 token: Token::Special(Special::Separator(Separator::Space)),
1801 },
1802 CharToken {
1803 byte_offset: 147,
1804 byte_length: 22,
1805 char_offset: 139,
1806 char_length: 11,
1807 token: Token::Word(Word::Word("предложение".to_string())),
1808 },
1809 CharToken {
1810 byte_offset: 169,
1811 byte_length: 1,
1812 char_offset: 150,
1813 char_length: 1,
1814 token: Token::Special(Special::Separator(Separator::Space)),
1815 },
1816 CharToken {
1817 byte_offset: 170,
1818 byte_length: 5,
1819 char_offset: 151,
1820 char_length: 5,
1821 token: Token::Struct(Struct::Hashtag("36.6".to_string())),
1822 },
1823 CharToken {
1824 byte_offset: 175,
1825 byte_length: 1,
1826 char_offset: 156,
1827 char_length: 1,
1828 token: Token::Special(Special::Separator(Separator::Space)),
1829 },
1830 CharToken {
1831 byte_offset: 176,
1832 byte_length: 6,
1833 char_offset: 157,
1834 char_length: 3,
1835 token: Token::Word(Word::Word("для".to_string())),
1836 },
1837 CharToken {
1838 byte_offset: 182,
1839 byte_length: 1,
1840 char_offset: 160,
1841 char_length: 1,
1842 token: Token::Special(Special::Separator(Separator::Space)),
1843 },
1844 CharToken {
1845 byte_offset: 183,
1846 byte_length: 24,
1847 char_offset: 161,
1848 char_length: 12,
1849 token: Token::Word(Word::Word("тестирования".to_string())),
1850 },
1851 CharToken {
1852 byte_offset: 207,
1853 byte_length: 1,
1854 char_offset: 173,
1855 char_length: 1,
1856 token: Token::Special(Special::Separator(Separator::Space)),
1857 },
1858 CharToken {
1859 byte_offset: 208,
1860 byte_length: 14,
1861 char_offset: 174,
1862 char_length: 7,
1863 token: Token::Word(Word::Word("деления".to_string())),
1864 },
1865 CharToken {
1866 byte_offset: 222,
1867 byte_length: 1,
1868 char_offset: 181,
1869 char_length: 1,
1870 token: Token::Special(Special::Separator(Separator::Space)),
1871 },
1872 CharToken {
1873 byte_offset: 223,
1874 byte_length: 4,
1875 char_offset: 182,
1876 char_length: 2,
1877 token: Token::Word(Word::Word("по".to_string())),
1878 },
1879 CharToken {
1880 byte_offset: 227,
1881 byte_length: 1,
1882 char_offset: 184,
1883 char_length: 1,
1884 token: Token::Special(Special::Separator(Separator::Space)),
1885 },
1886 CharToken {
1887 byte_offset: 228,
1888 byte_length: 12,
1889 char_offset: 185,
1890 char_length: 6,
1891 token: Token::Word(Word::Word("юникод".to_string())),
1892 },
1893 CharToken {
1894 byte_offset: 240,
1895 byte_length: 1,
1896 char_offset: 191,
1897 char_length: 1,
1898 token: Token::Special(Special::Punctuation('-')),
1899 },
1900 CharToken {
1901 byte_offset: 241,
1902 byte_length: 12,
1903 char_offset: 192,
1904 char_length: 6,
1905 token: Token::Word(Word::Word("словам".to_string())),
1906 },
1907 CharToken {
1908 byte_offset: 253,
1909 byte_length: 3,
1910 char_offset: 198,
1911 char_length: 3,
1912 token: Token::Special(Special::Punctuation('.')),
1913 },
1914 CharToken {
1915 byte_offset: 256,
1916 byte_length: 1,
1917 char_offset: 201,
1918 char_length: 1,
1919 token: Token::Special(Special::Separator(Separator::Newline)),
1920 },
1921 CharToken {
1922 byte_offset: 257,
1923 byte_length: 8,
1924 char_offset: 202,
1925 char_length: 2,
1926 token: Token::Word(Word::Emoji("russia")),
1927 },
1928 CharToken {
1929 byte_offset: 265,
1930 byte_length: 1,
1931 char_offset: 204,
1932 char_length: 1,
1933 token: Token::Special(Special::Separator(Separator::Space)),
1934 },
1935 CharToken {
1936 byte_offset: 266,
1937 byte_length: 8,
1938 char_offset: 205,
1939 char_length: 2,
1940 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
1941 },
1942 CharToken {
1943 byte_offset: 274,
1944 byte_length: 1,
1945 char_offset: 207,
1946 char_length: 1,
1947 token: Token::Special(Special::Separator(Separator::Newline)),
1948 },
1949 CharToken {
1950 byte_offset: 275,
1951 byte_length: 8,
1952 char_offset: 208,
1953 char_length: 2,
1954 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
1955 },
1956 CharToken {
1957 byte_offset: 283,
1958 byte_length: 8,
1959 char_offset: 210,
1960 char_length: 2,
1961 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
1962 },
1963 CharToken {
1964 byte_offset: 291,
1965 byte_length: 8,
1966 char_offset: 212,
1967 char_length: 2,
1968 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
1969 },
1970 CharToken {
1971 byte_offset: 299,
1972 byte_length: 1,
1973 char_offset: 214,
1974 char_length: 1,
1975 token: Token::Special(Special::Separator(Separator::Newline)),
1976 },
1977 CharToken {
1978 byte_offset: 300,
1979 byte_length: 1,
1980 char_offset: 215,
1981 char_length: 1,
1982 token: Token::Special(Special::Punctuation('+')),
1983 },
1984 CharToken {
1985 byte_offset: 301,
1986 byte_length: 4,
1987 char_offset: 216,
1988 char_length: 4,
1989 token: Token::Word(Word::Word("Done".to_string())),
1990 },
1991 CharToken {
1992 byte_offset: 305,
1993 byte_length: 1,
1994 char_offset: 220,
1995 char_length: 1,
1996 token: Token::Special(Special::Punctuation('!')),
1997 },
1998 CharToken {
1999 byte_offset: 306,
2000 byte_length: 1,
2001 char_offset: 221,
2002 char_length: 1,
2003 token: Token::Special(Special::Separator(Separator::Space)),
2004 },
2005 CharToken {
2006 byte_offset: 307,
2007 byte_length: 12,
2008 char_offset: 222,
2009 char_length: 6,
2010 token: Token::Word(Word::Word("Готово".to_string())),
2011 },
2012 ];
2013
2014 let lib_res = uws
2015 .into_tokenizer(TokenizerParams::complex())
2016 .collect::<Vec<_>>();
2017
2018 check_cresults(&result, &lib_res, uws);
2020 }
2021
2022 #[test]
2023 fn general_default() {
2024 let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
2025 let result = vec![
2026 PositionalToken {
2027 source: uws,
2028 offset: 0,
2029 length: 3,
2030 token: Token::Word(Word::Word("The".to_string())),
2031 },
2032 PositionalToken {
2033 source: uws,
2034 offset: 3,
2035 length: 1,
2036 token: Token::Special(Special::Separator(Separator::Space)),
2037 },
2038 PositionalToken {
2039 source: uws,
2040 offset: 4,
2041 length: 5,
2042 token: Token::Word(Word::Word("quick".to_string())),
2043 },
2044 PositionalToken {
2045 source: uws,
2046 offset: 9,
2047 length: 1,
2048 token: Token::Special(Special::Separator(Separator::Space)),
2049 },
2050 PositionalToken {
2051 source: uws,
2052 offset: 10,
2053 length: 1,
2054 token: Token::Special(Special::Punctuation('(')),
2055 },
2056 PositionalToken {
2057 source: uws,
2058 offset: 11,
2059 length: 1,
2060 token: Token::Special(Special::Punctuation('"')),
2061 },
2062 PositionalToken {
2063 source: uws,
2064 offset: 12,
2065 length: 5,
2066 token: Token::Word(Word::Word("brown".to_string())),
2067 },
2068 PositionalToken {
2069 source: uws,
2070 offset: 17,
2071 length: 1,
2072 token: Token::Special(Special::Punctuation('"')),
2073 },
2074 PositionalToken {
2075 source: uws,
2076 offset: 18,
2077 length: 1,
2078 token: Token::Special(Special::Punctuation(')')),
2079 },
2080 PositionalToken {
2081 source: uws,
2082 offset: 19,
2083 length: 1,
2084 token: Token::Special(Special::Separator(Separator::Space)),
2085 },
2086 PositionalToken {
2087 source: uws,
2088 offset: 20,
2089 length: 3,
2090 token: Token::Word(Word::Word("fox".to_string())),
2091 },
2092 PositionalToken {
2093 source: uws,
2094 offset: 23,
2095 length: 1,
2096 token: Token::Special(Special::Separator(Separator::Space)),
2097 },
2098 PositionalToken {
2099 source: uws,
2100 offset: 24,
2101 length: 5,
2102 token: Token::Word(Word::Word("can\'t".to_string())),
2103 },
2104 PositionalToken {
2105 source: uws,
2106 offset: 29,
2107 length: 1,
2108 token: Token::Special(Special::Separator(Separator::Space)),
2109 },
2110 PositionalToken {
2111 source: uws,
2112 offset: 30,
2113 length: 4,
2114 token: Token::Word(Word::Word("jump".to_string())),
2115 },
2116 PositionalToken {
2117 source: uws,
2118 offset: 34,
2119 length: 1,
2120 token: Token::Special(Special::Separator(Separator::Space)),
2121 },
2122 PositionalToken {
2123 source: uws,
2124 offset: 35,
2125 length: 4,
2126 token: Token::Word(Word::Number(Number::Float(32.3))),
2127 },
2128 PositionalToken {
2129 source: uws,
2130 offset: 39,
2131 length: 1,
2132 token: Token::Special(Special::Separator(Separator::Space)),
2133 },
2134 PositionalToken {
2135 source: uws,
2136 offset: 40,
2137 length: 4,
2138 token: Token::Word(Word::Word("feet".to_string())),
2139 },
2140 PositionalToken {
2141 source: uws,
2142 offset: 44,
2143 length: 1,
2144 token: Token::Special(Special::Punctuation(',')),
2145 },
2146 PositionalToken {
2147 source: uws,
2148 offset: 45,
2149 length: 1,
2150 token: Token::Special(Special::Separator(Separator::Space)),
2151 },
2152 PositionalToken {
2153 source: uws,
2154 offset: 46,
2155 length: 5,
2156 token: Token::Word(Word::Word("right".to_string())),
2157 },
2158 PositionalToken {
2159 source: uws,
2160 offset: 51,
2161 length: 1,
2162 token: Token::Special(Special::Punctuation('?')),
2163 },
2164 PositionalToken {
2165 source: uws,
2166 offset: 52,
2167 length: 1,
2168 token: Token::Special(Special::Separator(Separator::Space)),
2169 },
2170 PositionalToken {
2171 source: uws,
2172 offset: 53,
2173 length: 4,
2174 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
2175 }, PositionalToken {
2177 source: uws,
2178 offset: 57,
2179 length: 1,
2180 token: Token::Special(Special::Separator(Separator::Space)),
2181 },
2182 PositionalToken {
2183 source: uws,
2184 offset: 58,
2185 length: 3,
2186 token: Token::Word(Word::Word("etc".to_string())),
2187 },
2188 PositionalToken {
2189 source: uws,
2190 offset: 61,
2191 length: 1,
2192 token: Token::Special(Special::Punctuation('.')),
2193 },
2194 PositionalToken {
2195 source: uws,
2196 offset: 62,
2197 length: 1,
2198 token: Token::Special(Special::Separator(Separator::Space)),
2199 },
2200 PositionalToken {
2201 source: uws,
2202 offset: 63,
2203 length: 3,
2204 token: Token::Word(Word::Word("qeq".to_string())),
2205 },
2206 PositionalToken {
2207 source: uws,
2208 offset: 66,
2209 length: 1,
2210 token: Token::Special(Special::Separator(Separator::Space)),
2211 },
2212 PositionalToken {
2213 source: uws,
2214 offset: 67,
2215 length: 1,
2216 token: Token::Word(Word::Word("U".to_string())),
2217 },
2218 PositionalToken {
2219 source: uws,
2220 offset: 68,
2221 length: 1,
2222 token: Token::Special(Special::Punctuation('.')),
2223 },
2224 PositionalToken {
2225 source: uws,
2226 offset: 69,
2227 length: 1,
2228 token: Token::Word(Word::Word("S".to_string())),
2229 },
2230 PositionalToken {
2231 source: uws,
2232 offset: 70,
2233 length: 1,
2234 token: Token::Special(Special::Punctuation('.')),
2235 },
2236 PositionalToken {
2237 source: uws,
2238 offset: 71,
2239 length: 1,
2240 token: Token::Word(Word::Word("A".to_string())),
2241 },
2242 PositionalToken {
2243 source: uws,
2244 offset: 72,
2245 length: 2,
2246 token: Token::Special(Special::Separator(Separator::Space)),
2247 },
2248 PositionalToken {
2249 source: uws,
2250 offset: 74,
2251 length: 3,
2252 token: Token::Word(Word::Word("asd".to_string())),
2253 },
2254 PositionalToken {
2255 source: uws,
2256 offset: 77,
2257 length: 3,
2258 token: Token::Special(Special::Separator(Separator::Newline)),
2259 },
2260 PositionalToken {
2261 source: uws,
2262 offset: 80,
2263 length: 3,
2264 token: Token::Word(Word::Word("Brr".to_string())),
2265 },
2266 PositionalToken {
2267 source: uws,
2268 offset: 83,
2269 length: 1,
2270 token: Token::Special(Special::Punctuation(',')),
2271 },
2272 PositionalToken {
2273 source: uws,
2274 offset: 84,
2275 length: 1,
2276 token: Token::Special(Special::Separator(Separator::Space)),
2277 },
2278 PositionalToken {
2279 source: uws,
2280 offset: 85,
2281 length: 4,
2282 token: Token::Word(Word::Word("it\'s".to_string())),
2283 },
2284 PositionalToken {
2285 source: uws,
2286 offset: 89,
2287 length: 1,
2288 token: Token::Special(Special::Separator(Separator::Space)),
2289 },
2290 PositionalToken {
2291 source: uws,
2292 offset: 90,
2293 length: 4,
2294 token: Token::Word(Word::Number(Number::Float(29.3))),
2295 },
2296 PositionalToken {
2297 source: uws,
2298 offset: 94,
2299 length: 2,
2300 token: Token::Special(Special::Symbol('°')),
2301 },
2302 PositionalToken {
2303 source: uws,
2304 offset: 96,
2305 length: 1,
2306 token: Token::Word(Word::Word("F".to_string())),
2307 },
2308 PositionalToken {
2309 source: uws,
2310 offset: 97,
2311 length: 1,
2312 token: Token::Special(Special::Punctuation('!')),
2313 },
2314 PositionalToken {
2315 source: uws,
2316 offset: 98,
2317 length: 1,
2318 token: Token::Special(Special::Separator(Separator::Newline)),
2319 },
2320 PositionalToken {
2321 source: uws,
2322 offset: 99,
2323 length: 1,
2324 token: Token::Special(Special::Separator(Separator::Space)),
2325 },
2326 PositionalToken {
2327 source: uws,
2328 offset: 100,
2329 length: 14,
2330 token: Token::Word(Word::Word("Русское".to_string())),
2331 },
2332 PositionalToken {
2333 source: uws,
2334 offset: 114,
2335 length: 1,
2336 token: Token::Special(Special::Separator(Separator::Space)),
2337 },
2338 PositionalToken {
2339 source: uws,
2340 offset: 115,
2341 length: 22,
2342 token: Token::Word(Word::Word("предложение".to_string())),
2343 },
2344 PositionalToken {
2345 source: uws,
2346 offset: 137,
2347 length: 1,
2348 token: Token::Special(Special::Separator(Separator::Space)),
2349 },
2350 PositionalToken {
2351 source: uws,
2352 offset: 138,
2353 length: 5,
2354 token: Token::Struct(Struct::Hashtag("36.6".to_string())),
2355 },
2356 PositionalToken {
2369 source: uws,
2370 offset: 143,
2371 length: 1,
2372 token: Token::Special(Special::Separator(Separator::Space)),
2373 },
2374 PositionalToken {
2375 source: uws,
2376 offset: 144,
2377 length: 6,
2378 token: Token::Word(Word::Word("для".to_string())),
2379 },
2380 PositionalToken {
2381 source: uws,
2382 offset: 150,
2383 length: 1,
2384 token: Token::Special(Special::Separator(Separator::Space)),
2385 },
2386 PositionalToken {
2387 source: uws,
2388 offset: 151,
2389 length: 24,
2390 token: Token::Word(Word::Word("тестирования".to_string())),
2391 },
2392 PositionalToken {
2393 source: uws,
2394 offset: 175,
2395 length: 1,
2396 token: Token::Special(Special::Separator(Separator::Space)),
2397 },
2398 PositionalToken {
2399 source: uws,
2400 offset: 176,
2401 length: 14,
2402 token: Token::Word(Word::Word("деления".to_string())),
2403 },
2404 PositionalToken {
2405 source: uws,
2406 offset: 190,
2407 length: 1,
2408 token: Token::Special(Special::Separator(Separator::Space)),
2409 },
2410 PositionalToken {
2411 source: uws,
2412 offset: 191,
2413 length: 4,
2414 token: Token::Word(Word::Word("по".to_string())),
2415 },
2416 PositionalToken {
2417 source: uws,
2418 offset: 195,
2419 length: 1,
2420 token: Token::Special(Special::Separator(Separator::Space)),
2421 },
2422 PositionalToken {
2423 source: uws,
2424 offset: 196,
2425 length: 12,
2426 token: Token::Word(Word::Word("юникод".to_string())),
2427 },
2428 PositionalToken {
2429 source: uws,
2430 offset: 208,
2431 length: 1,
2432 token: Token::Special(Special::Punctuation('-')),
2433 },
2434 PositionalToken {
2435 source: uws,
2436 offset: 209,
2437 length: 12,
2438 token: Token::Word(Word::Word("словам".to_string())),
2439 },
2440 PositionalToken {
2441 source: uws,
2442 offset: 221,
2443 length: 3,
2444 token: Token::Special(Special::Punctuation('.')),
2445 },
2446 PositionalToken {
2447 source: uws,
2448 offset: 224,
2449 length: 1,
2450 token: Token::Special(Special::Separator(Separator::Newline)),
2451 },
2452 ];
2453 let lib_res = uws
2454 .into_tokenizer(TokenizerParams::v1())
2455 .collect::<Vec<_>>();
2456 check_results(&result, &lib_res, uws);
2457 }
2458
2459 #[test]
2460 fn general_no_split() {
2461 let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
2462 let result = vec![
2463 PositionalToken {
2464 source: uws,
2465 offset: 0,
2466 length: 3,
2467 token: Token::Word(Word::Word("The".to_string())),
2468 },
2469 PositionalToken {
2470 source: uws,
2471 offset: 3,
2472 length: 1,
2473 token: Token::Special(Special::Separator(Separator::Space)),
2474 },
2475 PositionalToken {
2476 source: uws,
2477 offset: 4,
2478 length: 5,
2479 token: Token::Word(Word::Word("quick".to_string())),
2480 },
2481 PositionalToken {
2482 source: uws,
2483 offset: 9,
2484 length: 1,
2485 token: Token::Special(Special::Separator(Separator::Space)),
2486 },
2487 PositionalToken {
2488 source: uws,
2489 offset: 10,
2490 length: 1,
2491 token: Token::Special(Special::Punctuation('(')),
2492 },
2493 PositionalToken {
2494 source: uws,
2495 offset: 11,
2496 length: 1,
2497 token: Token::Special(Special::Punctuation('"')),
2498 },
2499 PositionalToken {
2500 source: uws,
2501 offset: 12,
2502 length: 5,
2503 token: Token::Word(Word::Word("brown".to_string())),
2504 },
2505 PositionalToken {
2506 source: uws,
2507 offset: 17,
2508 length: 1,
2509 token: Token::Special(Special::Punctuation('"')),
2510 },
2511 PositionalToken {
2512 source: uws,
2513 offset: 18,
2514 length: 1,
2515 token: Token::Special(Special::Punctuation(')')),
2516 },
2517 PositionalToken {
2518 source: uws,
2519 offset: 19,
2520 length: 1,
2521 token: Token::Special(Special::Separator(Separator::Space)),
2522 },
2523 PositionalToken {
2524 source: uws,
2525 offset: 20,
2526 length: 3,
2527 token: Token::Word(Word::Word("fox".to_string())),
2528 },
2529 PositionalToken {
2530 source: uws,
2531 offset: 23,
2532 length: 1,
2533 token: Token::Special(Special::Separator(Separator::Space)),
2534 },
2535 PositionalToken {
2536 source: uws,
2537 offset: 24,
2538 length: 5,
2539 token: Token::Word(Word::Word("can\'t".to_string())),
2540 },
2541 PositionalToken {
2542 source: uws,
2543 offset: 29,
2544 length: 1,
2545 token: Token::Special(Special::Separator(Separator::Space)),
2546 },
2547 PositionalToken {
2548 source: uws,
2549 offset: 30,
2550 length: 4,
2551 token: Token::Word(Word::Word("jump".to_string())),
2552 },
2553 PositionalToken {
2554 source: uws,
2555 offset: 34,
2556 length: 1,
2557 token: Token::Special(Special::Separator(Separator::Space)),
2558 },
2559 PositionalToken {
2560 source: uws,
2561 offset: 35,
2562 length: 4,
2563 token: Token::Word(Word::Number(Number::Float(32.3))),
2564 },
2565 PositionalToken {
2566 source: uws,
2567 offset: 39,
2568 length: 1,
2569 token: Token::Special(Special::Separator(Separator::Space)),
2570 },
2571 PositionalToken {
2572 source: uws,
2573 offset: 40,
2574 length: 4,
2575 token: Token::Word(Word::Word("feet".to_string())),
2576 },
2577 PositionalToken {
2578 source: uws,
2579 offset: 44,
2580 length: 1,
2581 token: Token::Special(Special::Punctuation(',')),
2582 },
2583 PositionalToken {
2584 source: uws,
2585 offset: 45,
2586 length: 1,
2587 token: Token::Special(Special::Separator(Separator::Space)),
2588 },
2589 PositionalToken {
2590 source: uws,
2591 offset: 46,
2592 length: 5,
2593 token: Token::Word(Word::Word("right".to_string())),
2594 },
2595 PositionalToken {
2596 source: uws,
2597 offset: 51,
2598 length: 1,
2599 token: Token::Special(Special::Punctuation('?')),
2600 },
2601 PositionalToken {
2602 source: uws,
2603 offset: 52,
2604 length: 1,
2605 token: Token::Special(Special::Separator(Separator::Space)),
2606 },
2607 PositionalToken {
2608 source: uws,
2609 offset: 53,
2610 length: 4,
2611 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
2612 }, PositionalToken {
2614 source: uws,
2615 offset: 57,
2616 length: 1,
2617 token: Token::Special(Special::Separator(Separator::Space)),
2618 },
2619 PositionalToken {
2620 source: uws,
2621 offset: 58,
2622 length: 3,
2623 token: Token::Word(Word::Word("etc".to_string())),
2624 },
2625 PositionalToken {
2626 source: uws,
2627 offset: 61,
2628 length: 1,
2629 token: Token::Special(Special::Punctuation('.')),
2630 },
2631 PositionalToken {
2632 source: uws,
2633 offset: 62,
2634 length: 1,
2635 token: Token::Special(Special::Separator(Separator::Space)),
2636 },
2637 PositionalToken {
2638 source: uws,
2639 offset: 63,
2640 length: 3,
2641 token: Token::Word(Word::Word("qeq".to_string())),
2642 },
2643 PositionalToken {
2644 source: uws,
2645 offset: 66,
2646 length: 1,
2647 token: Token::Special(Special::Separator(Separator::Space)),
2648 },
2649 PositionalToken {
2650 source: uws,
2651 offset: 67,
2652 length: 5,
2653 token: Token::Word(Word::Word("U.S.A".to_string())),
2654 },
2655 PositionalToken {
2656 source: uws,
2657 offset: 72,
2658 length: 1,
2659 token: Token::Special(Special::Separator(Separator::Space)),
2660 },
2661 PositionalToken {
2662 source: uws,
2663 offset: 73,
2664 length: 1,
2665 token: Token::Special(Special::Separator(Separator::Space)),
2666 },
2667 PositionalToken {
2668 source: uws,
2669 offset: 74,
2670 length: 3,
2671 token: Token::Word(Word::Word("asd".to_string())),
2672 },
2673 PositionalToken {
2674 source: uws,
2675 offset: 77,
2676 length: 1,
2677 token: Token::Special(Special::Separator(Separator::Newline)),
2678 },
2679 PositionalToken {
2680 source: uws,
2681 offset: 78,
2682 length: 1,
2683 token: Token::Special(Special::Separator(Separator::Newline)),
2684 },
2685 PositionalToken {
2686 source: uws,
2687 offset: 79,
2688 length: 1,
2689 token: Token::Special(Special::Separator(Separator::Newline)),
2690 },
2691 PositionalToken {
2692 source: uws,
2693 offset: 80,
2694 length: 3,
2695 token: Token::Word(Word::Word("Brr".to_string())),
2696 },
2697 PositionalToken {
2698 source: uws,
2699 offset: 83,
2700 length: 1,
2701 token: Token::Special(Special::Punctuation(',')),
2702 },
2703 PositionalToken {
2704 source: uws,
2705 offset: 84,
2706 length: 1,
2707 token: Token::Special(Special::Separator(Separator::Space)),
2708 },
2709 PositionalToken {
2710 source: uws,
2711 offset: 85,
2712 length: 4,
2713 token: Token::Word(Word::Word("it\'s".to_string())),
2714 },
2715 PositionalToken {
2716 source: uws,
2717 offset: 89,
2718 length: 1,
2719 token: Token::Special(Special::Separator(Separator::Space)),
2720 },
2721 PositionalToken {
2722 source: uws,
2723 offset: 90,
2724 length: 4,
2725 token: Token::Word(Word::Number(Number::Float(29.3))),
2726 },
2727 PositionalToken {
2728 source: uws,
2729 offset: 94,
2730 length: 2,
2731 token: Token::Special(Special::Symbol('°')),
2732 },
2733 PositionalToken {
2734 source: uws,
2735 offset: 96,
2736 length: 1,
2737 token: Token::Word(Word::Word("F".to_string())),
2738 },
2739 PositionalToken {
2740 source: uws,
2741 offset: 97,
2742 length: 1,
2743 token: Token::Special(Special::Punctuation('!')),
2744 },
2745 PositionalToken {
2746 source: uws,
2747 offset: 98,
2748 length: 1,
2749 token: Token::Special(Special::Separator(Separator::Newline)),
2750 },
2751 PositionalToken {
2752 source: uws,
2753 offset: 99,
2754 length: 1,
2755 token: Token::Special(Special::Separator(Separator::Space)),
2756 },
2757 PositionalToken {
2758 source: uws,
2759 offset: 100,
2760 length: 14,
2761 token: Token::Word(Word::Word("Русское".to_string())),
2762 },
2763 PositionalToken {
2764 source: uws,
2765 offset: 114,
2766 length: 1,
2767 token: Token::Special(Special::Separator(Separator::Space)),
2768 },
2769 PositionalToken {
2770 source: uws,
2771 offset: 115,
2772 length: 22,
2773 token: Token::Word(Word::Word("предложение".to_string())),
2774 },
2775 PositionalToken {
2776 source: uws,
2777 offset: 137,
2778 length: 1,
2779 token: Token::Special(Special::Separator(Separator::Space)),
2780 },
2781 PositionalToken {
2782 source: uws,
2783 offset: 138,
2784 length: 1,
2785 token: Token::Special(Special::Punctuation('#')),
2786 },
2787 PositionalToken {
2788 source: uws,
2789 offset: 139,
2790 length: 4,
2791 token: Token::Word(Word::Number(Number::Float(36.6))),
2792 },
2793 PositionalToken {
2794 source: uws,
2795 offset: 143,
2796 length: 1,
2797 token: Token::Special(Special::Separator(Separator::Space)),
2798 },
2799 PositionalToken {
2800 source: uws,
2801 offset: 144,
2802 length: 6,
2803 token: Token::Word(Word::Word("для".to_string())),
2804 },
2805 PositionalToken {
2806 source: uws,
2807 offset: 150,
2808 length: 1,
2809 token: Token::Special(Special::Separator(Separator::Space)),
2810 },
2811 PositionalToken {
2812 source: uws,
2813 offset: 151,
2814 length: 24,
2815 token: Token::Word(Word::Word("тестирования".to_string())),
2816 },
2817 PositionalToken {
2818 source: uws,
2819 offset: 175,
2820 length: 1,
2821 token: Token::Special(Special::Separator(Separator::Space)),
2822 },
2823 PositionalToken {
2824 source: uws,
2825 offset: 176,
2826 length: 14,
2827 token: Token::Word(Word::Word("деления".to_string())),
2828 },
2829 PositionalToken {
2830 source: uws,
2831 offset: 190,
2832 length: 1,
2833 token: Token::Special(Special::Separator(Separator::Space)),
2834 },
2835 PositionalToken {
2836 source: uws,
2837 offset: 191,
2838 length: 4,
2839 token: Token::Word(Word::Word("по".to_string())),
2840 },
2841 PositionalToken {
2842 source: uws,
2843 offset: 195,
2844 length: 1,
2845 token: Token::Special(Special::Separator(Separator::Space)),
2846 },
2847 PositionalToken {
2848 source: uws,
2849 offset: 196,
2850 length: 12,
2851 token: Token::Word(Word::Word("юникод".to_string())),
2852 },
2853 PositionalToken {
2854 source: uws,
2855 offset: 208,
2856 length: 1,
2857 token: Token::Special(Special::Punctuation('-')),
2858 },
2859 PositionalToken {
2860 source: uws,
2861 offset: 209,
2862 length: 12,
2863 token: Token::Word(Word::Word("словам".to_string())),
2864 },
2865 PositionalToken {
2866 source: uws,
2867 offset: 221,
2868 length: 1,
2869 token: Token::Special(Special::Punctuation('.')),
2870 },
2871 PositionalToken {
2872 source: uws,
2873 offset: 222,
2874 length: 1,
2875 token: Token::Special(Special::Punctuation('.')),
2876 },
2877 PositionalToken {
2878 source: uws,
2879 offset: 223,
2880 length: 1,
2881 token: Token::Special(Special::Punctuation('.')),
2882 },
2883 PositionalToken {
2884 source: uws,
2885 offset: 224,
2886 length: 1,
2887 token: Token::Special(Special::Separator(Separator::Newline)),
2888 },
2889 ];
2890 let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
2891 check_results(&result, &lib_res, uws);
2892 }
2893
2894 #[test]
2895 fn general_complex() {
2896 let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
2897 let result = vec![
2898 PositionalToken {
2899 source: uws,
2900 offset: 0,
2901 length: 3,
2902 token: Token::Word(Word::Word("The".to_string())),
2903 },
2904 PositionalToken {
2905 source: uws,
2906 offset: 3,
2907 length: 1,
2908 token: Token::Special(Special::Separator(Separator::Space)),
2909 },
2910 PositionalToken {
2911 source: uws,
2912 offset: 4,
2913 length: 5,
2914 token: Token::Word(Word::Word("quick".to_string())),
2915 },
2916 PositionalToken {
2917 source: uws,
2918 offset: 9,
2919 length: 1,
2920 token: Token::Special(Special::Separator(Separator::Space)),
2921 },
2922 PositionalToken {
2923 source: uws,
2924 offset: 10,
2925 length: 1,
2926 token: Token::Special(Special::Punctuation('(')),
2927 },
2928 PositionalToken {
2929 source: uws,
2930 offset: 11,
2931 length: 1,
2932 token: Token::Special(Special::Punctuation('"')),
2933 },
2934 PositionalToken {
2935 source: uws,
2936 offset: 12,
2937 length: 5,
2938 token: Token::Word(Word::Word("brown".to_string())),
2939 },
2940 PositionalToken {
2941 source: uws,
2942 offset: 17,
2943 length: 1,
2944 token: Token::Special(Special::Punctuation('"')),
2945 },
2946 PositionalToken {
2947 source: uws,
2948 offset: 18,
2949 length: 1,
2950 token: Token::Special(Special::Punctuation(')')),
2951 },
2952 PositionalToken {
2953 source: uws,
2954 offset: 19,
2955 length: 1,
2956 token: Token::Special(Special::Separator(Separator::Space)),
2957 },
2958 PositionalToken {
2959 source: uws,
2960 offset: 20,
2961 length: 3,
2962 token: Token::Word(Word::Word("fox".to_string())),
2963 },
2964 PositionalToken {
2965 source: uws,
2966 offset: 23,
2967 length: 1,
2968 token: Token::Special(Special::Separator(Separator::Space)),
2969 },
2970 PositionalToken {
2971 source: uws,
2972 offset: 24,
2973 length: 5,
2974 token: Token::Word(Word::Word("can\'t".to_string())),
2975 },
2976 PositionalToken {
2977 source: uws,
2978 offset: 29,
2979 length: 1,
2980 token: Token::Special(Special::Separator(Separator::Space)),
2981 },
2982 PositionalToken {
2983 source: uws,
2984 offset: 30,
2985 length: 4,
2986 token: Token::Word(Word::Word("jump".to_string())),
2987 },
2988 PositionalToken {
2989 source: uws,
2990 offset: 34,
2991 length: 1,
2992 token: Token::Special(Special::Separator(Separator::Space)),
2993 },
2994 PositionalToken {
2995 source: uws,
2996 offset: 35,
2997 length: 4,
2998 token: Token::Word(Word::Number(Number::Float(32.3))),
2999 },
3000 PositionalToken {
3001 source: uws,
3002 offset: 39,
3003 length: 1,
3004 token: Token::Special(Special::Separator(Separator::Space)),
3005 },
3006 PositionalToken {
3007 source: uws,
3008 offset: 40,
3009 length: 4,
3010 token: Token::Word(Word::Word("feet".to_string())),
3011 },
3012 PositionalToken {
3013 source: uws,
3014 offset: 44,
3015 length: 1,
3016 token: Token::Special(Special::Punctuation(',')),
3017 },
3018 PositionalToken {
3019 source: uws,
3020 offset: 45,
3021 length: 1,
3022 token: Token::Special(Special::Separator(Separator::Space)),
3023 },
3024 PositionalToken {
3025 source: uws,
3026 offset: 46,
3027 length: 5,
3028 token: Token::Word(Word::Word("right".to_string())),
3029 },
3030 PositionalToken {
3031 source: uws,
3032 offset: 51,
3033 length: 1,
3034 token: Token::Special(Special::Punctuation('?')),
3035 },
3036 PositionalToken {
3037 source: uws,
3038 offset: 52,
3039 length: 1,
3040 token: Token::Special(Special::Separator(Separator::Space)),
3041 },
3042 PositionalToken {
3043 source: uws,
3044 offset: 53,
3045 length: 4,
3046 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
3047 }, PositionalToken {
3049 source: uws,
3050 offset: 57,
3051 length: 1,
3052 token: Token::Special(Special::Separator(Separator::Space)),
3053 },
3054 PositionalToken {
3055 source: uws,
3056 offset: 58,
3057 length: 3,
3058 token: Token::Word(Word::Word("etc".to_string())),
3059 },
3060 PositionalToken {
3061 source: uws,
3062 offset: 61,
3063 length: 1,
3064 token: Token::Special(Special::Punctuation('.')),
3065 },
3066 PositionalToken {
3067 source: uws,
3068 offset: 62,
3069 length: 1,
3070 token: Token::Special(Special::Separator(Separator::Space)),
3071 },
3072 PositionalToken {
3073 source: uws,
3074 offset: 63,
3075 length: 3,
3076 token: Token::Word(Word::Word("qeq".to_string())),
3077 },
3078 PositionalToken {
3079 source: uws,
3080 offset: 66,
3081 length: 1,
3082 token: Token::Special(Special::Separator(Separator::Space)),
3083 },
3084 PositionalToken {
3085 source: uws,
3086 offset: 67,
3087 length: 5,
3088 token: Token::Word(Word::Word("U.S.A".to_string())),
3089 },
3090 PositionalToken {
3091 source: uws,
3092 offset: 72,
3093 length: 2,
3094 token: Token::Special(Special::Separator(Separator::Space)),
3095 },
3096 PositionalToken {
3097 source: uws,
3098 offset: 74,
3099 length: 3,
3100 token: Token::Word(Word::Word("asd".to_string())),
3101 },
3102 PositionalToken {
3103 source: uws,
3104 offset: 77,
3105 length: 3,
3106 token: Token::Special(Special::Separator(Separator::Newline)),
3107 },
3108 PositionalToken {
3109 source: uws,
3110 offset: 80,
3111 length: 3,
3112 token: Token::Word(Word::Word("Brr".to_string())),
3113 },
3114 PositionalToken {
3115 source: uws,
3116 offset: 83,
3117 length: 1,
3118 token: Token::Special(Special::Punctuation(',')),
3119 },
3120 PositionalToken {
3121 source: uws,
3122 offset: 84,
3123 length: 1,
3124 token: Token::Special(Special::Separator(Separator::Space)),
3125 },
3126 PositionalToken {
3127 source: uws,
3128 offset: 85,
3129 length: 4,
3130 token: Token::Word(Word::Word("it\'s".to_string())),
3131 },
3132 PositionalToken {
3133 source: uws,
3134 offset: 89,
3135 length: 1,
3136 token: Token::Special(Special::Separator(Separator::Space)),
3137 },
3138 PositionalToken {
3139 source: uws,
3140 offset: 90,
3141 length: 4,
3142 token: Token::Word(Word::Number(Number::Float(29.3))),
3143 },
3144 PositionalToken {
3145 source: uws,
3146 offset: 94,
3147 length: 2,
3148 token: Token::Special(Special::Symbol('°')),
3149 },
3150 PositionalToken {
3151 source: uws,
3152 offset: 96,
3153 length: 1,
3154 token: Token::Word(Word::Word("F".to_string())),
3155 },
3156 PositionalToken {
3157 source: uws,
3158 offset: 97,
3159 length: 1,
3160 token: Token::Special(Special::Punctuation('!')),
3161 },
3162 PositionalToken {
3163 source: uws,
3164 offset: 98,
3165 length: 1,
3166 token: Token::Special(Special::Separator(Separator::Newline)),
3167 },
3168 PositionalToken {
3169 source: uws,
3170 offset: 99,
3171 length: 1,
3172 token: Token::Special(Special::Separator(Separator::Space)),
3173 },
3174 PositionalToken {
3175 source: uws,
3176 offset: 100,
3177 length: 14,
3178 token: Token::Word(Word::Word("Русское".to_string())),
3179 },
3180 PositionalToken {
3181 source: uws,
3182 offset: 114,
3183 length: 1,
3184 token: Token::Special(Special::Separator(Separator::Space)),
3185 },
3186 PositionalToken {
3187 source: uws,
3188 offset: 115,
3189 length: 22,
3190 token: Token::Word(Word::Word("предложение".to_string())),
3191 },
3192 PositionalToken {
3193 source: uws,
3194 offset: 137,
3195 length: 1,
3196 token: Token::Special(Special::Separator(Separator::Space)),
3197 },
3198 PositionalToken {
3199 source: uws,
3200 offset: 138,
3201 length: 5,
3202 token: Token::Struct(Struct::Hashtag("36.6".to_string())),
3203 },
3204 PositionalToken {
3205 source: uws,
3206 offset: 143,
3207 length: 1,
3208 token: Token::Special(Special::Separator(Separator::Space)),
3209 },
3210 PositionalToken {
3211 source: uws,
3212 offset: 144,
3213 length: 6,
3214 token: Token::Word(Word::Word("для".to_string())),
3215 },
3216 PositionalToken {
3217 source: uws,
3218 offset: 150,
3219 length: 1,
3220 token: Token::Special(Special::Separator(Separator::Space)),
3221 },
3222 PositionalToken {
3223 source: uws,
3224 offset: 151,
3225 length: 24,
3226 token: Token::Word(Word::Word("тестирования".to_string())),
3227 },
3228 PositionalToken {
3229 source: uws,
3230 offset: 175,
3231 length: 1,
3232 token: Token::Special(Special::Separator(Separator::Space)),
3233 },
3234 PositionalToken {
3235 source: uws,
3236 offset: 176,
3237 length: 14,
3238 token: Token::Word(Word::Word("деления".to_string())),
3239 },
3240 PositionalToken {
3241 source: uws,
3242 offset: 190,
3243 length: 1,
3244 token: Token::Special(Special::Separator(Separator::Space)),
3245 },
3246 PositionalToken {
3247 source: uws,
3248 offset: 191,
3249 length: 4,
3250 token: Token::Word(Word::Word("по".to_string())),
3251 },
3252 PositionalToken {
3253 source: uws,
3254 offset: 195,
3255 length: 1,
3256 token: Token::Special(Special::Separator(Separator::Space)),
3257 },
3258 PositionalToken {
3259 source: uws,
3260 offset: 196,
3261 length: 12,
3262 token: Token::Word(Word::Word("юникод".to_string())),
3263 },
3264 PositionalToken {
3265 source: uws,
3266 offset: 208,
3267 length: 1,
3268 token: Token::Special(Special::Punctuation('-')),
3269 },
3270 PositionalToken {
3271 source: uws,
3272 offset: 209,
3273 length: 12,
3274 token: Token::Word(Word::Word("словам".to_string())),
3275 },
3276 PositionalToken {
3277 source: uws,
3278 offset: 221,
3279 length: 3,
3280 token: Token::Special(Special::Punctuation('.')),
3281 },
3282 PositionalToken {
3283 source: uws,
3284 offset: 224,
3285 length: 1,
3286 token: Token::Special(Special::Separator(Separator::Newline)),
3287 },
3288 ];
3289 let lib_res = uws
3290 .into_tokenizer(TokenizerParams::complex())
3291 .collect::<Vec<_>>();
3292 check_results(&result, &lib_res, uws);
3293 }
3294
3295 #[test]
3296 fn plus_minus() {
3297 let uws = "+23 -4.5 -34 +25.7 - 2 + 5.6";
3298 let result = vec![
3299 PositionalToken {
3300 source: uws,
3301 offset: 0,
3302 length: 3,
3303 token: Token::Word(Word::Number(Number::Integer(23))),
3304 },
3305 PositionalToken {
3306 source: uws,
3307 offset: 3,
3308 length: 1,
3309 token: Token::Special(Special::Separator(Separator::Space)),
3310 },
3311 PositionalToken {
3312 source: uws,
3313 offset: 4,
3314 length: 4,
3315 token: Token::Word(Word::Number(Number::Float(-4.5))),
3316 },
3317 PositionalToken {
3318 source: uws,
3319 offset: 8,
3320 length: 1,
3321 token: Token::Special(Special::Separator(Separator::Space)),
3322 },
3323 PositionalToken {
3324 source: uws,
3325 offset: 9,
3326 length: 3,
3327 token: Token::Word(Word::Number(Number::Integer(-34))),
3328 },
3329 PositionalToken {
3330 source: uws,
3331 offset: 12,
3332 length: 1,
3333 token: Token::Special(Special::Separator(Separator::Space)),
3334 },
3335 PositionalToken {
3336 source: uws,
3337 offset: 13,
3338 length: 5,
3339 token: Token::Word(Word::Number(Number::Float(25.7))),
3340 },
3341 PositionalToken {
3342 source: uws,
3343 offset: 18,
3344 length: 1,
3345 token: Token::Special(Special::Separator(Separator::Space)),
3346 },
3347 PositionalToken {
3348 source: uws,
3349 offset: 19,
3350 length: 1,
3351 token: Token::Special(Special::Punctuation('-')),
3352 },
3353 PositionalToken {
3354 source: uws,
3355 offset: 20,
3356 length: 1,
3357 token: Token::Special(Special::Separator(Separator::Space)),
3358 },
3359 PositionalToken {
3360 source: uws,
3361 offset: 21,
3362 length: 1,
3363 token: Token::Word(Word::Number(Number::Integer(2))),
3364 },
3365 PositionalToken {
3366 source: uws,
3367 offset: 22,
3368 length: 1,
3369 token: Token::Special(Special::Separator(Separator::Space)),
3370 },
3371 PositionalToken {
3372 source: uws,
3373 offset: 23,
3374 length: 1,
3375 token: Token::Special(Special::Punctuation('+')),
3376 },
3377 PositionalToken {
3378 source: uws,
3379 offset: 24,
3380 length: 1,
3381 token: Token::Special(Special::Separator(Separator::Space)),
3382 },
3383 PositionalToken {
3384 source: uws,
3385 offset: 25,
3386 length: 3,
3387 token: Token::Word(Word::Number(Number::Float(5.6))),
3388 },
3389 ];
3390 let lib_res = uws
3391 .into_tokenizer(TokenizerParams::v1())
3392 .collect::<Vec<_>>();
3393 check(&result, &lib_res, uws);
3394 }
3396
3397 #[test]
3398 #[ignore]
3399 fn woman_bouncing_ball() {
3400 let uws = "\u{26f9}\u{200d}\u{2640}";
3401 let result = vec![PositionalToken {
3402 source: uws,
3403 offset: 0,
3404 length: 9,
3405 token: Token::Word(Word::Emoji("woman_bouncing_ball")),
3406 }];
3407 let lib_res = uws
3408 .into_tokenizer(TokenizerParams::v1())
3409 .collect::<Vec<_>>();
3410 check_results(&result, &lib_res, uws);
3411 }
3413
3414 #[test]
3415 fn emoji_and_rusabbr_default() {
3416 let uws = "🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n👱\nС.С.С.Р.\n👨👩👦👦\n🧠\n";
3417 let result = vec![
3418 PositionalToken {
3419 source: uws,
3420 offset: 0,
3421 length: 8,
3422 token: Token::Word(Word::Emoji("russia")),
3423 },
3424 PositionalToken {
3425 source: uws,
3426 offset: 8,
3427 length: 1,
3428 token: Token::Special(Special::Separator(Separator::Space)),
3429 },
3430 PositionalToken {
3431 source: uws,
3432 offset: 9,
3433 length: 8,
3434 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
3435 },
3436 PositionalToken {
3437 source: uws,
3438 offset: 17,
3439 length: 1,
3440 token: Token::Special(Special::Separator(Separator::Newline)),
3441 },
3442 PositionalToken {
3443 source: uws,
3444 offset: 18,
3445 length: 8,
3446 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
3447 },
3448 PositionalToken {
3449 source: uws,
3450 offset: 26,
3451 length: 8,
3452 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
3453 },
3454 PositionalToken {
3455 source: uws,
3456 offset: 34,
3457 length: 8,
3458 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
3459 },
3460 PositionalToken {
3461 source: uws,
3462 offset: 42,
3463 length: 1,
3464 token: Token::Special(Special::Separator(Separator::Newline)),
3465 },
3466 PositionalToken {
3467 source: uws,
3468 offset: 43,
3469 length: 4,
3470 token: Token::Word(Word::Emoji("blond_haired_person")),
3471 },
3472 PositionalToken {
3473 source: uws,
3474 offset: 47,
3475 length: 1,
3476 token: Token::Special(Special::Separator(Separator::Newline)),
3477 },
3478 PositionalToken {
3479 source: uws,
3480 offset: 48,
3481 length: 2,
3482 token: Token::Word(Word::Word("С".to_string())),
3483 },
3484 PositionalToken {
3485 source: uws,
3486 offset: 50,
3487 length: 1,
3488 token: Token::Special(Special::Punctuation('.')),
3489 },
3490 PositionalToken {
3491 source: uws,
3492 offset: 51,
3493 length: 2,
3494 token: Token::Word(Word::Word("С".to_string())),
3495 },
3496 PositionalToken {
3497 source: uws,
3498 offset: 53,
3499 length: 1,
3500 token: Token::Special(Special::Punctuation('.')),
3501 },
3502 PositionalToken {
3503 source: uws,
3504 offset: 54,
3505 length: 2,
3506 token: Token::Word(Word::Word("С".to_string())),
3507 },
3508 PositionalToken {
3509 source: uws,
3510 offset: 56,
3511 length: 1,
3512 token: Token::Special(Special::Punctuation('.')),
3513 },
3514 PositionalToken {
3515 source: uws,
3516 offset: 57,
3517 length: 2,
3518 token: Token::Word(Word::Word("Р".to_string())),
3519 },
3520 PositionalToken {
3521 source: uws,
3522 offset: 59,
3523 length: 1,
3524 token: Token::Special(Special::Punctuation('.')),
3525 },
3526 PositionalToken {
3527 source: uws,
3528 offset: 60,
3529 length: 1,
3530 token: Token::Special(Special::Separator(Separator::Newline)),
3531 },
3532 PositionalToken {
3533 source: uws,
3534 offset: 61,
3535 length: 25,
3536 token: Token::Word(Word::Emoji("family_man_woman_boy_boy")),
3537 },
3538 PositionalToken {
3539 source: uws,
3540 offset: 86,
3541 length: 1,
3542 token: Token::Special(Special::Separator(Separator::Newline)),
3543 },
3544 PositionalToken {
3545 source: uws,
3546 offset: 87,
3547 length: 4,
3548 token: Token::Word(Word::Emoji("brain")),
3549 },
3550 PositionalToken {
3551 source: uws,
3552 offset: 91,
3553 length: 1,
3554 token: Token::Special(Special::Separator(Separator::Newline)),
3555 },
3556 ];
3557
3558 let lib_res = uws
3559 .into_tokenizer(TokenizerParams::v1())
3560 .collect::<Vec<_>>();
3561 check_results(&result, &lib_res, uws);
3562 }
3564
3565 #[test]
3566 fn emoji_and_rusabbr_no_split() {
3567 let uws = "🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n👱\nС.С.С.Р.\n👨👩👦👦\n🧠\n";
3568 let result = vec![
3569 PositionalToken {
3570 source: uws,
3571 offset: 0,
3572 length: 8,
3573 token: Token::Word(Word::Emoji("russia")),
3574 },
3575 PositionalToken {
3576 source: uws,
3577 offset: 8,
3578 length: 1,
3579 token: Token::Special(Special::Separator(Separator::Space)),
3580 },
3581 PositionalToken {
3582 source: uws,
3583 offset: 9,
3584 length: 8,
3585 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
3586 },
3587 PositionalToken {
3588 source: uws,
3589 offset: 17,
3590 length: 1,
3591 token: Token::Special(Special::Separator(Separator::Newline)),
3592 },
3593 PositionalToken {
3594 source: uws,
3595 offset: 18,
3596 length: 8,
3597 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
3598 },
3599 PositionalToken {
3600 source: uws,
3601 offset: 26,
3602 length: 8,
3603 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
3604 },
3605 PositionalToken {
3606 source: uws,
3607 offset: 34,
3608 length: 8,
3609 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
3610 },
3611 PositionalToken {
3612 source: uws,
3613 offset: 42,
3614 length: 1,
3615 token: Token::Special(Special::Separator(Separator::Newline)),
3616 },
3617 PositionalToken {
3618 source: uws,
3619 offset: 43,
3620 length: 4,
3621 token: Token::Word(Word::Emoji("blond_haired_person")),
3622 },
3623 PositionalToken {
3624 source: uws,
3625 offset: 47,
3626 length: 1,
3627 token: Token::Special(Special::Separator(Separator::Newline)),
3628 },
3629 PositionalToken {
3630 source: uws,
3631 offset: 48,
3632 length: 11,
3633 token: Token::Word(Word::Word("С.С.С.Р".to_string())),
3634 },
3635 PositionalToken {
3636 source: uws,
3637 offset: 59,
3638 length: 1,
3639 token: Token::Special(Special::Punctuation('.')),
3640 },
3641 PositionalToken {
3642 source: uws,
3643 offset: 60,
3644 length: 1,
3645 token: Token::Special(Special::Separator(Separator::Newline)),
3646 },
3647 PositionalToken {
3648 source: uws,
3649 offset: 61,
3650 length: 25,
3651 token: Token::Word(Word::Emoji("family_man_woman_boy_boy")),
3652 },
3653 PositionalToken {
3654 source: uws,
3655 offset: 86,
3656 length: 1,
3657 token: Token::Special(Special::Separator(Separator::Newline)),
3658 },
3659 PositionalToken {
3660 source: uws,
3661 offset: 87,
3662 length: 4,
3663 token: Token::Word(Word::Emoji("brain")),
3664 },
3665 PositionalToken {
3666 source: uws,
3667 offset: 91,
3668 length: 1,
3669 token: Token::Special(Special::Separator(Separator::Newline)),
3670 },
3671 ];
3672
3673 let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
3674 check_results(&result, &lib_res, uws);
3675 }
3677
3678 #[test]
3902 fn html() {
3903 let uws = "<div class=\"article article_view \" id=\"article_view_-113039156_9551\" data-article-url=\"/@chaibuket-o-chem-ne-zabyt-25-noyabrya\" data-audio-context=\"article:-113039156_9551\"><h1 class=\"article_decoration_first article_decoration_last\" >День Мамы </h1><p class=\"article_decoration_first article_decoration_last\" >День, когда поздравляют мам, бабушек, сестер и жён — это всемирный праздник, называемый «День Мамы». В настоящее время его отмечают почти в каждой стране, просто везде разные даты и способы празднования. </p><h3 class=\"article_decoration_first article_decoration_last\" ><span class='article_anchor_title'>\n <span class='article_anchor_button' id='pochemu-my-ego-prazdnuem'></span>\n <span class='article_anchor_fsymbol'>П</span>\n</span>ПОЧЕМУ МЫ ЕГО ПРАЗДНУЕМ</h3><p class=\"article_decoration_first article_decoration_last article_decoration_before\" >В 1987 году комитет госдумы по делам женщин, семьи и молодежи выступил с предложением учредить «День мамы», а сам приказ был подписан уже 30 января 1988 года Борисом Ельциным. Было решено, что ежегодно в России празднество дня мамы будет выпадать на последнее воскресенье ноября. </p><figure data-type=\"101\" data-mode=\"\" class=\"article_decoration_first article_decoration_last\" >\n <div class=\"article_figure_content\" style=\"width: 1125px\">\n <div class=\"article_figure_sizer_content\"><div class=\"article_object_sizer_wrap\" data-sizes=\"[{"s":["https://pp.userapi.com/c849128/v849128704/c0ffd/pcNJaBH3NDo.jpg",75,50],"m":["https://pp.userapi.com/c849128/v849128704/c0ffe/ozCLs2kHtRY.jpg",130,87],"x":["https://pp.userapi.com/c849128/v849128704/c0fff/E4KtTNDydzE.jpg",604,403],"y":["https://pp.userapi.com/c849128/v849128704/c1000/1nLxpYKavzU.jpg",807,538],"z":["https://pp.userapi.com/c849128/v849128704/c1001/IgEODe90yEk.jpg",1125,750],"o":["https://pp.userapi.com/c849128/v849128704/c1002/01faNwVZ2_E.jpg",130,87],"p":["https://pp.userapi.com/c849128/v849128704/c1003/baDFzbdRP2s.jpg",200,133],"q":["https://pp.userapi.com/c849128/v849128704/c1004/CY4khI6KJKA.jpg",320,213],"r":["https://pp.userapi.com/c849128/v849128704/c1005/NOvAJ6-VltY.jpg",510,340]}]\">\n <img class=\"article_object_sizer_inner article_object_photo__image_blur\" src=\"https://pp.userapi.com/c849128/v849128704/c0ffd/pcNJaBH3NDo.jpg\" data-baseurl=\"\"/>\n \n</div></div>\n <div class=\"article_figure_sizer\" style=\"padding-bottom: 66.666666666667%\"></div>";
3904 let result = vec![
3905 PositionalToken {
3906 source: uws,
3907 offset: 236,
3908 length: 8,
3909 token: Token::Word(Word::Word("День".to_string())),
3910 },
3911 PositionalToken {
3912 source: uws,
3913 offset: 244,
3914 length: 1,
3915 token: Token::Special(Special::Separator(Separator::Space)),
3916 },
3917 PositionalToken {
3918 source: uws,
3919 offset: 245,
3920 length: 8,
3921 token: Token::Word(Word::Word("Мамы".to_string())),
3922 },
3923 PositionalToken {
3924 source: uws,
3925 offset: 253,
3926 length: 1,
3927 token: Token::Special(Special::Separator(Separator::Space)),
3928 },
3929 PositionalToken {
3930 source: uws,
3931 offset: 321,
3932 length: 8,
3933 token: Token::Word(Word::Word("День".to_string())),
3934 },
3935 PositionalToken {
3936 source: uws,
3937 offset: 329,
3938 length: 1,
3939 token: Token::Special(Special::Punctuation(',')),
3940 },
3941 PositionalToken {
3942 source: uws,
3943 offset: 330,
3944 length: 1,
3945 token: Token::Special(Special::Separator(Separator::Space)),
3946 },
3947 PositionalToken {
3948 source: uws,
3949 offset: 331,
3950 length: 10,
3951 token: Token::Word(Word::Word("когда".to_string())),
3952 },
3953 PositionalToken {
3954 source: uws,
3955 offset: 341,
3956 length: 1,
3957 token: Token::Special(Special::Separator(Separator::Space)),
3958 },
3959 PositionalToken {
3960 source: uws,
3961 offset: 342,
3962 length: 22,
3963 token: Token::Word(Word::Word("поздравляют".to_string())),
3964 },
3965 PositionalToken {
3966 source: uws,
3967 offset: 364,
3968 length: 1,
3969 token: Token::Special(Special::Separator(Separator::Space)),
3970 },
3971 PositionalToken {
3972 source: uws,
3973 offset: 365,
3974 length: 6,
3975 token: Token::Word(Word::Word("мам".to_string())),
3976 },
3977 PositionalToken {
3978 source: uws,
3979 offset: 371,
3980 length: 1,
3981 token: Token::Special(Special::Punctuation(',')),
3982 },
3983 PositionalToken {
3984 source: uws,
3985 offset: 372,
3986 length: 1,
3987 token: Token::Special(Special::Separator(Separator::Space)),
3988 },
3989 PositionalToken {
3990 source: uws,
3991 offset: 373,
3992 length: 14,
3993 token: Token::Word(Word::Word("бабушек".to_string())),
3994 },
3995 PositionalToken {
3996 source: uws,
3997 offset: 387,
3998 length: 1,
3999 token: Token::Special(Special::Punctuation(',')),
4000 },
4001 PositionalToken {
4002 source: uws,
4003 offset: 388,
4004 length: 1,
4005 token: Token::Special(Special::Separator(Separator::Space)),
4006 },
4007 PositionalToken {
4008 source: uws,
4009 offset: 389,
4010 length: 12,
4011 token: Token::Word(Word::Word("сестер".to_string())),
4012 },
4013 PositionalToken {
4014 source: uws,
4015 offset: 401,
4016 length: 1,
4017 token: Token::Special(Special::Separator(Separator::Space)),
4018 },
4019 PositionalToken {
4020 source: uws,
4021 offset: 402,
4022 length: 2,
4023 token: Token::Word(Word::Word("и".to_string())),
4024 },
4025 PositionalToken {
4026 source: uws,
4027 offset: 404,
4028 length: 1,
4029 token: Token::Special(Special::Separator(Separator::Space)),
4030 },
4031 PositionalToken {
4032 source: uws,
4033 offset: 405,
4034 length: 6,
4035 token: Token::Word(Word::Word("жён".to_string())),
4036 },
4037 PositionalToken {
4038 source: uws,
4039 offset: 411,
4040 length: 1,
4041 token: Token::Special(Special::Separator(Separator::Space)),
4042 },
4043 PositionalToken {
4044 source: uws,
4045 offset: 412,
4046 length: 3,
4047 token: Token::Special(Special::Punctuation('—')),
4048 },
4049 PositionalToken {
4050 source: uws,
4051 offset: 415,
4052 length: 1,
4053 token: Token::Special(Special::Separator(Separator::Space)),
4054 },
4055 PositionalToken {
4056 source: uws,
4057 offset: 416,
4058 length: 6,
4059 token: Token::Word(Word::Word("это".to_string())),
4060 },
4061 PositionalToken {
4062 source: uws,
4063 offset: 422,
4064 length: 1,
4065 token: Token::Special(Special::Separator(Separator::Space)),
4066 },
4067 PositionalToken {
4068 source: uws,
4069 offset: 423,
4070 length: 18,
4071 token: Token::Word(Word::Word("всемирный".to_string())),
4072 },
4073 PositionalToken {
4074 source: uws,
4075 offset: 441,
4076 length: 1,
4077 token: Token::Special(Special::Separator(Separator::Space)),
4078 },
4079 PositionalToken {
4080 source: uws,
4081 offset: 442,
4082 length: 16,
4083 token: Token::Word(Word::Word("праздник".to_string())),
4084 },
4085 PositionalToken {
4086 source: uws,
4087 offset: 458,
4088 length: 1,
4089 token: Token::Special(Special::Punctuation(',')),
4090 },
4091 PositionalToken {
4092 source: uws,
4093 offset: 459,
4094 length: 1,
4095 token: Token::Special(Special::Separator(Separator::Space)),
4096 },
4097 PositionalToken {
4098 source: uws,
4099 offset: 460,
4100 length: 20,
4101 token: Token::Word(Word::Word("называемый".to_string())),
4102 },
4103 PositionalToken {
4104 source: uws,
4105 offset: 480,
4106 length: 1,
4107 token: Token::Special(Special::Separator(Separator::Space)),
4108 },
4109 PositionalToken {
4110 source: uws,
4111 offset: 481,
4112 length: 2,
4113 token: Token::Special(Special::Punctuation('«')),
4114 },
4115 PositionalToken {
4116 source: uws,
4117 offset: 483,
4118 length: 8,
4119 token: Token::Word(Word::Word("День".to_string())),
4120 },
4121 PositionalToken {
4122 source: uws,
4123 offset: 491,
4124 length: 1,
4125 token: Token::Special(Special::Separator(Separator::Space)),
4126 },
4127 PositionalToken {
4128 source: uws,
4129 offset: 492,
4130 length: 8,
4131 token: Token::Word(Word::Word("Мамы".to_string())),
4132 },
4133 PositionalToken {
4134 source: uws,
4135 offset: 500,
4136 length: 2,
4137 token: Token::Special(Special::Punctuation('»')),
4138 },
4139 PositionalToken {
4140 source: uws,
4141 offset: 502,
4142 length: 1,
4143 token: Token::Special(Special::Punctuation('.')),
4144 },
4145 PositionalToken {
4146 source: uws,
4147 offset: 503,
4148 length: 1,
4149 token: Token::Special(Special::Separator(Separator::Space)),
4150 },
4151 PositionalToken {
4152 source: uws,
4153 offset: 504,
4154 length: 2,
4155 token: Token::Word(Word::Word("В".to_string())),
4156 },
4157 PositionalToken {
4158 source: uws,
4159 offset: 506,
4160 length: 1,
4161 token: Token::Special(Special::Separator(Separator::Space)),
4162 },
4163 PositionalToken {
4164 source: uws,
4165 offset: 507,
4166 length: 18,
4167 token: Token::Word(Word::Word("настоящее".to_string())),
4168 },
4169 PositionalToken {
4170 source: uws,
4171 offset: 525,
4172 length: 1,
4173 token: Token::Special(Special::Separator(Separator::Space)),
4174 },
4175 PositionalToken {
4176 source: uws,
4177 offset: 526,
4178 length: 10,
4179 token: Token::Word(Word::Word("время".to_string())),
4180 },
4181 PositionalToken {
4182 source: uws,
4183 offset: 536,
4184 length: 1,
4185 token: Token::Special(Special::Separator(Separator::Space)),
4186 },
4187 PositionalToken {
4188 source: uws,
4189 offset: 537,
4190 length: 6,
4191 token: Token::Word(Word::Word("его".to_string())),
4192 },
4193 PositionalToken {
4194 source: uws,
4195 offset: 543,
4196 length: 1,
4197 token: Token::Special(Special::Separator(Separator::Space)),
4198 },
4199 PositionalToken {
4200 source: uws,
4201 offset: 544,
4202 length: 16,
4203 token: Token::Word(Word::Word("отмечают".to_string())),
4204 },
4205 PositionalToken {
4206 source: uws,
4207 offset: 560,
4208 length: 1,
4209 token: Token::Special(Special::Separator(Separator::Space)),
4210 },
4211 PositionalToken {
4212 source: uws,
4213 offset: 561,
4214 length: 10,
4215 token: Token::Word(Word::Word("почти".to_string())),
4216 },
4217 PositionalToken {
4218 source: uws,
4219 offset: 571,
4220 length: 1,
4221 token: Token::Special(Special::Separator(Separator::Space)),
4222 },
4223 PositionalToken {
4224 source: uws,
4225 offset: 572,
4226 length: 2,
4227 token: Token::Word(Word::Word("в".to_string())),
4228 },
4229 PositionalToken {
4230 source: uws,
4231 offset: 574,
4232 length: 1,
4233 token: Token::Special(Special::Separator(Separator::Space)),
4234 },
4235 PositionalToken {
4236 source: uws,
4237 offset: 575,
4238 length: 12,
4239 token: Token::Word(Word::Word("каждой".to_string())),
4240 },
4241 PositionalToken {
4242 source: uws,
4243 offset: 587,
4244 length: 1,
4245 token: Token::Special(Special::Separator(Separator::Space)),
4246 },
4247 PositionalToken {
4248 source: uws,
4249 offset: 588,
4250 length: 12,
4251 token: Token::Word(Word::Word("стране".to_string())),
4252 },
4253 PositionalToken {
4254 source: uws,
4255 offset: 600,
4256 length: 1,
4257 token: Token::Special(Special::Punctuation(',')),
4258 },
4259 PositionalToken {
4260 source: uws,
4261 offset: 601,
4262 length: 1,
4263 token: Token::Special(Special::Separator(Separator::Space)),
4264 },
4265 PositionalToken {
4266 source: uws,
4267 offset: 602,
4268 length: 12,
4269 token: Token::Word(Word::Word("просто".to_string())),
4270 },
4271 PositionalToken {
4272 source: uws,
4273 offset: 614,
4274 length: 1,
4275 token: Token::Special(Special::Separator(Separator::Space)),
4276 },
4277 PositionalToken {
4278 source: uws,
4279 offset: 615,
4280 length: 10,
4281 token: Token::Word(Word::Word("везде".to_string())),
4282 },
4283 PositionalToken {
4284 source: uws,
4285 offset: 625,
4286 length: 1,
4287 token: Token::Special(Special::Separator(Separator::Space)),
4288 },
4289 PositionalToken {
4290 source: uws,
4291 offset: 626,
4292 length: 12,
4293 token: Token::Word(Word::Word("разные".to_string())),
4294 },
4295 PositionalToken {
4296 source: uws,
4297 offset: 638,
4298 length: 1,
4299 token: Token::Special(Special::Separator(Separator::Space)),
4300 },
4301 PositionalToken {
4302 source: uws,
4303 offset: 639,
4304 length: 8,
4305 token: Token::Word(Word::Word("даты".to_string())),
4306 },
4307 PositionalToken {
4308 source: uws,
4309 offset: 647,
4310 length: 1,
4311 token: Token::Special(Special::Separator(Separator::Space)),
4312 },
4313 PositionalToken {
4314 source: uws,
4315 offset: 648,
4316 length: 2,
4317 token: Token::Word(Word::Word("и".to_string())),
4318 },
4319 PositionalToken {
4320 source: uws,
4321 offset: 650,
4322 length: 1,
4323 token: Token::Special(Special::Separator(Separator::Space)),
4324 },
4325 PositionalToken {
4326 source: uws,
4327 offset: 651,
4328 length: 14,
4329 token: Token::Word(Word::Word("способы".to_string())),
4330 },
4331 PositionalToken {
4332 source: uws,
4333 offset: 665,
4334 length: 1,
4335 token: Token::Special(Special::Separator(Separator::Space)),
4336 },
4337 PositionalToken {
4338 source: uws,
4339 offset: 666,
4340 length: 24,
4341 token: Token::Word(Word::Word("празднования".to_string())),
4342 },
4343 PositionalToken {
4344 source: uws,
4345 offset: 690,
4346 length: 1,
4347 token: Token::Special(Special::Punctuation('.')),
4348 },
4349 PositionalToken {
4350 source: uws,
4351 offset: 691,
4352 length: 1,
4353 token: Token::Special(Special::Separator(Separator::Space)),
4354 },
4355 PositionalToken {
4356 source: uws,
4357 offset: 794,
4358 length: 1,
4359 token: Token::Special(Special::Separator(Separator::Newline)),
4360 },
4361 PositionalToken {
4362 source: uws,
4363 offset: 795,
4364 length: 2,
4365 token: Token::Special(Special::Separator(Separator::Space)),
4366 },
4367 PositionalToken {
4368 source: uws,
4369 offset: 870,
4370 length: 1,
4371 token: Token::Special(Special::Separator(Separator::Newline)),
4372 },
4373 PositionalToken {
4374 source: uws,
4375 offset: 871,
4376 length: 2,
4377 token: Token::Special(Special::Separator(Separator::Space)),
4378 },
4379 PositionalToken {
4380 source: uws,
4381 offset: 910,
4382 length: 2,
4383 token: Token::Word(Word::Word("П".to_string())),
4384 },
4385 PositionalToken {
4386 source: uws,
4387 offset: 919,
4388 length: 1,
4389 token: Token::Special(Special::Separator(Separator::Newline)),
4390 },
4391 PositionalToken {
4392 source: uws,
4393 offset: 927,
4394 length: 12,
4395 token: Token::Word(Word::Word("ПОЧЕМУ".to_string())),
4396 },
4397 PositionalToken {
4398 source: uws,
4399 offset: 939,
4400 length: 1,
4401 token: Token::Special(Special::Separator(Separator::Space)),
4402 },
4403 PositionalToken {
4404 source: uws,
4405 offset: 940,
4406 length: 4,
4407 token: Token::Word(Word::Word("МЫ".to_string())),
4408 },
4409 PositionalToken {
4410 source: uws,
4411 offset: 944,
4412 length: 1,
4413 token: Token::Special(Special::Separator(Separator::Space)),
4414 },
4415 PositionalToken {
4416 source: uws,
4417 offset: 945,
4418 length: 6,
4419 token: Token::Word(Word::Word("ЕГО".to_string())),
4420 },
4421 PositionalToken {
4422 source: uws,
4423 offset: 951,
4424 length: 1,
4425 token: Token::Special(Special::Separator(Separator::Space)),
4426 },
4427 PositionalToken {
4428 source: uws,
4429 offset: 952,
4430 length: 18,
4431 token: Token::Word(Word::Word("ПРАЗДНУЕМ".to_string())),
4432 },
4433 PositionalToken {
4434 source: uws,
4435 offset: 1063,
4436 length: 2,
4437 token: Token::Word(Word::Word("В".to_string())),
4438 },
4439 PositionalToken {
4440 source: uws,
4441 offset: 1065,
4442 length: 1,
4443 token: Token::Special(Special::Separator(Separator::Space)),
4444 },
4445 PositionalToken {
4446 source: uws,
4447 offset: 1066,
4448 length: 4,
4449 token: Token::Word(Word::Number(Number::Integer(1987))),
4450 },
4451 PositionalToken {
4452 source: uws,
4453 offset: 1070,
4454 length: 1,
4455 token: Token::Special(Special::Separator(Separator::Space)),
4456 },
4457 PositionalToken {
4458 source: uws,
4459 offset: 1071,
4460 length: 8,
4461 token: Token::Word(Word::Word("году".to_string())),
4462 },
4463 PositionalToken {
4464 source: uws,
4465 offset: 1079,
4466 length: 1,
4467 token: Token::Special(Special::Separator(Separator::Space)),
4468 },
4469 PositionalToken {
4470 source: uws,
4471 offset: 1080,
4472 length: 14,
4473 token: Token::Word(Word::Word("комитет".to_string())),
4474 },
4475 PositionalToken {
4476 source: uws,
4477 offset: 1094,
4478 length: 1,
4479 token: Token::Special(Special::Separator(Separator::Space)),
4480 },
4481 PositionalToken {
4482 source: uws,
4483 offset: 1095,
4484 length: 14,
4485 token: Token::Word(Word::Word("госдумы".to_string())),
4486 },
4487 PositionalToken {
4488 source: uws,
4489 offset: 1109,
4490 length: 1,
4491 token: Token::Special(Special::Separator(Separator::Space)),
4492 },
4493 PositionalToken {
4494 source: uws,
4495 offset: 1110,
4496 length: 4,
4497 token: Token::Word(Word::Word("по".to_string())),
4498 },
4499 PositionalToken {
4500 source: uws,
4501 offset: 1114,
4502 length: 1,
4503 token: Token::Special(Special::Separator(Separator::Space)),
4504 },
4505 PositionalToken {
4506 source: uws,
4507 offset: 1115,
4508 length: 10,
4509 token: Token::Word(Word::Word("делам".to_string())),
4510 },
4511 PositionalToken {
4512 source: uws,
4513 offset: 1125,
4514 length: 1,
4515 token: Token::Special(Special::Separator(Separator::Space)),
4516 },
4517 PositionalToken {
4518 source: uws,
4519 offset: 1126,
4520 length: 12,
4521 token: Token::Word(Word::Word("женщин".to_string())),
4522 },
4523 PositionalToken {
4524 source: uws,
4525 offset: 1138,
4526 length: 1,
4527 token: Token::Special(Special::Punctuation(',')),
4528 },
4529 PositionalToken {
4530 source: uws,
4531 offset: 1139,
4532 length: 1,
4533 token: Token::Special(Special::Separator(Separator::Space)),
4534 },
4535 PositionalToken {
4536 source: uws,
4537 offset: 1140,
4538 length: 10,
4539 token: Token::Word(Word::Word("семьи".to_string())),
4540 },
4541 PositionalToken {
4542 source: uws,
4543 offset: 1150,
4544 length: 1,
4545 token: Token::Special(Special::Separator(Separator::Space)),
4546 },
4547 PositionalToken {
4548 source: uws,
4549 offset: 1151,
4550 length: 2,
4551 token: Token::Word(Word::Word("и".to_string())),
4552 },
4553 PositionalToken {
4554 source: uws,
4555 offset: 1153,
4556 length: 1,
4557 token: Token::Special(Special::Separator(Separator::Space)),
4558 },
4559 PositionalToken {
4560 source: uws,
4561 offset: 1154,
4562 length: 16,
4563 token: Token::Word(Word::Word("молодежи".to_string())),
4564 },
4565 PositionalToken {
4566 source: uws,
4567 offset: 1170,
4568 length: 1,
4569 token: Token::Special(Special::Separator(Separator::Space)),
4570 },
4571 PositionalToken {
4572 source: uws,
4573 offset: 1171,
4574 length: 16,
4575 token: Token::Word(Word::Word("выступил".to_string())),
4576 },
4577 PositionalToken {
4578 source: uws,
4579 offset: 1187,
4580 length: 1,
4581 token: Token::Special(Special::Separator(Separator::Space)),
4582 },
4583 PositionalToken {
4584 source: uws,
4585 offset: 1188,
4586 length: 2,
4587 token: Token::Word(Word::Word("с".to_string())),
4588 },
4589 PositionalToken {
4590 source: uws,
4591 offset: 1190,
4592 length: 1,
4593 token: Token::Special(Special::Separator(Separator::Space)),
4594 },
4595 PositionalToken {
4596 source: uws,
4597 offset: 1191,
4598 length: 24,
4599 token: Token::Word(Word::Word("предложением".to_string())),
4600 },
4601 PositionalToken {
4602 source: uws,
4603 offset: 1215,
4604 length: 1,
4605 token: Token::Special(Special::Separator(Separator::Space)),
4606 },
4607 PositionalToken {
4608 source: uws,
4609 offset: 1216,
4610 length: 16,
4611 token: Token::Word(Word::Word("учредить".to_string())),
4612 },
4613 PositionalToken {
4614 source: uws,
4615 offset: 1232,
4616 length: 1,
4617 token: Token::Special(Special::Separator(Separator::Space)),
4618 },
4619 PositionalToken {
4620 source: uws,
4621 offset: 1233,
4622 length: 2,
4623 token: Token::Special(Special::Punctuation('«')),
4624 },
4625 PositionalToken {
4626 source: uws,
4627 offset: 1235,
4628 length: 8,
4629 token: Token::Word(Word::Word("День".to_string())),
4630 },
4631 PositionalToken {
4632 source: uws,
4633 offset: 1243,
4634 length: 1,
4635 token: Token::Special(Special::Separator(Separator::Space)),
4636 },
4637 PositionalToken {
4638 source: uws,
4639 offset: 1244,
4640 length: 8,
4641 token: Token::Word(Word::Word("мамы".to_string())),
4642 },
4643 PositionalToken {
4644 source: uws,
4645 offset: 1252,
4646 length: 2,
4647 token: Token::Special(Special::Punctuation('»')),
4648 },
4649 PositionalToken {
4650 source: uws,
4651 offset: 1254,
4652 length: 1,
4653 token: Token::Special(Special::Punctuation(',')),
4654 },
4655 PositionalToken {
4656 source: uws,
4657 offset: 1255,
4658 length: 1,
4659 token: Token::Special(Special::Separator(Separator::Space)),
4660 },
4661 PositionalToken {
4662 source: uws,
4663 offset: 1256,
4664 length: 2,
4665 token: Token::Word(Word::Word("а".to_string())),
4666 },
4667 PositionalToken {
4668 source: uws,
4669 offset: 1258,
4670 length: 1,
4671 token: Token::Special(Special::Separator(Separator::Space)),
4672 },
4673 PositionalToken {
4674 source: uws,
4675 offset: 1259,
4676 length: 6,
4677 token: Token::Word(Word::Word("сам".to_string())),
4678 },
4679 PositionalToken {
4680 source: uws,
4681 offset: 1265,
4682 length: 1,
4683 token: Token::Special(Special::Separator(Separator::Space)),
4684 },
4685 PositionalToken {
4686 source: uws,
4687 offset: 1266,
4688 length: 12,
4689 token: Token::Word(Word::Word("приказ".to_string())),
4690 },
4691 PositionalToken {
4692 source: uws,
4693 offset: 1278,
4694 length: 1,
4695 token: Token::Special(Special::Separator(Separator::Space)),
4696 },
4697 PositionalToken {
4698 source: uws,
4699 offset: 1279,
4700 length: 6,
4701 token: Token::Word(Word::Word("был".to_string())),
4702 },
4703 PositionalToken {
4704 source: uws,
4705 offset: 1285,
4706 length: 1,
4707 token: Token::Special(Special::Separator(Separator::Space)),
4708 },
4709 PositionalToken {
4710 source: uws,
4711 offset: 1286,
4712 length: 16,
4713 token: Token::Word(Word::Word("подписан".to_string())),
4714 },
4715 PositionalToken {
4716 source: uws,
4717 offset: 1302,
4718 length: 1,
4719 token: Token::Special(Special::Separator(Separator::Space)),
4720 },
4721 PositionalToken {
4722 source: uws,
4723 offset: 1303,
4724 length: 6,
4725 token: Token::Word(Word::Word("уже".to_string())),
4726 },
4727 PositionalToken {
4728 source: uws,
4729 offset: 1309,
4730 length: 1,
4731 token: Token::Special(Special::Separator(Separator::Space)),
4732 },
4733 PositionalToken {
4734 source: uws,
4735 offset: 1310,
4736 length: 2,
4737 token: Token::Word(Word::Number(Number::Integer(30))),
4738 },
4739 PositionalToken {
4740 source: uws,
4741 offset: 1312,
4742 length: 1,
4743 token: Token::Special(Special::Separator(Separator::Space)),
4744 },
4745 PositionalToken {
4746 source: uws,
4747 offset: 1313,
4748 length: 12,
4749 token: Token::Word(Word::Word("января".to_string())),
4750 },
4751 PositionalToken {
4752 source: uws,
4753 offset: 1325,
4754 length: 1,
4755 token: Token::Special(Special::Separator(Separator::Space)),
4756 },
4757 PositionalToken {
4758 source: uws,
4759 offset: 1326,
4760 length: 4,
4761 token: Token::Word(Word::Number(Number::Integer(1988))),
4762 },
4763 PositionalToken {
4764 source: uws,
4765 offset: 1330,
4766 length: 1,
4767 token: Token::Special(Special::Separator(Separator::Space)),
4768 },
4769 PositionalToken {
4770 source: uws,
4771 offset: 1331,
4772 length: 8,
4773 token: Token::Word(Word::Word("года".to_string())),
4774 },
4775 PositionalToken {
4776 source: uws,
4777 offset: 1339,
4778 length: 1,
4779 token: Token::Special(Special::Separator(Separator::Space)),
4780 },
4781 PositionalToken {
4782 source: uws,
4783 offset: 1340,
4784 length: 14,
4785 token: Token::Word(Word::Word("Борисом".to_string())),
4786 },
4787 PositionalToken {
4788 source: uws,
4789 offset: 1354,
4790 length: 1,
4791 token: Token::Special(Special::Separator(Separator::Space)),
4792 },
4793 PositionalToken {
4794 source: uws,
4795 offset: 1355,
4796 length: 16,
4797 token: Token::Word(Word::Word("Ельциным".to_string())),
4798 },
4799 PositionalToken {
4800 source: uws,
4801 offset: 1371,
4802 length: 1,
4803 token: Token::Special(Special::Punctuation('.')),
4804 },
4805 PositionalToken {
4806 source: uws,
4807 offset: 1372,
4808 length: 1,
4809 token: Token::Special(Special::Separator(Separator::Space)),
4810 },
4811 PositionalToken {
4812 source: uws,
4813 offset: 1373,
4814 length: 8,
4815 token: Token::Word(Word::Word("Было".to_string())),
4816 },
4817 PositionalToken {
4818 source: uws,
4819 offset: 1381,
4820 length: 1,
4821 token: Token::Special(Special::Separator(Separator::Space)),
4822 },
4823 PositionalToken {
4824 source: uws,
4825 offset: 1382,
4826 length: 12,
4827 token: Token::Word(Word::Word("решено".to_string())),
4828 },
4829 PositionalToken {
4830 source: uws,
4831 offset: 1394,
4832 length: 1,
4833 token: Token::Special(Special::Punctuation(',')),
4834 },
4835 PositionalToken {
4836 source: uws,
4837 offset: 1395,
4838 length: 1,
4839 token: Token::Special(Special::Separator(Separator::Space)),
4840 },
4841 PositionalToken {
4842 source: uws,
4843 offset: 1396,
4844 length: 6,
4845 token: Token::Word(Word::Word("что".to_string())),
4846 },
4847 PositionalToken {
4848 source: uws,
4849 offset: 1402,
4850 length: 1,
4851 token: Token::Special(Special::Separator(Separator::Space)),
4852 },
4853 PositionalToken {
4854 source: uws,
4855 offset: 1403,
4856 length: 16,
4857 token: Token::Word(Word::Word("ежегодно".to_string())),
4858 },
4859 PositionalToken {
4860 source: uws,
4861 offset: 1419,
4862 length: 1,
4863 token: Token::Special(Special::Separator(Separator::Space)),
4864 },
4865 PositionalToken {
4866 source: uws,
4867 offset: 1420,
4868 length: 2,
4869 token: Token::Word(Word::Word("в".to_string())),
4870 },
4871 PositionalToken {
4872 source: uws,
4873 offset: 1422,
4874 length: 1,
4875 token: Token::Special(Special::Separator(Separator::Space)),
4876 },
4877 PositionalToken {
4878 source: uws,
4879 offset: 1423,
4880 length: 12,
4881 token: Token::Word(Word::Word("России".to_string())),
4882 },
4883 PositionalToken {
4884 source: uws,
4885 offset: 1435,
4886 length: 1,
4887 token: Token::Special(Special::Separator(Separator::Space)),
4888 },
4889 PositionalToken {
4890 source: uws,
4891 offset: 1436,
4892 length: 22,
4893 token: Token::Word(Word::Word("празднество".to_string())),
4894 },
4895 PositionalToken {
4896 source: uws,
4897 offset: 1458,
4898 length: 1,
4899 token: Token::Special(Special::Separator(Separator::Space)),
4900 },
4901 PositionalToken {
4902 source: uws,
4903 offset: 1459,
4904 length: 6,
4905 token: Token::Word(Word::Word("дня".to_string())),
4906 },
4907 PositionalToken {
4908 source: uws,
4909 offset: 1465,
4910 length: 1,
4911 token: Token::Special(Special::Separator(Separator::Space)),
4912 },
4913 PositionalToken {
4914 source: uws,
4915 offset: 1466,
4916 length: 8,
4917 token: Token::Word(Word::Word("мамы".to_string())),
4918 },
4919 PositionalToken {
4920 source: uws,
4921 offset: 1474,
4922 length: 1,
4923 token: Token::Special(Special::Separator(Separator::Space)),
4924 },
4925 PositionalToken {
4926 source: uws,
4927 offset: 1475,
4928 length: 10,
4929 token: Token::Word(Word::Word("будет".to_string())),
4930 },
4931 PositionalToken {
4932 source: uws,
4933 offset: 1485,
4934 length: 1,
4935 token: Token::Special(Special::Separator(Separator::Space)),
4936 },
4937 PositionalToken {
4938 source: uws,
4939 offset: 1486,
4940 length: 16,
4941 token: Token::Word(Word::Word("выпадать".to_string())),
4942 },
4943 PositionalToken {
4944 source: uws,
4945 offset: 1502,
4946 length: 1,
4947 token: Token::Special(Special::Separator(Separator::Space)),
4948 },
4949 PositionalToken {
4950 source: uws,
4951 offset: 1503,
4952 length: 4,
4953 token: Token::Word(Word::Word("на".to_string())),
4954 },
4955 PositionalToken {
4956 source: uws,
4957 offset: 1507,
4958 length: 1,
4959 token: Token::Special(Special::Separator(Separator::Space)),
4960 },
4961 PositionalToken {
4962 source: uws,
4963 offset: 1508,
4964 length: 18,
4965 token: Token::Word(Word::Word("последнее".to_string())),
4966 },
4967 PositionalToken {
4968 source: uws,
4969 offset: 1526,
4970 length: 1,
4971 token: Token::Special(Special::Separator(Separator::Space)),
4972 },
4973 PositionalToken {
4974 source: uws,
4975 offset: 1527,
4976 length: 22,
4977 token: Token::Word(Word::Word("воскресенье".to_string())),
4978 },
4979 PositionalToken {
4980 source: uws,
4981 offset: 1549,
4982 length: 1,
4983 token: Token::Special(Special::Separator(Separator::Space)),
4984 },
4985 PositionalToken {
4986 source: uws,
4987 offset: 1550,
4988 length: 12,
4989 token: Token::Word(Word::Word("ноября".to_string())),
4990 },
4991 PositionalToken {
4992 source: uws,
4993 offset: 1562,
4994 length: 1,
4995 token: Token::Special(Special::Punctuation('.')),
4996 },
4997 PositionalToken {
4998 source: uws,
4999 offset: 1563,
5000 length: 1,
5001 token: Token::Special(Special::Separator(Separator::Space)),
5002 },
5003 PositionalToken {
5004 source: uws,
5005 offset: 1664,
5006 length: 1,
5007 token: Token::Special(Special::Separator(Separator::Newline)),
5008 },
5009 PositionalToken {
5010 source: uws,
5011 offset: 1665,
5012 length: 2,
5013 token: Token::Special(Special::Separator(Separator::Space)),
5014 },
5015 PositionalToken {
5016 source: uws,
5017 offset: 1725,
5018 length: 1,
5019 token: Token::Special(Special::Separator(Separator::Newline)),
5020 },
5021 PositionalToken {
5022 source: uws,
5023 offset: 1726,
5024 length: 4,
5025 token: Token::Special(Special::Separator(Separator::Space)),
5026 },
5027 PositionalToken {
5028 source: uws,
5029 offset: 2725,
5030 length: 1,
5031 token: Token::Special(Special::Separator(Separator::Newline)),
5032 },
5033 PositionalToken {
5034 source: uws,
5035 offset: 2726,
5036 length: 2,
5037 token: Token::Special(Special::Separator(Separator::Space)),
5038 },
5039 PositionalToken {
5040 source: uws,
5041 offset: 2888,
5042 length: 1,
5043 token: Token::Special(Special::Separator(Separator::Newline)),
5044 },
5045 PositionalToken {
5046 source: uws,
5047 offset: 2889,
5048 length: 2,
5049 token: Token::Special(Special::Separator(Separator::Space)),
5050 },
5051 PositionalToken {
5052 source: uws,
5053 offset: 2891,
5054 length: 1,
5055 token: Token::Special(Special::Separator(Separator::Newline)),
5056 },
5057 PositionalToken {
5058 source: uws,
5059 offset: 2904,
5060 length: 1,
5061 token: Token::Special(Special::Separator(Separator::Newline)),
5062 },
5063 PositionalToken {
5064 source: uws,
5065 offset: 2905,
5066 length: 4,
5067 token: Token::Special(Special::Separator(Separator::Space)),
5068 },
5069 ];
5070
5071 let text = Text::new({
5072 uws.into_source()
5073 .pipe(tagger::Builder::new().create().into_breaker())
5074 .pipe(entities::Builder::new().create().into_piped())
5075 .into_separator()
5076 })
5077 .unwrap();
5078
5079 let lib_res = text
5080 .into_tokenizer(TokenizerParams::v1())
5081 .filter_map(|tt| tt.into_original_token_1())
5082 .collect::<Vec<_>>();
5083
5084 check_results(&result, &lib_res, uws);
5085 }
5086
5087 #[test]
5138 fn numerical_no_split() {
5139 let uws = "12.02.18 31.28.34 23.11.2018 123.568.365.234.578 127.0.0.1 1st 1кг 123123афываыв 12321фвафыов234выалфо 12_123_343.4234_4234";
5140 let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
5141 let result = vec![
5143 PositionalToken {
5144 source: uws,
5145 offset: 0,
5146 length: 8,
5147 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
5148 "12.02.18".to_string(),
5149 ))),
5150 },
5151 PositionalToken {
5152 source: uws,
5153 offset: 8,
5154 length: 1,
5155 token: Token::Special(Special::Separator(Separator::Space)),
5156 },
5157 PositionalToken {
5158 source: uws,
5159 offset: 9,
5160 length: 8,
5161 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
5162 "31.28.34".to_string(),
5163 ))),
5164 },
5165 PositionalToken {
5166 source: uws,
5167 offset: 17,
5168 length: 1,
5169 token: Token::Special(Special::Separator(Separator::Space)),
5170 },
5171 PositionalToken {
5172 source: uws,
5173 offset: 18,
5174 length: 10,
5175 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
5176 "23.11.2018".to_string(),
5177 ))),
5178 },
5179 PositionalToken {
5180 source: uws,
5181 offset: 28,
5182 length: 1,
5183 token: Token::Special(Special::Separator(Separator::Space)),
5184 },
5185 PositionalToken {
5186 source: uws,
5187 offset: 29,
5188 length: 19,
5189 token: Token::Word(Word::Number(Number::Integer(123568365234578))),
5191 },
5192 PositionalToken {
5193 source: uws,
5194 offset: 48,
5195 length: 1,
5196 token: Token::Special(Special::Separator(Separator::Space)),
5197 },
5198 PositionalToken {
5199 source: uws,
5200 offset: 49,
5201 length: 9,
5202 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
5203 "127.0.0.1".to_string(),
5204 ))),
5205 },
5206 PositionalToken {
5207 source: uws,
5208 offset: 58,
5209 length: 1,
5210 token: Token::Special(Special::Separator(Separator::Space)),
5211 },
5212 PositionalToken {
5213 source: uws,
5214 offset: 59,
5215 length: 3,
5216 token: Token::Word(Word::Numerical(Numerical::Measures("1st".to_string()))),
5217 },
5218 PositionalToken {
5219 source: uws,
5220 offset: 62,
5221 length: 1,
5222 token: Token::Special(Special::Separator(Separator::Space)),
5223 },
5224 PositionalToken {
5225 source: uws,
5226 offset: 63,
5227 length: 5,
5228 token: Token::Word(Word::Numerical(Numerical::Measures("1кг".to_string()))),
5229 },
5230 PositionalToken {
5231 source: uws,
5232 offset: 68,
5233 length: 1,
5234 token: Token::Special(Special::Separator(Separator::Space)),
5235 },
5236 PositionalToken {
5237 source: uws,
5238 offset: 69,
5239 length: 20,
5240 token: Token::Word(Word::Numerical(Numerical::Measures(
5241 "123123афываыв".to_string(),
5242 ))),
5243 },
5244 PositionalToken {
5245 source: uws,
5246 offset: 89,
5247 length: 1,
5248 token: Token::Special(Special::Separator(Separator::Space)),
5249 },
5250 PositionalToken {
5251 source: uws,
5252 offset: 90,
5253 length: 34,
5254 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
5255 "12321фвафыов234выалфо".to_string(),
5256 ))),
5257 },
5258 PositionalToken {
5259 source: uws,
5260 offset: 124,
5261 length: 1,
5262 token: Token::Special(Special::Separator(Separator::Space)),
5263 },
5264 PositionalToken {
5265 source: uws,
5266 offset: 125,
5267 length: 20,
5268 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
5269 "12_123_343.4234_4234".to_string(),
5270 ))),
5271 },
5272 ];
5273 check_results(&result, &lib_res, uws);
5274 }
5275
5276 #[test]
5277 fn numerical_default() {
5278 let uws = "12.02.18 31.28.34 23.11.2018 123.568.365.234.578 127.0.0.1 1st 1кг 123123афываыв 12321фвафыов234выалфо 12_123_343.4234_4234";
5279 let lib_res = uws
5280 .into_tokenizer(TokenizerParams::v1())
5281 .collect::<Vec<_>>();
5282 let result = vec![
5285 PositionalToken {
5286 source: uws,
5287 offset: 0,
5288 length: 2,
5289 token: Token::Word(Word::Number(Number::Integer(12))),
5290 },
5291 PositionalToken {
5292 source: uws,
5293 offset: 2,
5294 length: 1,
5295 token: Token::Special(Special::Punctuation('.')),
5296 },
5297 PositionalToken {
5298 source: uws,
5299 offset: 3,
5300 length: 2,
5301 token: Token::Word(Word::Number(Number::ZeroInteger {
5302 i: 2,
5303 s: "02".to_string(),
5304 })),
5305 },
5306 PositionalToken {
5307 source: uws,
5308 offset: 5,
5309 length: 1,
5310 token: Token::Special(Special::Punctuation('.')),
5311 },
5312 PositionalToken {
5313 source: uws,
5314 offset: 6,
5315 length: 2,
5316 token: Token::Word(Word::Number(Number::Integer(18))),
5317 },
5318 PositionalToken {
5319 source: uws,
5320 offset: 8,
5321 length: 1,
5322 token: Token::Special(Special::Separator(Separator::Space)),
5323 },
5324 PositionalToken {
5325 source: uws,
5326 offset: 9,
5327 length: 2,
5328 token: Token::Word(Word::Number(Number::Integer(31))),
5329 },
5330 PositionalToken {
5331 source: uws,
5332 offset: 11,
5333 length: 1,
5334 token: Token::Special(Special::Punctuation('.')),
5335 },
5336 PositionalToken {
5337 source: uws,
5338 offset: 12,
5339 length: 2,
5340 token: Token::Word(Word::Number(Number::Integer(28))),
5341 },
5342 PositionalToken {
5343 source: uws,
5344 offset: 14,
5345 length: 1,
5346 token: Token::Special(Special::Punctuation('.')),
5347 },
5348 PositionalToken {
5349 source: uws,
5350 offset: 15,
5351 length: 2,
5352 token: Token::Word(Word::Number(Number::Integer(34))),
5353 },
5354 PositionalToken {
5355 source: uws,
5356 offset: 17,
5357 length: 1,
5358 token: Token::Special(Special::Separator(Separator::Space)),
5359 },
5360 PositionalToken {
5361 source: uws,
5362 offset: 18,
5363 length: 2,
5364 token: Token::Word(Word::Number(Number::Integer(23))),
5365 },
5366 PositionalToken {
5367 source: uws,
5368 offset: 20,
5369 length: 1,
5370 token: Token::Special(Special::Punctuation('.')),
5371 },
5372 PositionalToken {
5373 source: uws,
5374 offset: 21,
5375 length: 2,
5376 token: Token::Word(Word::Number(Number::Integer(11))),
5377 },
5378 PositionalToken {
5379 source: uws,
5380 offset: 23,
5381 length: 1,
5382 token: Token::Special(Special::Punctuation('.')),
5383 },
5384 PositionalToken {
5385 source: uws,
5386 offset: 24,
5387 length: 4,
5388 token: Token::Word(Word::Number(Number::Integer(2018))),
5389 },
5390 PositionalToken {
5391 source: uws,
5392 offset: 28,
5393 length: 1,
5394 token: Token::Special(Special::Separator(Separator::Space)),
5395 },
5396 PositionalToken {
5397 source: uws,
5398 offset: 29,
5399 length: 19,
5400 token: Token::Word(Word::Number(Number::Integer(123568365234578))),
5402 },
5403 PositionalToken {
5458 source: uws,
5459 offset: 48,
5460 length: 1,
5461 token: Token::Special(Special::Separator(Separator::Space)),
5462 },
5463 PositionalToken {
5464 source: uws,
5465 offset: 49,
5466 length: 3,
5467 token: Token::Word(Word::Number(Number::Integer(127))),
5468 },
5469 PositionalToken {
5470 source: uws,
5471 offset: 52,
5472 length: 1,
5473 token: Token::Special(Special::Punctuation('.')),
5474 },
5475 PositionalToken {
5476 source: uws,
5477 offset: 53,
5478 length: 1,
5479 token: Token::Word(Word::Number(Number::ZeroInteger {
5480 i: 0,
5481 s: "0".to_string(),
5482 })),
5483 },
5484 PositionalToken {
5485 source: uws,
5486 offset: 54,
5487 length: 1,
5488 token: Token::Special(Special::Punctuation('.')),
5489 },
5490 PositionalToken {
5491 source: uws,
5492 offset: 55,
5493 length: 1,
5494 token: Token::Word(Word::Number(Number::ZeroInteger {
5495 i: 0,
5496 s: "0".to_string(),
5497 })),
5498 },
5499 PositionalToken {
5500 source: uws,
5501 offset: 56,
5502 length: 1,
5503 token: Token::Special(Special::Punctuation('.')),
5504 },
5505 PositionalToken {
5506 source: uws,
5507 offset: 57,
5508 length: 1,
5509 token: Token::Word(Word::Number(Number::Integer(1))),
5510 },
5511 PositionalToken {
5512 source: uws,
5513 offset: 58,
5514 length: 1,
5515 token: Token::Special(Special::Separator(Separator::Space)),
5516 },
5517 PositionalToken {
5518 source: uws,
5519 offset: 59,
5520 length: 3,
5521 token: Token::Word(Word::Numerical(Numerical::Measures("1st".to_string()))),
5522 },
5523 PositionalToken {
5524 source: uws,
5525 offset: 62,
5526 length: 1,
5527 token: Token::Special(Special::Separator(Separator::Space)),
5528 },
5529 PositionalToken {
5530 source: uws,
5531 offset: 63,
5532 length: 5,
5533 token: Token::Word(Word::Numerical(Numerical::Measures("1кг".to_string()))),
5534 },
5535 PositionalToken {
5536 source: uws,
5537 offset: 68,
5538 length: 1,
5539 token: Token::Special(Special::Separator(Separator::Space)),
5540 },
5541 PositionalToken {
5542 source: uws,
5543 offset: 69,
5544 length: 20,
5545 token: Token::Word(Word::Numerical(Numerical::Measures(
5546 "123123афываыв".to_string(),
5547 ))),
5548 },
5549 PositionalToken {
5550 source: uws,
5551 offset: 89,
5552 length: 1,
5553 token: Token::Special(Special::Separator(Separator::Space)),
5554 },
5555 PositionalToken {
5556 source: uws,
5557 offset: 90,
5558 length: 34,
5559 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
5560 "12321фвафыов234выалфо".to_string(),
5561 ))),
5562 },
5563 PositionalToken {
5564 source: uws,
5565 offset: 124,
5566 length: 1,
5567 token: Token::Special(Special::Separator(Separator::Space)),
5568 },
5569 PositionalToken {
5570 source: uws,
5571 offset: 125,
5572 length: 2,
5573 token: Token::Word(Word::Number(Number::Integer(12))),
5574 },
5575 PositionalToken {
5576 source: uws,
5577 offset: 127,
5578 length: 1,
5579 token: Token::Special(Special::Punctuation('_')),
5580 },
5581 PositionalToken {
5582 source: uws,
5583 offset: 128,
5584 length: 3,
5585 token: Token::Word(Word::Number(Number::Integer(123))),
5586 },
5587 PositionalToken {
5588 source: uws,
5589 offset: 131,
5590 length: 1,
5591 token: Token::Special(Special::Punctuation('_')),
5592 },
5593 PositionalToken {
5594 source: uws,
5595 offset: 132,
5596 length: 3,
5597 token: Token::Word(Word::Number(Number::Integer(343))),
5598 },
5599 PositionalToken {
5600 source: uws,
5601 offset: 135,
5602 length: 1,
5603 token: Token::Special(Special::Punctuation('.')),
5604 },
5605 PositionalToken {
5606 source: uws,
5607 offset: 136,
5608 length: 4,
5609 token: Token::Word(Word::Number(Number::Integer(4234))),
5610 },
5611 PositionalToken {
5612 source: uws,
5613 offset: 140,
5614 length: 1,
5615 token: Token::Special(Special::Punctuation('_')),
5616 },
5617 PositionalToken {
5618 source: uws,
5619 offset: 141,
5620 length: 4,
5621 token: Token::Word(Word::Number(Number::Integer(4234))),
5622 },
5623 ];
5624 check_results(&result, &lib_res, uws);
5625 }
5626
5627 enum Lang {
5640 Zho,
5641 Jpn,
5642 Kor,
5643 Ara,
5644 Ell,
5645 }
5646
5647 #[test]
5648 fn test_lang_zho() {
5649 let (uws, result) = get_lang_test(Lang::Zho);
5650 let lib_res = uws
5651 .into_tokenizer(TokenizerParams::v1())
5652 .collect::<Vec<_>>();
5653 check_results(&result, &lib_res, &uws);
5654 }
5655
5656 #[test]
5657 fn test_lang_jpn() {
5658 let (uws, result) = get_lang_test(Lang::Jpn);
5659 let lib_res = uws
5660 .into_tokenizer(TokenizerParams::v1())
5661 .collect::<Vec<_>>();
5662 check_results(&result, &lib_res, &uws);
5663 }
5664
5665 #[test]
5666 fn test_lang_kor() {
5667 let (uws, result) = get_lang_test(Lang::Kor);
5668 let lib_res = uws
5669 .into_tokenizer(TokenizerParams::v1())
5670 .collect::<Vec<_>>();
5671 check_results(&result, &lib_res, &uws);
5672 }
5673
5674 #[test]
5675 fn test_lang_ara() {
5676 let (uws, result) = get_lang_test(Lang::Ara);
5677 let lib_res = uws
5678 .into_tokenizer(TokenizerParams::v1())
5679 .collect::<Vec<_>>();
5680 check_results(&result, &lib_res, &uws);
5681 }
5682
5683 #[test]
5684 fn test_lang_ell() {
5685 let (uws, result) = get_lang_test(Lang::Ell);
5686 let lib_res = uws
5687 .into_tokenizer(TokenizerParams::v1())
5688 .collect::<Vec<_>>();
5689 check_results(&result, &lib_res, &uws);
5690 }
5691
5692 fn get_lang_test(lng: Lang) -> (String, Vec<PositionalToken>) {
5693 let uws = match lng {
5694 Lang::Zho => {
5695 "美国电视连续剧《超人前传》的第一集《试播集》于2001年10月16日在電視網首播,剧集主创人阿尔弗雷德·高夫和迈尔斯·米勒編劇,大卫·努特尔执导。这一试播首次向观众引荐了克拉克·肯特一角,他是位拥有超能力的外星孤儿,与家人和朋友一起在堪薩斯州虚构小镇斯莫维尔生活。在这一集里,肯特首度得知自己的来历,同时还需要阻止一位学生试图杀死镇上高中多名学生的报复之举。本集节目里引入了多个之后将贯穿全季甚至整部剧集的主题元素,例如几位主要角色之间的三角恋情。电视剧在加拿大溫哥華取景,旨在选用其“美国中产阶级”景观,主创人花了5个月的时间专门用于为主角物色合适的演员。试播集在所有演员选好4天后正式开拍。由于时间上的限制,剧组无法搭建好实体外景,因此只能使用计算机绘图技术将数字化的外景插入到镜头中。节目一经上映就打破了电视网的多项收视纪录,并且获得了评论员的普遍好评和多个奖项提名,并在其中两项上胜出"
5696 }
5697 Lang::Kor => {
5698 "플레이스테이션 은 소니 컴퓨터 엔터테인먼트가 개발한 세 번째 가정용 게임기이다. 마이크로소프트의 엑스박스 360, 닌텐도의 Wii와 경쟁하고 있다. 이전 제품에서 온라인 플레이 기능을 비디오 게임 개발사에 전적으로 의존하던 것과 달리 통합 온라인 게임 서비스인 플레이스테이션 네트워크 서비스를 발매와 함께 시작해 제공하고 있으며, 탄탄한 멀티미디어 재생 기능, 플레이스테이션 포터블과의 연결, 고화질 광학 디스크 포맷인 블루레이 디스크 재생 기능 등의 기능을 갖추고 있다. 2006년 11월 11일에 일본에서 처음으로 출시했으며, 11월 17일에는 북미 지역, 2007년 3월 23일에는 유럽과 오세아니아 지역에서, 대한민국의 경우 6월 5일부터 일주일간 예약판매를 실시해, 매일 준비한 수량이 동이 나는 등 많은 관심을 받았으며 6월 16일에 정식 출시 행사를 열었다"
5699 }
5700 Lang::Jpn => {
5701 "熊野三山本願所は、15世紀末以降における熊野三山(熊野本宮、熊野新宮、熊野那智)の造営・修造のための勧進を担った組織の総称。 熊野三山を含めて、日本における古代から中世前半にかけての寺社の造営は、寺社領経営のような恒常的財源、幕府や朝廷などからの一時的な造営料所の寄進、あるいは公権力からの臨時の保護によって行われていた。しかしながら、熊野三山では、これらの財源はすべて15世紀半ばまでに実効性を失った"
5702 }
5703 Lang::Ara => {
5704 "لشکرکشیهای روسهای وارنگی به دریای خزر مجموعهای از حملات نظامی در بین سالهای ۸۶۴ تا ۱۰۴۱ میلادی به سواحل دریای خزر بودهاست. روسهای وارنگی ابتدا در قرن نهم میلادی به عنوان بازرگانان پوست، عسل و برده در سرزمینهای اسلامی(سرکلند) ظاهر شدند. این بازرگانان در مسیر تجاری ولگا به خرید و فروش میپرداختند. نخستین حملهٔ آنان در فاصله سالهای ۸۶۴ تا ۸۸۴ میلادی در مقیاسی کوچک علیه علویان طبرستان رخ داد. نخستین یورش بزرگ روسها در سال ۹۱۳ رخ داد و آنان با ۵۰۰ فروند درازکشتی شهر گرگان و اطراف آن را غارت کردند. آنها در این حمله مقداری کالا و برده را به تاراج بردند و در راه بازگشتن به سمت شمال، در دلتای ولگا، مورد حملهٔ خزرهای مسلمان قرار گرفتند و بعضی از آنان موفق به فرار شدند، ولی در میانهٔ ولگا به قتل رسیدند. دومین هجوم بزرگ روسها به دریای خزر در سال ۹۴۳ به وقوع پیوست. در این دوره ایگور یکم، حاکم روس کیف، رهبری روسها را در دست داشت. روسها پس از توافق با دولت خزرها برای عبور امن از منطقه، تا رود کورا و اعماق قفقاز پیش رفتند و در سال ۹۴۳ موفق شدند بندر بردعه، پایتخت اران (جمهوری آذربایجان کنونی)، را تصرف کنند. روسها در آنجا به مدت چند ماه ماندند و بسیاری از ساکنان شهر را کشتند و از راه غارتگری اموالی را به تاراج بردند. تنها دلیل بازگشت آنان "
5705 }
5706 Lang::Ell => {
5707 "Το Πρόγραμμα υλοποιείται εξ ολοκλήρου από απόσταση και μπορεί να συμμετέχει κάθε εμπλεκόμενος στη ή/και ενδιαφερόμενος για τη διδασκαλία της Ελληνικής ως δεύτερης/ξένης γλώσσας στην Ελλάδα και στο εξωτερικό, αρκεί να είναι απόφοιτος ελληνικής φιλολογίας, ξένων φιλολογιών, παιδαγωγικών τμημάτων, θεολογικών σχολών ή άλλων πανεπιστημιακών τμημάτων ελληνικών ή ισότιμων ξένων πανεπιστημίων. Υπό όρους γίνονται δεκτοί υποψήφιοι που δεν έχουν ολοκληρώσει σπουδές τριτοβάθμιας εκπαίδευσης."
5708 }
5709 };
5710 let tokens = match lng {
5711 Lang::Zho => vec![
5712 PositionalToken {
5713 source: uws,
5714 offset: 0,
5715 length: 3,
5716 token: Token::Word(Word::Word("美".to_string())),
5717 },
5718 PositionalToken {
5719 source: uws,
5720 offset: 3,
5721 length: 3,
5722 token: Token::Word(Word::Word("国".to_string())),
5723 },
5724 PositionalToken {
5725 source: uws,
5726 offset: 6,
5727 length: 3,
5728 token: Token::Word(Word::Word("电".to_string())),
5729 },
5730 PositionalToken {
5731 source: uws,
5732 offset: 9,
5733 length: 3,
5734 token: Token::Word(Word::Word("视".to_string())),
5735 },
5736 PositionalToken {
5737 source: uws,
5738 offset: 12,
5739 length: 3,
5740 token: Token::Word(Word::Word("连".to_string())),
5741 },
5742 PositionalToken {
5743 source: uws,
5744 offset: 15,
5745 length: 3,
5746 token: Token::Word(Word::Word("续".to_string())),
5747 },
5748 PositionalToken {
5749 source: uws,
5750 offset: 18,
5751 length: 3,
5752 token: Token::Word(Word::Word("剧".to_string())),
5753 },
5754 PositionalToken {
5755 source: uws,
5756 offset: 21,
5757 length: 3,
5758 token: Token::Special(Special::Punctuation('《')),
5759 },
5760 PositionalToken {
5761 source: uws,
5762 offset: 24,
5763 length: 3,
5764 token: Token::Word(Word::Word("超".to_string())),
5765 },
5766 PositionalToken {
5767 source: uws,
5768 offset: 27,
5769 length: 3,
5770 token: Token::Word(Word::Word("人".to_string())),
5771 },
5772 PositionalToken {
5773 source: uws,
5774 offset: 30,
5775 length: 3,
5776 token: Token::Word(Word::Word("前".to_string())),
5777 },
5778 PositionalToken {
5779 source: uws,
5780 offset: 33,
5781 length: 3,
5782 token: Token::Word(Word::Word("传".to_string())),
5783 },
5784 PositionalToken {
5785 source: uws,
5786 offset: 36,
5787 length: 3,
5788 token: Token::Special(Special::Punctuation('》')),
5789 },
5790 PositionalToken {
5791 source: uws,
5792 offset: 39,
5793 length: 3,
5794 token: Token::Word(Word::Word("的".to_string())),
5795 },
5796 PositionalToken {
5797 source: uws,
5798 offset: 42,
5799 length: 3,
5800 token: Token::Word(Word::Word("第".to_string())),
5801 },
5802 PositionalToken {
5803 source: uws,
5804 offset: 45,
5805 length: 3,
5806 token: Token::Word(Word::Word("一".to_string())),
5807 },
5808 PositionalToken {
5809 source: uws,
5810 offset: 48,
5811 length: 3,
5812 token: Token::Word(Word::Word("集".to_string())),
5813 },
5814 PositionalToken {
5815 source: uws,
5816 offset: 51,
5817 length: 3,
5818 token: Token::Special(Special::Punctuation('《')),
5819 },
5820 PositionalToken {
5821 source: uws,
5822 offset: 54,
5823 length: 3,
5824 token: Token::Word(Word::Word("试".to_string())),
5825 },
5826 PositionalToken {
5827 source: uws,
5828 offset: 57,
5829 length: 3,
5830 token: Token::Word(Word::Word("播".to_string())),
5831 },
5832 PositionalToken {
5833 source: uws,
5834 offset: 60,
5835 length: 3,
5836 token: Token::Word(Word::Word("集".to_string())),
5837 },
5838 PositionalToken {
5839 source: uws,
5840 offset: 63,
5841 length: 3,
5842 token: Token::Special(Special::Punctuation('》')),
5843 },
5844 PositionalToken {
5845 source: uws,
5846 offset: 66,
5847 length: 3,
5848 token: Token::Word(Word::Word("于".to_string())),
5849 },
5850 PositionalToken {
5851 source: uws,
5852 offset: 69,
5853 length: 4,
5854 token: Token::Word(Word::Number(Number::Integer(2001))),
5855 },
5856 PositionalToken {
5857 source: uws,
5858 offset: 73,
5859 length: 3,
5860 token: Token::Word(Word::Word("年".to_string())),
5861 },
5862 PositionalToken {
5863 source: uws,
5864 offset: 76,
5865 length: 2,
5866 token: Token::Word(Word::Number(Number::Integer(10))),
5867 },
5868 PositionalToken {
5869 source: uws,
5870 offset: 78,
5871 length: 3,
5872 token: Token::Word(Word::Word("月".to_string())),
5873 },
5874 PositionalToken {
5875 source: uws,
5876 offset: 81,
5877 length: 2,
5878 token: Token::Word(Word::Number(Number::Integer(16))),
5879 },
5880 PositionalToken {
5881 source: uws,
5882 offset: 83,
5883 length: 3,
5884 token: Token::Word(Word::Word("日".to_string())),
5885 },
5886 PositionalToken {
5887 source: uws,
5888 offset: 86,
5889 length: 3,
5890 token: Token::Word(Word::Word("在".to_string())),
5891 },
5892 PositionalToken {
5893 source: uws,
5894 offset: 89,
5895 length: 3,
5896 token: Token::Word(Word::Word("電".to_string())),
5897 },
5898 PositionalToken {
5899 source: uws,
5900 offset: 92,
5901 length: 3,
5902 token: Token::Word(Word::Word("視".to_string())),
5903 },
5904 PositionalToken {
5905 source: uws,
5906 offset: 95,
5907 length: 3,
5908 token: Token::Word(Word::Word("網".to_string())),
5909 },
5910 PositionalToken {
5911 source: uws,
5912 offset: 98,
5913 length: 3,
5914 token: Token::Word(Word::Word("首".to_string())),
5915 },
5916 PositionalToken {
5917 source: uws,
5918 offset: 101,
5919 length: 3,
5920 token: Token::Word(Word::Word("播".to_string())),
5921 },
5922 PositionalToken {
5923 source: uws,
5924 offset: 104,
5925 length: 3,
5926 token: Token::Special(Special::Punctuation(',')),
5927 },
5928 PositionalToken {
5929 source: uws,
5930 offset: 107,
5931 length: 3,
5932 token: Token::Word(Word::Word("剧".to_string())),
5933 },
5934 PositionalToken {
5935 source: uws,
5936 offset: 110,
5937 length: 3,
5938 token: Token::Word(Word::Word("集".to_string())),
5939 },
5940 PositionalToken {
5941 source: uws,
5942 offset: 113,
5943 length: 3,
5944 token: Token::Word(Word::Word("主".to_string())),
5945 },
5946 PositionalToken {
5947 source: uws,
5948 offset: 116,
5949 length: 3,
5950 token: Token::Word(Word::Word("创".to_string())),
5951 },
5952 PositionalToken {
5953 source: uws,
5954 offset: 119,
5955 length: 3,
5956 token: Token::Word(Word::Word("人".to_string())),
5957 },
5958 PositionalToken {
5959 source: uws,
5960 offset: 122,
5961 length: 3,
5962 token: Token::Word(Word::Word("阿".to_string())),
5963 },
5964 PositionalToken {
5965 source: uws,
5966 offset: 125,
5967 length: 3,
5968 token: Token::Word(Word::Word("尔".to_string())),
5969 },
5970 PositionalToken {
5971 source: uws,
5972 offset: 128,
5973 length: 3,
5974 token: Token::Word(Word::Word("弗".to_string())),
5975 },
5976 PositionalToken {
5977 source: uws,
5978 offset: 131,
5979 length: 3,
5980 token: Token::Word(Word::Word("雷".to_string())),
5981 },
5982 PositionalToken {
5983 source: uws,
5984 offset: 134,
5985 length: 3,
5986 token: Token::Word(Word::Word("德".to_string())),
5987 },
5988 PositionalToken {
5989 source: uws,
5990 offset: 137,
5991 length: 2,
5992 token: Token::Special(Special::Punctuation('·')),
5993 },
5994 PositionalToken {
5995 source: uws,
5996 offset: 139,
5997 length: 3,
5998 token: Token::Word(Word::Word("高".to_string())),
5999 },
6000 PositionalToken {
6001 source: uws,
6002 offset: 142,
6003 length: 3,
6004 token: Token::Word(Word::Word("夫".to_string())),
6005 },
6006 PositionalToken {
6007 source: uws,
6008 offset: 145,
6009 length: 3,
6010 token: Token::Word(Word::Word("和".to_string())),
6011 },
6012 PositionalToken {
6013 source: uws,
6014 offset: 148,
6015 length: 3,
6016 token: Token::Word(Word::Word("迈".to_string())),
6017 },
6018 PositionalToken {
6019 source: uws,
6020 offset: 151,
6021 length: 3,
6022 token: Token::Word(Word::Word("尔".to_string())),
6023 },
6024 PositionalToken {
6025 source: uws,
6026 offset: 154,
6027 length: 3,
6028 token: Token::Word(Word::Word("斯".to_string())),
6029 },
6030 PositionalToken {
6031 source: uws,
6032 offset: 157,
6033 length: 2,
6034 token: Token::Special(Special::Punctuation('·')),
6035 },
6036 PositionalToken {
6037 source: uws,
6038 offset: 159,
6039 length: 3,
6040 token: Token::Word(Word::Word("米".to_string())),
6041 },
6042 PositionalToken {
6043 source: uws,
6044 offset: 162,
6045 length: 3,
6046 token: Token::Word(Word::Word("勒".to_string())),
6047 },
6048 PositionalToken {
6049 source: uws,
6050 offset: 165,
6051 length: 3,
6052 token: Token::Word(Word::Word("編".to_string())),
6053 },
6054 PositionalToken {
6055 source: uws,
6056 offset: 168,
6057 length: 3,
6058 token: Token::Word(Word::Word("劇".to_string())),
6059 },
6060 PositionalToken {
6061 source: uws,
6062 offset: 171,
6063 length: 3,
6064 token: Token::Special(Special::Punctuation(',')),
6065 },
6066 PositionalToken {
6067 source: uws,
6068 offset: 174,
6069 length: 3,
6070 token: Token::Word(Word::Word("大".to_string())),
6071 },
6072 PositionalToken {
6073 source: uws,
6074 offset: 177,
6075 length: 3,
6076 token: Token::Word(Word::Word("卫".to_string())),
6077 },
6078 PositionalToken {
6079 source: uws,
6080 offset: 180,
6081 length: 2,
6082 token: Token::Special(Special::Punctuation('·')),
6083 },
6084 PositionalToken {
6085 source: uws,
6086 offset: 182,
6087 length: 3,
6088 token: Token::Word(Word::Word("努".to_string())),
6089 },
6090 PositionalToken {
6091 source: uws,
6092 offset: 185,
6093 length: 3,
6094 token: Token::Word(Word::Word("特".to_string())),
6095 },
6096 PositionalToken {
6097 source: uws,
6098 offset: 188,
6099 length: 3,
6100 token: Token::Word(Word::Word("尔".to_string())),
6101 },
6102 PositionalToken {
6103 source: uws,
6104 offset: 191,
6105 length: 3,
6106 token: Token::Word(Word::Word("执".to_string())),
6107 },
6108 PositionalToken {
6109 source: uws,
6110 offset: 194,
6111 length: 3,
6112 token: Token::Word(Word::Word("导".to_string())),
6113 },
6114 PositionalToken {
6115 source: uws,
6116 offset: 197,
6117 length: 3,
6118 token: Token::Special(Special::Punctuation('。')),
6119 },
6120 PositionalToken {
6121 source: uws,
6122 offset: 200,
6123 length: 3,
6124 token: Token::Word(Word::Word("这".to_string())),
6125 },
6126 PositionalToken {
6127 source: uws,
6128 offset: 203,
6129 length: 3,
6130 token: Token::Word(Word::Word("一".to_string())),
6131 },
6132 PositionalToken {
6133 source: uws,
6134 offset: 206,
6135 length: 3,
6136 token: Token::Word(Word::Word("试".to_string())),
6137 },
6138 PositionalToken {
6139 source: uws,
6140 offset: 209,
6141 length: 3,
6142 token: Token::Word(Word::Word("播".to_string())),
6143 },
6144 PositionalToken {
6145 source: uws,
6146 offset: 212,
6147 length: 3,
6148 token: Token::Word(Word::Word("首".to_string())),
6149 },
6150 PositionalToken {
6151 source: uws,
6152 offset: 215,
6153 length: 3,
6154 token: Token::Word(Word::Word("次".to_string())),
6155 },
6156 PositionalToken {
6157 source: uws,
6158 offset: 218,
6159 length: 3,
6160 token: Token::Word(Word::Word("向".to_string())),
6161 },
6162 PositionalToken {
6163 source: uws,
6164 offset: 221,
6165 length: 3,
6166 token: Token::Word(Word::Word("观".to_string())),
6167 },
6168 PositionalToken {
6169 source: uws,
6170 offset: 224,
6171 length: 3,
6172 token: Token::Word(Word::Word("众".to_string())),
6173 },
6174 PositionalToken {
6175 source: uws,
6176 offset: 227,
6177 length: 3,
6178 token: Token::Word(Word::Word("引".to_string())),
6179 },
6180 PositionalToken {
6181 source: uws,
6182 offset: 230,
6183 length: 3,
6184 token: Token::Word(Word::Word("荐".to_string())),
6185 },
6186 PositionalToken {
6187 source: uws,
6188 offset: 233,
6189 length: 3,
6190 token: Token::Word(Word::Word("了".to_string())),
6191 },
6192 PositionalToken {
6193 source: uws,
6194 offset: 236,
6195 length: 3,
6196 token: Token::Word(Word::Word("克".to_string())),
6197 },
6198 PositionalToken {
6199 source: uws,
6200 offset: 239,
6201 length: 3,
6202 token: Token::Word(Word::Word("拉".to_string())),
6203 },
6204 PositionalToken {
6205 source: uws,
6206 offset: 242,
6207 length: 3,
6208 token: Token::Word(Word::Word("克".to_string())),
6209 },
6210 PositionalToken {
6211 source: uws,
6212 offset: 245,
6213 length: 2,
6214 token: Token::Special(Special::Punctuation('·')),
6215 },
6216 PositionalToken {
6217 source: uws,
6218 offset: 247,
6219 length: 3,
6220 token: Token::Word(Word::Word("肯".to_string())),
6221 },
6222 PositionalToken {
6223 source: uws,
6224 offset: 250,
6225 length: 3,
6226 token: Token::Word(Word::Word("特".to_string())),
6227 },
6228 PositionalToken {
6229 source: uws,
6230 offset: 253,
6231 length: 3,
6232 token: Token::Word(Word::Word("一".to_string())),
6233 },
6234 PositionalToken {
6235 source: uws,
6236 offset: 256,
6237 length: 3,
6238 token: Token::Word(Word::Word("角".to_string())),
6239 },
6240 PositionalToken {
6241 source: uws,
6242 offset: 259,
6243 length: 3,
6244 token: Token::Special(Special::Punctuation(',')),
6245 },
6246 PositionalToken {
6247 source: uws,
6248 offset: 262,
6249 length: 3,
6250 token: Token::Word(Word::Word("他".to_string())),
6251 },
6252 PositionalToken {
6253 source: uws,
6254 offset: 265,
6255 length: 3,
6256 token: Token::Word(Word::Word("是".to_string())),
6257 },
6258 PositionalToken {
6259 source: uws,
6260 offset: 268,
6261 length: 3,
6262 token: Token::Word(Word::Word("位".to_string())),
6263 },
6264 PositionalToken {
6265 source: uws,
6266 offset: 271,
6267 length: 3,
6268 token: Token::Word(Word::Word("拥".to_string())),
6269 },
6270 PositionalToken {
6271 source: uws,
6272 offset: 274,
6273 length: 3,
6274 token: Token::Word(Word::Word("有".to_string())),
6275 },
6276 PositionalToken {
6277 source: uws,
6278 offset: 277,
6279 length: 3,
6280 token: Token::Word(Word::Word("超".to_string())),
6281 },
6282 ],
6283 Lang::Jpn => vec![
6284 PositionalToken {
6285 source: uws,
6286 offset: 0,
6287 length: 3,
6288 token: Token::Word(Word::Word("熊".to_string())),
6289 },
6290 PositionalToken {
6291 source: uws,
6292 offset: 3,
6293 length: 3,
6294 token: Token::Word(Word::Word("野".to_string())),
6295 },
6296 PositionalToken {
6297 source: uws,
6298 offset: 6,
6299 length: 3,
6300 token: Token::Word(Word::Word("三".to_string())),
6301 },
6302 PositionalToken {
6303 source: uws,
6304 offset: 9,
6305 length: 3,
6306 token: Token::Word(Word::Word("山".to_string())),
6307 },
6308 PositionalToken {
6309 source: uws,
6310 offset: 12,
6311 length: 3,
6312 token: Token::Word(Word::Word("本".to_string())),
6313 },
6314 PositionalToken {
6315 source: uws,
6316 offset: 15,
6317 length: 3,
6318 token: Token::Word(Word::Word("願".to_string())),
6319 },
6320 PositionalToken {
6321 source: uws,
6322 offset: 18,
6323 length: 3,
6324 token: Token::Word(Word::Word("所".to_string())),
6325 },
6326 PositionalToken {
6327 source: uws,
6328 offset: 21,
6329 length: 3,
6330 token: Token::Word(Word::Word("は".to_string())),
6331 },
6332 PositionalToken {
6333 source: uws,
6334 offset: 24,
6335 length: 3,
6336 token: Token::Special(Special::Punctuation('、')),
6337 },
6338 PositionalToken {
6339 source: uws,
6340 offset: 27,
6341 length: 2,
6342 token: Token::Word(Word::Number(Number::Integer(15))),
6343 },
6344 PositionalToken {
6345 source: uws,
6346 offset: 29,
6347 length: 3,
6348 token: Token::Word(Word::Word("世".to_string())),
6349 },
6350 PositionalToken {
6351 source: uws,
6352 offset: 32,
6353 length: 3,
6354 token: Token::Word(Word::Word("紀".to_string())),
6355 },
6356 PositionalToken {
6357 source: uws,
6358 offset: 35,
6359 length: 3,
6360 token: Token::Word(Word::Word("末".to_string())),
6361 },
6362 PositionalToken {
6363 source: uws,
6364 offset: 38,
6365 length: 3,
6366 token: Token::Word(Word::Word("以".to_string())),
6367 },
6368 PositionalToken {
6369 source: uws,
6370 offset: 41,
6371 length: 3,
6372 token: Token::Word(Word::Word("降".to_string())),
6373 },
6374 PositionalToken {
6375 source: uws,
6376 offset: 44,
6377 length: 3,
6378 token: Token::Word(Word::Word("に".to_string())),
6379 },
6380 PositionalToken {
6381 source: uws,
6382 offset: 47,
6383 length: 3,
6384 token: Token::Word(Word::Word("お".to_string())),
6385 },
6386 PositionalToken {
6387 source: uws,
6388 offset: 50,
6389 length: 3,
6390 token: Token::Word(Word::Word("け".to_string())),
6391 },
6392 PositionalToken {
6393 source: uws,
6394 offset: 53,
6395 length: 3,
6396 token: Token::Word(Word::Word("る".to_string())),
6397 },
6398 PositionalToken {
6399 source: uws,
6400 offset: 56,
6401 length: 3,
6402 token: Token::Word(Word::Word("熊".to_string())),
6403 },
6404 PositionalToken {
6405 source: uws,
6406 offset: 59,
6407 length: 3,
6408 token: Token::Word(Word::Word("野".to_string())),
6409 },
6410 PositionalToken {
6411 source: uws,
6412 offset: 62,
6413 length: 3,
6414 token: Token::Word(Word::Word("三".to_string())),
6415 },
6416 PositionalToken {
6417 source: uws,
6418 offset: 65,
6419 length: 3,
6420 token: Token::Word(Word::Word("山".to_string())),
6421 },
6422 PositionalToken {
6423 source: uws,
6424 offset: 68,
6425 length: 3,
6426 token: Token::Special(Special::Punctuation('(')),
6427 },
6428 PositionalToken {
6429 source: uws,
6430 offset: 71,
6431 length: 3,
6432 token: Token::Word(Word::Word("熊".to_string())),
6433 },
6434 PositionalToken {
6435 source: uws,
6436 offset: 74,
6437 length: 3,
6438 token: Token::Word(Word::Word("野".to_string())),
6439 },
6440 PositionalToken {
6441 source: uws,
6442 offset: 77,
6443 length: 3,
6444 token: Token::Word(Word::Word("本".to_string())),
6445 },
6446 PositionalToken {
6447 source: uws,
6448 offset: 80,
6449 length: 3,
6450 token: Token::Word(Word::Word("宮".to_string())),
6451 },
6452 PositionalToken {
6453 source: uws,
6454 offset: 83,
6455 length: 3,
6456 token: Token::Special(Special::Punctuation('、')),
6457 },
6458 PositionalToken {
6459 source: uws,
6460 offset: 86,
6461 length: 3,
6462 token: Token::Word(Word::Word("熊".to_string())),
6463 },
6464 PositionalToken {
6465 source: uws,
6466 offset: 89,
6467 length: 3,
6468 token: Token::Word(Word::Word("野".to_string())),
6469 },
6470 PositionalToken {
6471 source: uws,
6472 offset: 92,
6473 length: 3,
6474 token: Token::Word(Word::Word("新".to_string())),
6475 },
6476 PositionalToken {
6477 source: uws,
6478 offset: 95,
6479 length: 3,
6480 token: Token::Word(Word::Word("宮".to_string())),
6481 },
6482 PositionalToken {
6483 source: uws,
6484 offset: 98,
6485 length: 3,
6486 token: Token::Special(Special::Punctuation('、')),
6487 },
6488 PositionalToken {
6489 source: uws,
6490 offset: 101,
6491 length: 3,
6492 token: Token::Word(Word::Word("熊".to_string())),
6493 },
6494 PositionalToken {
6495 source: uws,
6496 offset: 104,
6497 length: 3,
6498 token: Token::Word(Word::Word("野".to_string())),
6499 },
6500 PositionalToken {
6501 source: uws,
6502 offset: 107,
6503 length: 3,
6504 token: Token::Word(Word::Word("那".to_string())),
6505 },
6506 PositionalToken {
6507 source: uws,
6508 offset: 110,
6509 length: 3,
6510 token: Token::Word(Word::Word("智".to_string())),
6511 },
6512 PositionalToken {
6513 source: uws,
6514 offset: 113,
6515 length: 3,
6516 token: Token::Special(Special::Punctuation(')')),
6517 },
6518 PositionalToken {
6519 source: uws,
6520 offset: 116,
6521 length: 3,
6522 token: Token::Word(Word::Word("の".to_string())),
6523 },
6524 PositionalToken {
6525 source: uws,
6526 offset: 119,
6527 length: 3,
6528 token: Token::Word(Word::Word("造".to_string())),
6529 },
6530 PositionalToken {
6531 source: uws,
6532 offset: 122,
6533 length: 3,
6534 token: Token::Word(Word::Word("営".to_string())),
6535 },
6536 PositionalToken {
6537 source: uws,
6538 offset: 125,
6539 length: 3,
6540 token: Token::Special(Special::Punctuation('・')),
6541 },
6542 PositionalToken {
6543 source: uws,
6544 offset: 128,
6545 length: 3,
6546 token: Token::Word(Word::Word("修".to_string())),
6547 },
6548 PositionalToken {
6549 source: uws,
6550 offset: 131,
6551 length: 3,
6552 token: Token::Word(Word::Word("造".to_string())),
6553 },
6554 PositionalToken {
6555 source: uws,
6556 offset: 134,
6557 length: 3,
6558 token: Token::Word(Word::Word("の".to_string())),
6559 },
6560 PositionalToken {
6561 source: uws,
6562 offset: 137,
6563 length: 3,
6564 token: Token::Word(Word::Word("た".to_string())),
6565 },
6566 PositionalToken {
6567 source: uws,
6568 offset: 140,
6569 length: 3,
6570 token: Token::Word(Word::Word("め".to_string())),
6571 },
6572 PositionalToken {
6573 source: uws,
6574 offset: 143,
6575 length: 3,
6576 token: Token::Word(Word::Word("の".to_string())),
6577 },
6578 PositionalToken {
6579 source: uws,
6580 offset: 146,
6581 length: 3,
6582 token: Token::Word(Word::Word("勧".to_string())),
6583 },
6584 PositionalToken {
6585 source: uws,
6586 offset: 149,
6587 length: 3,
6588 token: Token::Word(Word::Word("進".to_string())),
6589 },
6590 PositionalToken {
6591 source: uws,
6592 offset: 152,
6593 length: 3,
6594 token: Token::Word(Word::Word("を".to_string())),
6595 },
6596 PositionalToken {
6597 source: uws,
6598 offset: 155,
6599 length: 3,
6600 token: Token::Word(Word::Word("担".to_string())),
6601 },
6602 PositionalToken {
6603 source: uws,
6604 offset: 158,
6605 length: 3,
6606 token: Token::Word(Word::Word("っ".to_string())),
6607 },
6608 PositionalToken {
6609 source: uws,
6610 offset: 161,
6611 length: 3,
6612 token: Token::Word(Word::Word("た".to_string())),
6613 },
6614 PositionalToken {
6615 source: uws,
6616 offset: 164,
6617 length: 3,
6618 token: Token::Word(Word::Word("組".to_string())),
6619 },
6620 PositionalToken {
6621 source: uws,
6622 offset: 167,
6623 length: 3,
6624 token: Token::Word(Word::Word("織".to_string())),
6625 },
6626 PositionalToken {
6627 source: uws,
6628 offset: 170,
6629 length: 3,
6630 token: Token::Word(Word::Word("の".to_string())),
6631 },
6632 PositionalToken {
6633 source: uws,
6634 offset: 173,
6635 length: 3,
6636 token: Token::Word(Word::Word("総".to_string())),
6637 },
6638 PositionalToken {
6639 source: uws,
6640 offset: 176,
6641 length: 3,
6642 token: Token::Word(Word::Word("称".to_string())),
6643 },
6644 PositionalToken {
6645 source: uws,
6646 offset: 179,
6647 length: 3,
6648 token: Token::Special(Special::Punctuation('。')),
6649 },
6650 PositionalToken {
6651 source: uws,
6652 offset: 182,
6653 length: 1,
6654 token: Token::Special(Special::Separator(Separator::Space)),
6655 },
6656 PositionalToken {
6657 source: uws,
6658 offset: 183,
6659 length: 3,
6660 token: Token::Word(Word::Word("熊".to_string())),
6661 },
6662 PositionalToken {
6663 source: uws,
6664 offset: 186,
6665 length: 3,
6666 token: Token::Word(Word::Word("野".to_string())),
6667 },
6668 PositionalToken {
6669 source: uws,
6670 offset: 189,
6671 length: 3,
6672 token: Token::Word(Word::Word("三".to_string())),
6673 },
6674 PositionalToken {
6675 source: uws,
6676 offset: 192,
6677 length: 3,
6678 token: Token::Word(Word::Word("山".to_string())),
6679 },
6680 PositionalToken {
6681 source: uws,
6682 offset: 195,
6683 length: 3,
6684 token: Token::Word(Word::Word("を".to_string())),
6685 },
6686 PositionalToken {
6687 source: uws,
6688 offset: 198,
6689 length: 3,
6690 token: Token::Word(Word::Word("含".to_string())),
6691 },
6692 PositionalToken {
6693 source: uws,
6694 offset: 201,
6695 length: 3,
6696 token: Token::Word(Word::Word("め".to_string())),
6697 },
6698 PositionalToken {
6699 source: uws,
6700 offset: 204,
6701 length: 3,
6702 token: Token::Word(Word::Word("て".to_string())),
6703 },
6704 PositionalToken {
6705 source: uws,
6706 offset: 207,
6707 length: 3,
6708 token: Token::Special(Special::Punctuation('、')),
6709 },
6710 PositionalToken {
6711 source: uws,
6712 offset: 210,
6713 length: 3,
6714 token: Token::Word(Word::Word("日".to_string())),
6715 },
6716 PositionalToken {
6717 source: uws,
6718 offset: 213,
6719 length: 3,
6720 token: Token::Word(Word::Word("本".to_string())),
6721 },
6722 PositionalToken {
6723 source: uws,
6724 offset: 216,
6725 length: 3,
6726 token: Token::Word(Word::Word("に".to_string())),
6727 },
6728 PositionalToken {
6729 source: uws,
6730 offset: 219,
6731 length: 3,
6732 token: Token::Word(Word::Word("お".to_string())),
6733 },
6734 PositionalToken {
6735 source: uws,
6736 offset: 222,
6737 length: 3,
6738 token: Token::Word(Word::Word("け".to_string())),
6739 },
6740 PositionalToken {
6741 source: uws,
6742 offset: 225,
6743 length: 3,
6744 token: Token::Word(Word::Word("る".to_string())),
6745 },
6746 PositionalToken {
6747 source: uws,
6748 offset: 228,
6749 length: 3,
6750 token: Token::Word(Word::Word("古".to_string())),
6751 },
6752 PositionalToken {
6753 source: uws,
6754 offset: 231,
6755 length: 3,
6756 token: Token::Word(Word::Word("代".to_string())),
6757 },
6758 PositionalToken {
6759 source: uws,
6760 offset: 234,
6761 length: 3,
6762 token: Token::Word(Word::Word("か".to_string())),
6763 },
6764 PositionalToken {
6765 source: uws,
6766 offset: 237,
6767 length: 3,
6768 token: Token::Word(Word::Word("ら".to_string())),
6769 },
6770 PositionalToken {
6771 source: uws,
6772 offset: 240,
6773 length: 3,
6774 token: Token::Word(Word::Word("中".to_string())),
6775 },
6776 PositionalToken {
6777 source: uws,
6778 offset: 243,
6779 length: 3,
6780 token: Token::Word(Word::Word("世".to_string())),
6781 },
6782 PositionalToken {
6783 source: uws,
6784 offset: 246,
6785 length: 3,
6786 token: Token::Word(Word::Word("前".to_string())),
6787 },
6788 PositionalToken {
6789 source: uws,
6790 offset: 249,
6791 length: 3,
6792 token: Token::Word(Word::Word("半".to_string())),
6793 },
6794 PositionalToken {
6795 source: uws,
6796 offset: 252,
6797 length: 3,
6798 token: Token::Word(Word::Word("に".to_string())),
6799 },
6800 PositionalToken {
6801 source: uws,
6802 offset: 255,
6803 length: 3,
6804 token: Token::Word(Word::Word("か".to_string())),
6805 },
6806 PositionalToken {
6807 source: uws,
6808 offset: 258,
6809 length: 3,
6810 token: Token::Word(Word::Word("け".to_string())),
6811 },
6812 PositionalToken {
6813 source: uws,
6814 offset: 261,
6815 length: 3,
6816 token: Token::Word(Word::Word("て".to_string())),
6817 },
6818 PositionalToken {
6819 source: uws,
6820 offset: 264,
6821 length: 3,
6822 token: Token::Word(Word::Word("の".to_string())),
6823 },
6824 PositionalToken {
6825 source: uws,
6826 offset: 267,
6827 length: 3,
6828 token: Token::Word(Word::Word("寺".to_string())),
6829 },
6830 PositionalToken {
6831 source: uws,
6832 offset: 270,
6833 length: 3,
6834 token: Token::Word(Word::Word("社".to_string())),
6835 },
6836 PositionalToken {
6837 source: uws,
6838 offset: 273,
6839 length: 3,
6840 token: Token::Word(Word::Word("の".to_string())),
6841 },
6842 PositionalToken {
6843 source: uws,
6844 offset: 276,
6845 length: 3,
6846 token: Token::Word(Word::Word("造".to_string())),
6847 },
6848 PositionalToken {
6849 source: uws,
6850 offset: 279,
6851 length: 3,
6852 token: Token::Word(Word::Word("営".to_string())),
6853 },
6854 PositionalToken {
6855 source: uws,
6856 offset: 282,
6857 length: 3,
6858 token: Token::Word(Word::Word("は".to_string())),
6859 },
6860 PositionalToken {
6861 source: uws,
6862 offset: 285,
6863 length: 3,
6864 token: Token::Special(Special::Punctuation('、')),
6865 },
6866 PositionalToken {
6867 source: uws,
6868 offset: 288,
6869 length: 3,
6870 token: Token::Word(Word::Word("寺".to_string())),
6871 },
6872 PositionalToken {
6873 source: uws,
6874 offset: 291,
6875 length: 3,
6876 token: Token::Word(Word::Word("社".to_string())),
6877 },
6878 ],
6879 Lang::Kor => vec![
6880 PositionalToken {
6881 source: uws,
6882 offset: 0,
6883 length: 21,
6884 token: Token::Word(Word::Word("플레이스테이션".to_string())),
6885 },
6886 PositionalToken {
6887 source: uws,
6888 offset: 21,
6889 length: 1,
6890 token: Token::Special(Special::Separator(Separator::Space)),
6891 },
6892 PositionalToken {
6893 source: uws,
6894 offset: 22,
6895 length: 3,
6896 token: Token::Word(Word::Word("은".to_string())),
6897 },
6898 PositionalToken {
6899 source: uws,
6900 offset: 25,
6901 length: 1,
6902 token: Token::Special(Special::Separator(Separator::Space)),
6903 },
6904 PositionalToken {
6905 source: uws,
6906 offset: 26,
6907 length: 6,
6908 token: Token::Word(Word::Word("소니".to_string())),
6909 },
6910 PositionalToken {
6911 source: uws,
6912 offset: 32,
6913 length: 1,
6914 token: Token::Special(Special::Separator(Separator::Space)),
6915 },
6916 PositionalToken {
6917 source: uws,
6918 offset: 33,
6919 length: 9,
6920 token: Token::Word(Word::Word("컴퓨터".to_string())),
6921 },
6922 PositionalToken {
6923 source: uws,
6924 offset: 42,
6925 length: 1,
6926 token: Token::Special(Special::Separator(Separator::Space)),
6927 },
6928 PositionalToken {
6929 source: uws,
6930 offset: 43,
6931 length: 21,
6932 token: Token::Word(Word::Word("엔터테인먼트가".to_string())),
6933 },
6934 PositionalToken {
6935 source: uws,
6936 offset: 64,
6937 length: 1,
6938 token: Token::Special(Special::Separator(Separator::Space)),
6939 },
6940 PositionalToken {
6941 source: uws,
6942 offset: 65,
6943 length: 9,
6944 token: Token::Word(Word::Word("개발한".to_string())),
6945 },
6946 PositionalToken {
6947 source: uws,
6948 offset: 74,
6949 length: 1,
6950 token: Token::Special(Special::Separator(Separator::Space)),
6951 },
6952 PositionalToken {
6953 source: uws,
6954 offset: 75,
6955 length: 3,
6956 token: Token::Word(Word::Word("세".to_string())),
6957 },
6958 PositionalToken {
6959 source: uws,
6960 offset: 78,
6961 length: 1,
6962 token: Token::Special(Special::Separator(Separator::Space)),
6963 },
6964 PositionalToken {
6965 source: uws,
6966 offset: 79,
6967 length: 6,
6968 token: Token::Word(Word::Word("번째".to_string())),
6969 },
6970 PositionalToken {
6971 source: uws,
6972 offset: 85,
6973 length: 1,
6974 token: Token::Special(Special::Separator(Separator::Space)),
6975 },
6976 PositionalToken {
6977 source: uws,
6978 offset: 86,
6979 length: 9,
6980 token: Token::Word(Word::Word("가정용".to_string())),
6981 },
6982 PositionalToken {
6983 source: uws,
6984 offset: 95,
6985 length: 1,
6986 token: Token::Special(Special::Separator(Separator::Space)),
6987 },
6988 PositionalToken {
6989 source: uws,
6990 offset: 96,
6991 length: 15,
6992 token: Token::Word(Word::Word("게임기이다".to_string())),
6993 },
6994 PositionalToken {
6995 source: uws,
6996 offset: 111,
6997 length: 1,
6998 token: Token::Special(Special::Punctuation('.')),
6999 },
7000 PositionalToken {
7001 source: uws,
7002 offset: 112,
7003 length: 1,
7004 token: Token::Special(Special::Separator(Separator::Space)),
7005 },
7006 PositionalToken {
7007 source: uws,
7008 offset: 113,
7009 length: 24,
7010 token: Token::Word(Word::Word("마이크로소프트의".to_string())),
7011 },
7012 PositionalToken {
7013 source: uws,
7014 offset: 137,
7015 length: 1,
7016 token: Token::Special(Special::Separator(Separator::Space)),
7017 },
7018 PositionalToken {
7019 source: uws,
7020 offset: 138,
7021 length: 12,
7022 token: Token::Word(Word::Word("엑스박스".to_string())),
7023 },
7024 PositionalToken {
7025 source: uws,
7026 offset: 150,
7027 length: 1,
7028 token: Token::Special(Special::Separator(Separator::Space)),
7029 },
7030 PositionalToken {
7031 source: uws,
7032 offset: 151,
7033 length: 3,
7034 token: Token::Word(Word::Number(Number::Integer(360))),
7035 },
7036 PositionalToken {
7037 source: uws,
7038 offset: 154,
7039 length: 1,
7040 token: Token::Special(Special::Punctuation(',')),
7041 },
7042 PositionalToken {
7043 source: uws,
7044 offset: 155,
7045 length: 1,
7046 token: Token::Special(Special::Separator(Separator::Space)),
7047 },
7048 PositionalToken {
7049 source: uws,
7050 offset: 156,
7051 length: 12,
7052 token: Token::Word(Word::Word("닌텐도의".to_string())),
7053 },
7054 PositionalToken {
7055 source: uws,
7056 offset: 168,
7057 length: 1,
7058 token: Token::Special(Special::Separator(Separator::Space)),
7059 },
7060 PositionalToken {
7061 source: uws,
7062 offset: 169,
7063 length: 6,
7064 token: Token::Word(Word::Word("Wii와".to_string())),
7065 },
7066 PositionalToken {
7067 source: uws,
7068 offset: 175,
7069 length: 1,
7070 token: Token::Special(Special::Separator(Separator::Space)),
7071 },
7072 PositionalToken {
7073 source: uws,
7074 offset: 176,
7075 length: 12,
7076 token: Token::Word(Word::Word("경쟁하고".to_string())),
7077 },
7078 PositionalToken {
7079 source: uws,
7080 offset: 188,
7081 length: 1,
7082 token: Token::Special(Special::Separator(Separator::Space)),
7083 },
7084 PositionalToken {
7085 source: uws,
7086 offset: 189,
7087 length: 6,
7088 token: Token::Word(Word::Word("있다".to_string())),
7089 },
7090 PositionalToken {
7091 source: uws,
7092 offset: 195,
7093 length: 1,
7094 token: Token::Special(Special::Punctuation('.')),
7095 },
7096 PositionalToken {
7097 source: uws,
7098 offset: 196,
7099 length: 1,
7100 token: Token::Special(Special::Separator(Separator::Space)),
7101 },
7102 PositionalToken {
7103 source: uws,
7104 offset: 197,
7105 length: 6,
7106 token: Token::Word(Word::Word("이전".to_string())),
7107 },
7108 PositionalToken {
7109 source: uws,
7110 offset: 203,
7111 length: 1,
7112 token: Token::Special(Special::Separator(Separator::Space)),
7113 },
7114 PositionalToken {
7115 source: uws,
7116 offset: 204,
7117 length: 12,
7118 token: Token::Word(Word::Word("제품에서".to_string())),
7119 },
7120 PositionalToken {
7121 source: uws,
7122 offset: 216,
7123 length: 1,
7124 token: Token::Special(Special::Separator(Separator::Space)),
7125 },
7126 PositionalToken {
7127 source: uws,
7128 offset: 217,
7129 length: 9,
7130 token: Token::Word(Word::Word("온라인".to_string())),
7131 },
7132 PositionalToken {
7133 source: uws,
7134 offset: 226,
7135 length: 1,
7136 token: Token::Special(Special::Separator(Separator::Space)),
7137 },
7138 PositionalToken {
7139 source: uws,
7140 offset: 227,
7141 length: 9,
7142 token: Token::Word(Word::Word("플레이".to_string())),
7143 },
7144 PositionalToken {
7145 source: uws,
7146 offset: 236,
7147 length: 1,
7148 token: Token::Special(Special::Separator(Separator::Space)),
7149 },
7150 PositionalToken {
7151 source: uws,
7152 offset: 237,
7153 length: 3,
7154 token: Token::Word(Word::Word("기".to_string())),
7155 },
7156 ],
7157 Lang::Ara => vec![
7158 PositionalToken {
7159 source: uws,
7160 offset: 0,
7161 length: 14,
7162 token: Token::Word(Word::Word("لشکرکشی".to_string())),
7163 },
7164 PositionalToken {
7165 source: uws,
7166 offset: 14,
7167 length: 3,
7168 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
7169 },
7170 PositionalToken {
7171 source: uws,
7172 offset: 17,
7173 length: 6,
7174 token: Token::Word(Word::Word("های".to_string())),
7175 },
7176 PositionalToken {
7177 source: uws,
7178 offset: 23,
7179 length: 1,
7180 token: Token::Special(Special::Separator(Separator::Space)),
7181 },
7182 PositionalToken {
7183 source: uws,
7184 offset: 24,
7185 length: 6,
7186 token: Token::Word(Word::Word("روس".to_string())),
7187 },
7188 PositionalToken {
7189 source: uws,
7190 offset: 30,
7191 length: 3,
7192 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
7193 },
7194 PositionalToken {
7195 source: uws,
7196 offset: 33,
7197 length: 6,
7198 token: Token::Word(Word::Word("های".to_string())),
7199 },
7200 PositionalToken {
7201 source: uws,
7202 offset: 39,
7203 length: 1,
7204 token: Token::Special(Special::Separator(Separator::Space)),
7205 },
7206 PositionalToken {
7207 source: uws,
7208 offset: 40,
7209 length: 12,
7210 token: Token::Word(Word::Word("وارنگی".to_string())),
7211 },
7212 PositionalToken {
7213 source: uws,
7214 offset: 52,
7215 length: 1,
7216 token: Token::Special(Special::Separator(Separator::Space)),
7217 },
7218 PositionalToken {
7219 source: uws,
7220 offset: 53,
7221 length: 4,
7222 token: Token::Word(Word::Word("به".to_string())),
7223 },
7224 PositionalToken {
7225 source: uws,
7226 offset: 57,
7227 length: 1,
7228 token: Token::Special(Special::Separator(Separator::Space)),
7229 },
7230 PositionalToken {
7231 source: uws,
7232 offset: 58,
7233 length: 10,
7234 token: Token::Word(Word::Word("دریای".to_string())),
7235 },
7236 PositionalToken {
7237 source: uws,
7238 offset: 68,
7239 length: 1,
7240 token: Token::Special(Special::Separator(Separator::Space)),
7241 },
7242 PositionalToken {
7243 source: uws,
7244 offset: 69,
7245 length: 6,
7246 token: Token::Word(Word::Word("خزر".to_string())),
7247 },
7248 PositionalToken {
7249 source: uws,
7250 offset: 75,
7251 length: 1,
7252 token: Token::Special(Special::Separator(Separator::Space)),
7253 },
7254 PositionalToken {
7255 source: uws,
7256 offset: 76,
7257 length: 12,
7258 token: Token::Word(Word::Word("مجموعه".to_string())),
7259 },
7260 PositionalToken {
7261 source: uws,
7262 offset: 88,
7263 length: 3,
7264 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
7265 },
7266 PositionalToken {
7267 source: uws,
7268 offset: 91,
7269 length: 4,
7270 token: Token::Word(Word::Word("ای".to_string())),
7271 },
7272 PositionalToken {
7273 source: uws,
7274 offset: 95,
7275 length: 1,
7276 token: Token::Special(Special::Separator(Separator::Space)),
7277 },
7278 PositionalToken {
7279 source: uws,
7280 offset: 96,
7281 length: 4,
7282 token: Token::Word(Word::Word("از".to_string())),
7283 },
7284 PositionalToken {
7285 source: uws,
7286 offset: 100,
7287 length: 1,
7288 token: Token::Special(Special::Separator(Separator::Space)),
7289 },
7290 PositionalToken {
7291 source: uws,
7292 offset: 101,
7293 length: 10,
7294 token: Token::Word(Word::Word("حملات".to_string())),
7295 },
7296 PositionalToken {
7297 source: uws,
7298 offset: 111,
7299 length: 1,
7300 token: Token::Special(Special::Separator(Separator::Space)),
7301 },
7302 PositionalToken {
7303 source: uws,
7304 offset: 112,
7305 length: 10,
7306 token: Token::Word(Word::Word("نظامی".to_string())),
7307 },
7308 PositionalToken {
7309 source: uws,
7310 offset: 122,
7311 length: 1,
7312 token: Token::Special(Special::Separator(Separator::Space)),
7313 },
7314 PositionalToken {
7315 source: uws,
7316 offset: 123,
7317 length: 4,
7318 token: Token::Word(Word::Word("در".to_string())),
7319 },
7320 PositionalToken {
7321 source: uws,
7322 offset: 127,
7323 length: 1,
7324 token: Token::Special(Special::Separator(Separator::Space)),
7325 },
7326 PositionalToken {
7327 source: uws,
7328 offset: 128,
7329 length: 6,
7330 token: Token::Word(Word::Word("بین".to_string())),
7331 },
7332 PositionalToken {
7333 source: uws,
7334 offset: 134,
7335 length: 1,
7336 token: Token::Special(Special::Separator(Separator::Space)),
7337 },
7338 PositionalToken {
7339 source: uws,
7340 offset: 135,
7341 length: 6,
7342 token: Token::Word(Word::Word("سال".to_string())),
7343 },
7344 PositionalToken {
7345 source: uws,
7346 offset: 141,
7347 length: 3,
7348 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
7349 },
7350 PositionalToken {
7351 source: uws,
7352 offset: 144,
7353 length: 6,
7354 token: Token::Word(Word::Word("های".to_string())),
7355 },
7356 PositionalToken {
7357 source: uws,
7358 offset: 150,
7359 length: 1,
7360 token: Token::Special(Special::Separator(Separator::Space)),
7361 },
7362 PositionalToken {
7363 source: uws,
7364 offset: 151,
7365 length: 6,
7366 token: Token::Word(Word::StrangeWord("۸۶۴".to_string())),
7367 },
7368 PositionalToken {
7369 source: uws,
7370 offset: 157,
7371 length: 1,
7372 token: Token::Special(Special::Separator(Separator::Space)),
7373 },
7374 PositionalToken {
7375 source: uws,
7376 offset: 158,
7377 length: 4,
7378 token: Token::Word(Word::Word("تا".to_string())),
7379 },
7380 PositionalToken {
7381 source: uws,
7382 offset: 162,
7383 length: 1,
7384 token: Token::Special(Special::Separator(Separator::Space)),
7385 },
7386 PositionalToken {
7387 source: uws,
7388 offset: 163,
7389 length: 8,
7390 token: Token::Word(Word::StrangeWord("۱۰۴۱".to_string())),
7391 },
7392 PositionalToken {
7393 source: uws,
7394 offset: 171,
7395 length: 1,
7396 token: Token::Special(Special::Separator(Separator::Space)),
7397 },
7398 PositionalToken {
7399 source: uws,
7400 offset: 172,
7401 length: 12,
7402 token: Token::Word(Word::Word("میلادی".to_string())),
7403 },
7404 PositionalToken {
7405 source: uws,
7406 offset: 184,
7407 length: 1,
7408 token: Token::Special(Special::Separator(Separator::Space)),
7409 },
7410 PositionalToken {
7411 source: uws,
7412 offset: 185,
7413 length: 2,
7414 token: Token::Word(Word::Word("ب".to_string())),
7415 },
7416 ],
7417 Lang::Ell => vec![
7418 PositionalToken {
7419 source: uws,
7420 offset: 0,
7421 length: 4,
7422 token: Token::Word(Word::Word("Το".to_string())),
7423 },
7424 PositionalToken {
7425 source: uws,
7426 offset: 4,
7427 length: 1,
7428 token: Token::Special(Special::Separator(Separator::Space)),
7429 },
7430 PositionalToken {
7431 source: uws,
7432 offset: 5,
7433 length: 18,
7434 token: Token::Word(Word::Word("Πρόγραμμα".to_string())),
7435 },
7436 PositionalToken {
7437 source: uws,
7438 offset: 23,
7439 length: 1,
7440 token: Token::Special(Special::Separator(Separator::Space)),
7441 },
7442 PositionalToken {
7443 source: uws,
7444 offset: 24,
7445 length: 22,
7446 token: Token::Word(Word::Word("υλοποιείται".to_string())),
7447 },
7448 PositionalToken {
7449 source: uws,
7450 offset: 46,
7451 length: 1,
7452 token: Token::Special(Special::Separator(Separator::Space)),
7453 },
7454 PositionalToken {
7455 source: uws,
7456 offset: 47,
7457 length: 4,
7458 token: Token::Word(Word::Word("εξ".to_string())),
7459 },
7460 PositionalToken {
7461 source: uws,
7462 offset: 51,
7463 length: 1,
7464 token: Token::Special(Special::Separator(Separator::Space)),
7465 },
7466 PositionalToken {
7467 source: uws,
7468 offset: 52,
7469 length: 18,
7470 token: Token::Word(Word::Word("ολοκλήρου".to_string())),
7471 },
7472 PositionalToken {
7473 source: uws,
7474 offset: 70,
7475 length: 1,
7476 token: Token::Special(Special::Separator(Separator::Space)),
7477 },
7478 PositionalToken {
7479 source: uws,
7480 offset: 71,
7481 length: 6,
7482 token: Token::Word(Word::Word("από".to_string())),
7483 },
7484 PositionalToken {
7485 source: uws,
7486 offset: 77,
7487 length: 1,
7488 token: Token::Special(Special::Separator(Separator::Space)),
7489 },
7490 PositionalToken {
7491 source: uws,
7492 offset: 78,
7493 length: 16,
7494 token: Token::Word(Word::Word("απόσταση".to_string())),
7495 },
7496 PositionalToken {
7497 source: uws,
7498 offset: 94,
7499 length: 1,
7500 token: Token::Special(Special::Separator(Separator::Space)),
7501 },
7502 PositionalToken {
7503 source: uws,
7504 offset: 95,
7505 length: 6,
7506 token: Token::Word(Word::Word("και".to_string())),
7507 },
7508 PositionalToken {
7509 source: uws,
7510 offset: 101,
7511 length: 1,
7512 token: Token::Special(Special::Separator(Separator::Space)),
7513 },
7514 PositionalToken {
7515 source: uws,
7516 offset: 102,
7517 length: 12,
7518 token: Token::Word(Word::Word("μπορεί".to_string())),
7519 },
7520 PositionalToken {
7521 source: uws,
7522 offset: 114,
7523 length: 1,
7524 token: Token::Special(Special::Separator(Separator::Space)),
7525 },
7526 PositionalToken {
7527 source: uws,
7528 offset: 115,
7529 length: 4,
7530 token: Token::Word(Word::Word("να".to_string())),
7531 },
7532 PositionalToken {
7533 source: uws,
7534 offset: 119,
7535 length: 1,
7536 token: Token::Special(Special::Separator(Separator::Space)),
7537 },
7538 PositionalToken {
7539 source: uws,
7540 offset: 120,
7541 length: 20,
7542 token: Token::Word(Word::Word("συμμετέχει".to_string())),
7543 },
7544 PositionalToken {
7545 source: uws,
7546 offset: 140,
7547 length: 1,
7548 token: Token::Special(Special::Separator(Separator::Space)),
7549 },
7550 PositionalToken {
7551 source: uws,
7552 offset: 141,
7553 length: 8,
7554 token: Token::Word(Word::Word("κάθε".to_string())),
7555 },
7556 PositionalToken {
7557 source: uws,
7558 offset: 149,
7559 length: 1,
7560 token: Token::Special(Special::Separator(Separator::Space)),
7561 },
7562 PositionalToken {
7563 source: uws,
7564 offset: 150,
7565 length: 24,
7566 token: Token::Word(Word::Word("εμπλεκόμενος".to_string())),
7567 },
7568 PositionalToken {
7569 source: uws,
7570 offset: 174,
7571 length: 1,
7572 token: Token::Special(Special::Separator(Separator::Space)),
7573 },
7574 PositionalToken {
7575 source: uws,
7576 offset: 175,
7577 length: 6,
7578 token: Token::Word(Word::Word("στη".to_string())),
7579 },
7580 PositionalToken {
7581 source: uws,
7582 offset: 181,
7583 length: 1,
7584 token: Token::Special(Special::Separator(Separator::Space)),
7585 },
7586 PositionalToken {
7587 source: uws,
7588 offset: 182,
7589 length: 2,
7590 token: Token::Word(Word::Word("ή".to_string())),
7591 },
7592 PositionalToken {
7593 source: uws,
7594 offset: 184,
7595 length: 1,
7596 token: Token::Special(Special::Punctuation('/')),
7597 },
7598 ],
7599 };
7600 (
7601 uws.chars()
7602 .take(100)
7603 .fold(String::new(), |acc, c| acc + &format!("{}", c)),
7604 tokens,
7605 )
7606 }
7607}