1use std::sync::Arc;
2use text_parsing::{Breaker, IntoSource, Local, Localize, Snip, Source, SourceEvent};
3
4mod emoji;
5pub use emoji::EMOJIMAP;
6
7mod breakers;
8pub use breakers::{SentenceBreaker, UnicodeSentenceBreaker};
9
10mod wordbreaker;
11
12mod options;
13pub use options::{IntoTokenizer, TokenizerOptions, TokenizerParams};
14
15mod tokens;
16pub use tokens::Tokens;
17
18mod text_tokens;
19use text_tokens::InnerBound;
20pub use text_tokens::TextTokens;
21
22#[derive(Debug)]
23pub enum Error {
24 TextParser(text_parsing::Error),
25}
26
27const EPS: f64 = 1e-8;
28
29#[cfg(feature = "strings")]
30#[derive(Debug, Clone, PartialEq, PartialOrd)]
31pub enum Number {
32 Integer(i64),
33 Float(f64),
34 ZeroInteger { i: i64, s: String },
36}
37
38#[cfg(not(feature = "strings"))]
39#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
40pub enum Number {
41 Integer(i64),
42 Float(f64),
43 ZeroInteger { i: i64 },
44}
45
46impl Number {
47 pub fn as_f64(&self) -> f64 {
48 match self {
49 Number::Integer(i) => *i as f64,
50 Number::Float(f) => *f,
51 Number::ZeroInteger { i, .. } => *i as f64,
52 }
53 }
54}
55impl Ord for Number {
56 fn cmp(&self, other: &Number) -> std::cmp::Ordering {
57 let s = self.as_f64();
58 let o = other.as_f64();
59 let d = s - o;
60 match d.abs() < EPS {
61 true => std::cmp::Ordering::Equal,
62 false => {
63 if d > 0.0 {
64 return std::cmp::Ordering::Greater;
65 }
66 if d < 0.0 {
67 return std::cmp::Ordering::Less;
68 }
69 std::cmp::Ordering::Equal
70 }
71 }
72 }
73}
74impl Eq for Number {}
75
76#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
77pub enum Separator {
78 Space,
79 Tab,
80 Newline,
81 Char(char),
82}
83
84#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
85pub enum Formatter {
86 Char(char),
87 Joiner, }
89
90#[derive(Debug, Clone, Copy, PartialEq, Eq, Ord, PartialOrd)]
91pub enum Special {
92 Currency(char),
93 Punctuation(char),
94 Symbol(char),
95 Separator(Separator),
96}
97
98#[cfg(feature = "strings")]
99#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
100pub enum Word {
101 Word(String),
102 StrangeWord(String),
103 Numerical(Numerical),
104 Number(Number),
105 Emoji(&'static str),
106}
107
108#[cfg(feature = "strings")]
109#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
110pub enum Numerical {
111 DotSeparated(String),
115 Measures(String),
116 Alphanumeric(String),
117}
118
119#[cfg(feature = "strings")]
120#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
121pub enum Struct {
122 Hashtag(String),
123 Mention(String),
124 }
126
127#[cfg(feature = "strings")]
128#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
129pub enum Unicode {
130 String(String),
131 Formatter(Formatter),
132}
133
134#[cfg(not(feature = "strings"))]
135#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
136pub enum Word {
137 Word,
138 StrangeWord,
139 Numerical(Numerical),
140 Number(Number),
141 Emoji(&'static str),
142}
143
144#[cfg(not(feature = "strings"))]
145#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
146pub enum Numerical {
147 DotSeparated,
151 Measures,
152 Alphanumeric,
153}
154
155#[cfg(not(feature = "strings"))]
156#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
157pub enum Struct {
158 Hashtag,
159 Mention,
160 }
162
163#[cfg(not(feature = "strings"))]
164#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
165pub enum Unicode {
166 String,
167 Formatter(Formatter),
168}
169
170#[cfg(feature = "strings")]
171#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
172pub enum Token {
173 Word(Word),
174 Struct(Struct),
175 Special(Special),
176 Unicode(Unicode),
177}
178
179#[cfg(not(feature = "strings"))]
180#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
181pub enum Token {
182 Word(Word),
183 Struct(Struct),
184 Special(Special),
185 Unicode(Unicode),
186}
187
188#[derive(Debug)]
202pub struct TextStr<'s> {
203 buffer: &'s str,
204 localities: Arc<Vec<TextLocality>>,
205 breakers: Arc<Vec<InnerBound>>,
206}
207impl<'s> TextStr<'s> {
208 pub fn new<'a>(s: &'a str) -> Result<TextStr<'a>, Error> {
209 let text = inner_new(s.into_source(), false)?;
210 Ok(TextStr {
211 buffer: s,
212 localities: text.localities,
213 breakers: text.breakers,
214 })
215 }
216}
217
218fn inner_new<S: Source>(mut source: S, with_buffer: bool) -> Result<Text, Error> {
219 let mut buffer = String::new();
220 let mut localities = Vec::new();
221 let mut breakers = Vec::new();
222 let mut buffer_len = 0;
223
224 while let Some(local_se) = source.next_char().map_err(Error::TextParser)? {
225 let (local, se) = local_se.into_inner();
226 let c = match se {
227 SourceEvent::Char(c) => match c {
228 '\u{0060}' => '\u{0027}',
229 _ => c,
230 },
231 SourceEvent::Breaker(b) => {
232 let (c, opt_b) = match b {
233 Breaker::None => continue,
234 Breaker::Space => (' ', None),
235 Breaker::Line => ('\n', None),
236 Breaker::Word => ('\u{200B}', Some(b)), Breaker::Sentence | Breaker::Paragraph | Breaker::Section => ('\n', Some(b)),
238 };
239 if let Some(b) = opt_b {
240 let br = InnerBound {
241 bytes: Snip {
242 offset: buffer_len,
243 length: c.len_utf8(),
244 },
245 chars: Snip {
246 offset: localities.len(),
247 length: 1,
248 },
249 breaker: b,
250 original: Some(local),
251 };
252 breakers.push(br);
254 }
255 c
256 }
257 };
258
259 let buf_local = ().localize(
260 Snip {
261 offset: localities.len(),
263 length: 1,
264 },
265 Snip {
266 offset: buffer_len,
268 length: c.len_utf8(),
269 },
270 );
271 if with_buffer {
272 buffer.push(c);
273 }
274 buffer_len += c.len_utf8();
275 localities.push(TextLocality {
276 buffer: buf_local,
277 original: local,
278 });
279 }
280 Ok(Text {
281 buffer: Arc::new(buffer),
282 localities: Arc::new(localities),
283 breakers: Arc::new(breakers),
284 })
285}
286
287#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
288pub struct TextLocality {
289 pub buffer: Local<()>,
290 pub original: Local<()>,
291}
292
293#[derive(Debug)]
294pub struct Text {
295 buffer: Arc<String>,
296 localities: Arc<Vec<TextLocality>>,
297 breakers: Arc<Vec<InnerBound>>,
298}
299impl Text {
300 pub fn new<S: Source>(source: S) -> Result<Text, Error> {
301 inner_new(source, true)
302 }
303 pub fn token_text<'s>(&'s self, token: &TextToken) -> &'s str {
304 let Snip {
305 offset: begin,
306 length: len,
307 } = token.locality.bytes();
308 let end = begin + len;
309 &self.buffer[begin..end]
310 }
311 pub fn text(&self) -> &str {
312 self.buffer.as_ref()
313 }
314 pub fn original_locality(&self, idx: usize) -> Option<Local<()>> {
315 self.localities.get(idx).map(|tl| tl.original)
316 }
317 pub fn localities(&self) -> &Vec<TextLocality> {
318 self.localities.as_ref()
319 }
320 pub fn shared_text(&self) -> Text {
321 Text {
322 buffer: self.buffer.clone(),
323 localities: self.localities.clone(),
324 breakers: self.breakers.clone(),
325 }
326 }
327}
328
329impl TryFrom<String> for Text {
330 type Error = Error;
331
332 fn try_from(s: String) -> Result<Text, Error> {
333 let mut text = inner_new((&s).into_source(), false)?;
334 text.buffer = Arc::new(s);
335 Ok(text)
336 }
337}
338
339impl TryFrom<&str> for Text {
340 type Error = Error;
341
342 fn try_from(s: &str) -> Result<Text, Error> {
343 Text::new(s.into_source())
344 }
345}
346
347#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
348pub enum Bound {
349 Sentence,
350 Paragraph,
351 Section,
352}
353
354#[cfg(feature = "strings")]
355#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
356pub struct TextToken {
357 locality: Local<()>,
358 original: Option<Local<()>>,
359 pub token: Token2,
360}
361
362#[cfg(not(feature = "strings"))]
363#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
364pub struct TextToken {
365 locality: Local<()>,
366 original: Option<Local<()>>,
367 pub token: Token2,
368}
369
370#[cfg(test)]
371impl TextToken {
372 fn into_original_token_1(self) -> Option<Local<Token>> {
373 match self.original {
374 Some(original) => self.token.into_token().map(|t| original.local(t)),
375 None => None,
376 }
377 }
378}
379
380impl TextToken {
381 pub fn local(&self) -> Local<()> {
382 self.locality
383 }
384 pub fn original(&self) -> Option<Local<()>> {
385 self.original
386 }
387 pub fn into_position(mut self) -> TextToken {
388 self.locality = self.locality.into_position();
389 self.original = self.original.map(|or| or.into_position());
390 self
391 }
392 pub fn try_as_token(&self) -> Result<Token, Bound> {
393 self.token.try_as_token()
394 }
395 pub fn as_original_token(&self) -> Option<Local<&Token2>> {
396 self.original.map(|original| original.local(&self.token))
397 }
398 pub fn into_original_token(self) -> Option<Local<Token2>> {
399 self.original.map(|original| original.local(self.token))
400 }
401 pub fn original_str<'s>(&self, original: &'s str) -> Result<&'s str, OriginalError> {
402 match self.original {
403 Some(local) => {
404 let Snip {
405 offset: begin,
406 length: len,
407 } = local.bytes();
408 let end = begin + len;
409 match original.get(begin..end) {
410 Some(s) => Ok(s),
411 None => Err(OriginalError::InvalidSnip),
412 }
413 }
414 None => Err(OriginalError::NoOriginal),
415 }
416 }
417
418 #[cfg(feature = "strings")]
419 fn token_clone(&self) -> Token2 {
420 self.token.clone()
421 }
422
423 #[cfg(not(feature = "strings"))]
424 fn token_clone(&self) -> Token2 {
425 self.token
426 }
427
428 pub fn merge_tokens(
429 &self,
430 other: &TextToken,
431 new_token: Option<Token2>,
432 ) -> Result<TextToken, TextToken> {
433 let (local, left_lb, left_lc) = add_local(&self.locality, &other.locality);
434 let must_be_left = left_lb;
435 let mut ok = must_be_left == left_lc;
436 let orig = match (&self.original, &other.original) {
437 (None, None) => None,
438 (Some(o), None) | (None, Some(o)) => Some(*o),
439 (Some(s), Some(o)) => {
440 let (orig, lb, lc) = add_local(s, o);
441 ok &= must_be_left == lb;
442 ok &= must_be_left == lc;
443 Some(orig)
444 }
445 };
446 let token = TextToken {
447 locality: local,
448 original: orig,
449 token: match new_token {
450 Some(t) => t,
451 None => self.token_clone(),
452 },
453 };
454 match ok {
455 true => Ok(token),
456 false => Err(token),
457 }
458 }
459}
460
461fn add_local(slf: &Local<()>, other: &Local<()>) -> (Local<()>, bool, bool) {
462 let b1 = slf.bytes();
464 let b2 = other.bytes();
465 let c1 = slf.chars();
466 let c2 = other.chars();
467 let (bytes, slf_is_left_by_bytes) = match b1.offset < b2.offset {
468 true => (
469 Snip {
470 offset: b1.offset,
471 length: (b2.offset + b2.length) - b1.offset,
472 },
473 true,
474 ),
475 false => (
476 Snip {
477 offset: b2.offset,
478 length: (b1.offset + b1.length) - b2.offset,
479 },
480 false,
481 ),
482 };
483 let (chars, slf_is_left_by_chars) = match c1.offset < c2.offset {
484 true => (
485 Snip {
486 offset: c1.offset,
487 length: (c2.offset + c2.length) - c1.offset,
488 },
489 true,
490 ),
491 false => (
492 Snip {
493 offset: c2.offset,
494 length: (c1.offset + c1.length) - c2.offset,
495 },
496 false,
497 ),
498 };
499 (
500 ().localize(chars, bytes),
501 slf_is_left_by_bytes,
502 slf_is_left_by_chars,
503 )
504}
505
506impl TextToken {
507 pub fn test_token(lt: Local<Token2>) -> TextToken {
508 let (local, token) = lt.into_inner();
509 TextToken {
510 locality: local,
511 original: Some(local.local(())),
512 token,
513 }
514 }
515 pub fn test_new(token: Token2, local: Local<()>, original: Option<Local<()>>) -> TextToken {
516 TextToken {
517 locality: local,
518 original,
519 token,
520 }
521 }
522}
523
524#[derive(Debug)]
551pub enum OriginalError {
552 NoOriginal,
553 InvalidSnip,
554}
555
556#[cfg(feature = "strings")]
564#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord)]
565pub enum Token2 {
566 Word(Word),
567 Struct(Struct),
568 Special(Special),
569 Unicode(Unicode),
570
571 Bound(Bound),
572}
573#[cfg(not(feature = "strings"))]
574#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
575pub enum Token2 {
576 Word(Word),
577 Struct(Struct),
578 Special(Special),
579 Unicode(Unicode),
580
581 Bound(Bound),
582}
583impl From<Token> for Token2 {
584 fn from(t: Token) -> Token2 {
585 match t {
586 Token::Word(w) => Token2::Word(w),
587 Token::Struct(s) => Token2::Struct(s),
588 Token::Special(s) => Token2::Special(s),
589 Token::Unicode(u) => Token2::Unicode(u),
590 }
591 }
592}
593impl Token2 {
594 #[cfg(not(feature = "strings"))]
595 fn try_as_token(&self) -> Result<Token, Bound> {
596 (*self).try_into_token()
597 }
598
599 #[cfg(feature = "strings")]
600 fn try_as_token(&self) -> Result<Token, Bound> {
601 self.clone().try_into_token()
602 }
603
604 fn try_into_token(self) -> Result<Token, Bound> {
605 match self {
606 Token2::Word(w) => Ok(Token::Word(w)),
607 Token2::Struct(s) => Ok(Token::Struct(s)),
608 Token2::Special(s) => Ok(Token::Special(s)),
609 Token2::Unicode(u) => Ok(Token::Unicode(u)),
610 Token2::Bound(b) => Err(b),
611 }
612 }
613}
614#[cfg(test)]
615impl Token2 {
616 fn into_token(self) -> Option<Token> {
617 match self {
618 Token2::Word(w) => Some(Token::Word(w)),
619 Token2::Struct(s) => Some(Token::Struct(s)),
620 Token2::Special(s) => Some(Token::Special(s)),
621 Token2::Unicode(u) => Some(Token::Unicode(u)),
622 Token2::Bound(_) => None,
623 }
624 }
625}
626
627#[cfg(test)]
628#[cfg(not(feature = "strings"))]
629mod test {
630 use super::*;
631 use text_parsing::{
632 IntoPipeParser, IntoSource, Localize, ParserExt, SourceExt, entities, tagger,
633 };
634
635 fn check_results(result: &Vec<Local<Token>>, lib_res: &Vec<Local<Token>>, _uws: &str) {
636 assert_eq!(result.len(), lib_res.len());
637 for i in 0..result.len() {
638 let res: Local<Token> = result[i].clone().into();
639 assert_eq!(res, lib_res[i]);
640 }
641 }
642
643 fn symbols() {
645 let uws = "Сибирь Арене 17 30 от 2560₽ 😀";
646 let lib_res = uws
649 .into_tokenizer(TokenizerParams::v1())
650 .collect::<Vec<_>>();
651 for t in lib_res {
653 println!("{:?}", t);
654 }
655 panic!()
656 }
657}
658
659#[cfg(test)]
660mod test_v0_5 {
661 use super::*;
662 use text_parsing::{IntoPipeParser, IntoSource, ParserExt, SourceExt, entities, tagger};
663
664 fn basic() {
666 let uws = "<p>Oxana Putan shared the quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc.</p><p> qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово</p>";
681 let text = Text::new({
682 uws.into_source()
683 .pipe(tagger::Builder::new().create().into_breaker())
684 .pipe(entities::Builder::new().create().into_piped())
685 .into_separator()
686 })
687 .unwrap();
688 let lib_res = text
689 .into_tokenizer({
690 TokenizerParams::default()
691 .add_option(TokenizerOptions::SplitDot)
692 .add_option(TokenizerOptions::SplitUnderscore)
693 .add_option(TokenizerOptions::SplitColon)
694 .with_default_sentences()
695 })
696 .collect::<Vec<_>>();
697
698 for tok in lib_res {
699 println!(
700 "C{:?}, B{:?}, {:?} -> {:?}",
701 tok.original.map(|loc| loc.chars()),
702 tok.original.map(|loc| loc.bytes()),
703 tok.token,
704 tok.original_str(uws)
705 );
706 }
707
708 panic!()
709 }
710}
711
712#[cfg(test)]
713#[cfg(feature = "strings")]
714mod test {
715 use super::*;
716 use text_parsing::{
717 IntoPipeParser, IntoSource, Localize, ParserExt, SourceExt, entities, tagger,
718 };
719
720 #[allow(dead_code)]
739 fn print_result(lib_res: &Vec<Local<Token>>) {
740 for lt in lib_res {
745 println!("{:?}", lt);
746 }
747 }
748 #[derive(Debug, Clone)]
777 struct CharToken {
778 byte_offset: usize,
779 byte_length: usize,
780 char_offset: usize,
781 char_length: usize,
782 token: Token,
783 }
784 impl Into<Local<Token>> for CharToken {
785 fn into(self) -> Local<Token> {
786 self.token.localize(
787 Snip {
788 offset: self.char_offset,
789 length: self.char_length,
790 },
791 Snip {
792 offset: self.byte_offset,
793 length: self.byte_length,
794 },
795 )
796 }
797 }
798
799 #[derive(Debug, Clone)]
800 struct PositionalToken {
801 source: &'static str,
802 offset: usize,
803 length: usize,
804 token: Token,
805 }
806 impl Into<Local<Token>> for PositionalToken {
807 fn into(self) -> Local<Token> {
808 self.token.localize(
809 Snip {
810 offset: self.source[..self.offset].chars().count(),
811 length: self.source[self.offset..self.offset + self.length]
812 .chars()
813 .count(),
814 },
815 Snip {
816 offset: self.offset,
817 length: self.length,
818 },
819 )
820 }
821 }
822
823 fn check_results(result: &Vec<PositionalToken>, lib_res: &Vec<Local<Token>>, _uws: &str) {
824 assert_eq!(result.len(), lib_res.len());
825 for i in 0..result.len() {
826 let res: Local<Token> = result[i].clone().into();
827 assert_eq!(res, lib_res[i]);
828 }
829 }
830
831 fn check_cresults(result: &Vec<CharToken>, lib_res: &Vec<Local<Token>>, _uws: &str) {
832 assert_eq!(result.len(), lib_res.len());
833 for i in 0..result.len() {
834 let res: Local<Token> = result[i].clone().into();
835 assert_eq!(res, lib_res[i]);
836 }
837 }
838
839 fn check<T: Clone + std::fmt::Debug + Into<Local<Token>>>(
840 res: &Vec<T>,
841 lib: &Vec<Local<Token>>,
842 _uws: &str,
843 ) {
844 let mut lib = lib.iter();
845 let mut res = res.iter().map(|r| {
846 let res: Local<Token> = r.clone().into();
847 res
848 });
849 let mut diff = Vec::new();
850 loop {
851 match (lib.next(), res.next()) {
852 (Some(lw), Some(rw)) => {
853 if *lw != rw {
854 diff.push(format!("LIB: {:?}", lw));
855 diff.push(format!("TEST: {:?}", rw));
856 diff.push("".to_string())
857 }
858 }
859 (Some(lw), None) => {
860 diff.push(format!("LIB: {:?}", lw));
861 diff.push("TEST: ----".to_string());
862 diff.push("".to_string())
863 }
864 (None, Some(rw)) => {
865 diff.push("LIB: ----".to_string());
866 diff.push(format!("TEST: {:?}", rw));
867 diff.push("".to_string())
868 }
869 (None, None) => break,
870 }
871 }
872 if diff.len() > 0 {
873 for ln in &diff {
874 println!("{}", ln);
875 }
876 panic!("Diff count: {}", diff.len() / 3);
877 }
878 }
879
880 #[test]
881 #[rustfmt::skip]
882 fn currency() {
883 let uws = "$ ₽ € ¥";
884 let result = vec![
885 PositionalToken { source: uws, offset: 0, length: 1, token: Token::Special(Special::Currency('$')) },
886 PositionalToken { source: uws, offset: 1, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
887 PositionalToken { source: uws, offset: 2, length: 3, token: Token::Special(Special::Currency('₽')) },
888 PositionalToken { source: uws, offset: 5, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
889 PositionalToken { source: uws, offset: 6, length: 3, token: Token::Special(Special::Currency('€')) },
890 PositionalToken { source: uws, offset: 9, length: 1, token: Token::Special(Special::Separator(Separator::Space)) },
891 PositionalToken { source: uws, offset: 10, length: 2, token: Token::Special(Special::Currency('¥')) },
892 ];
893 let lib_res = uws
894 .into_tokenizer(TokenizerParams::v1())
895 .collect::<Vec<_>>();
896 check_results(&result, &lib_res, uws);
898 }
900
901 #[test]
902 fn spaces() {
903 let uws = " spaces too many apces ";
904 let result = vec![
905 PositionalToken {
906 source: uws,
907 offset: 0,
908 length: 4,
909 token: Token::Special(Special::Separator(Separator::Space)),
910 },
911 PositionalToken {
912 source: uws,
913 offset: 4,
914 length: 6,
915 token: Token::Word(Word::Word("spaces".to_string())),
916 },
917 PositionalToken {
918 source: uws,
919 offset: 10,
920 length: 4,
921 token: Token::Special(Special::Separator(Separator::Space)),
922 },
923 PositionalToken {
924 source: uws,
925 offset: 14,
926 length: 3,
927 token: Token::Word(Word::Word("too".to_string())),
928 },
929 PositionalToken {
930 source: uws,
931 offset: 17,
932 length: 3,
933 token: Token::Special(Special::Separator(Separator::Space)),
934 },
935 PositionalToken {
936 source: uws,
937 offset: 20,
938 length: 4,
939 token: Token::Word(Word::Word("many".to_string())),
940 },
941 PositionalToken {
942 source: uws,
943 offset: 24,
944 length: 3,
945 token: Token::Special(Special::Separator(Separator::Space)),
946 },
947 PositionalToken {
948 source: uws,
949 offset: 27,
950 length: 5,
951 token: Token::Word(Word::Word("apces".to_string())),
952 },
953 PositionalToken {
954 source: uws,
955 offset: 32,
956 length: 3,
957 token: Token::Special(Special::Separator(Separator::Space)),
958 },
959 ];
960 let lib_res = uws
961 .into_tokenizer(TokenizerParams::v1())
962 .collect::<Vec<_>>();
963 check_results(&result, &lib_res, uws);
964 }
966
967 #[test]
968 fn numbers() {
969 let uws = "(() -2\n() -2";
970 let result = vec![
971 PositionalToken {
972 source: uws,
973 offset: 0,
974 length: 1,
975 token: Token::Special(Special::Punctuation('(')),
976 },
977 PositionalToken {
978 source: uws,
979 offset: 1,
980 length: 1,
981 token: Token::Special(Special::Punctuation('(')),
982 },
983 PositionalToken {
984 source: uws,
985 offset: 2,
986 length: 1,
987 token: Token::Special(Special::Punctuation(')')),
988 },
989 PositionalToken {
990 source: uws,
991 offset: 3,
992 length: 1,
993 token: Token::Special(Special::Separator(Separator::Space)),
994 },
995 PositionalToken {
996 source: uws,
997 offset: 4,
998 length: 2,
999 token: Token::Word(Word::Number(Number::Integer(-2))),
1000 },
1001 PositionalToken {
1002 source: uws,
1003 offset: 6,
1004 length: 1,
1005 token: Token::Special(Special::Separator(Separator::Newline)),
1006 },
1007 PositionalToken {
1008 source: uws,
1009 offset: 7,
1010 length: 1,
1011 token: Token::Special(Special::Punctuation('(')),
1012 },
1013 PositionalToken {
1014 source: uws,
1015 offset: 8,
1016 length: 1,
1017 token: Token::Special(Special::Punctuation(')')),
1018 },
1019 PositionalToken {
1020 source: uws,
1021 offset: 9,
1022 length: 2,
1023 token: Token::Special(Special::Separator(Separator::Space)),
1024 },
1025 PositionalToken {
1026 source: uws,
1027 offset: 11,
1028 length: 2,
1029 token: Token::Word(Word::Number(Number::Integer(-2))),
1030 },
1031 ];
1032 let lib_res = uws
1033 .into_tokenizer({
1034 TokenizerParams::default()
1035 .add_option(TokenizerOptions::SplitDot)
1036 .add_option(TokenizerOptions::SplitUnderscore)
1037 .add_option(TokenizerOptions::SplitColon)
1038 .add_option(TokenizerOptions::MergeWhites)
1039 })
1040 .collect::<Vec<_>>();
1041 check_results(&result, &lib_res, uws);
1042 }
1043
1044 #[test]
1045 fn word_with_inner_hyphens() {
1046 let uws = "Опросы показывают";
1047 let result = vec![
1048 PositionalToken {
1049 source: uws,
1050 offset: 0,
1051 length: 14,
1052 token: Token::Word(Word::StrangeWord("Опросы".to_string())),
1053 },
1054 PositionalToken {
1055 source: uws,
1056 offset: 14,
1057 length: 1,
1058 token: Token::Special(Special::Separator(Separator::Space)),
1059 },
1060 PositionalToken {
1061 source: uws,
1062 offset: 15,
1063 length: 28,
1064 token: Token::Word(Word::StrangeWord("показывают".to_string())),
1065 },
1066 ];
1067 let lib_res = uws
1068 .into_tokenizer(TokenizerParams::v1())
1069 .collect::<Vec<_>>();
1070 check_results(&result, &lib_res, uws);
1071 }
1072
1073 #[test]
1074 fn mixed_but_word() {
1075 let uws = "L’Oreal";
1076 let result = vec![PositionalToken {
1077 source: uws,
1078 offset: 0,
1079 length: 9,
1080 token: Token::Word(Word::StrangeWord("L’Oreal".to_string())),
1081 }];
1082 let lib_res = uws
1083 .into_tokenizer(TokenizerParams::v1())
1084 .collect::<Vec<_>>();
1085 check_results(&result, &lib_res, uws);
1086 }
1087
1088 #[test]
1089 fn hashtags() {
1090 let uws = "#hashtag#hashtag2";
1091 let result = vec![
1092 PositionalToken {
1093 source: uws,
1094 offset: 0,
1095 length: 8,
1096 token: Token::Struct(Struct::Hashtag("hashtag".to_string())),
1097 },
1098 PositionalToken {
1099 source: uws,
1100 offset: 8,
1101 length: 9,
1102 token: Token::Struct(Struct::Hashtag("hashtag2".to_string())),
1103 },
1104 ];
1131 let lib_res = uws
1132 .into_tokenizer(TokenizerParams::v1())
1133 .collect::<Vec<_>>();
1134 check_results(&result, &lib_res, uws);
1135 }
1136
1137 #[test]
1138 fn hashtags2() {
1139 let uws = "#hashtag#hashtag2 #hash_tag";
1140 let result = vec![
1141 PositionalToken {
1142 source: uws,
1143 offset: 0,
1144 length: 8,
1145 token: Token::Struct(Struct::Hashtag("hashtag".to_string())),
1146 },
1147 PositionalToken {
1148 source: uws,
1149 offset: 8,
1150 length: 9,
1151 token: Token::Struct(Struct::Hashtag("hashtag2".to_string())),
1152 },
1153 PositionalToken {
1154 source: uws,
1155 offset: 17,
1156 length: 1,
1157 token: Token::Special(Special::Separator(Separator::Space)),
1158 },
1159 PositionalToken {
1160 source: uws,
1161 offset: 18,
1162 length: 9,
1163 token: Token::Struct(Struct::Hashtag("hash_tag".to_string())),
1164 },
1165 ];
1166 let lib_res = uws
1167 .into_tokenizer(TokenizerParams::v1().add_option(TokenizerOptions::StructTokens))
1168 .collect::<Vec<_>>();
1169 check_results(&result, &lib_res, uws);
1170 }
1171
1172 #[test]
1173 fn mention2() {
1174 let uws = "@hashtag@hashtag2 @hash_tag";
1175 let result = vec![
1176 PositionalToken {
1177 source: uws,
1178 offset: 0,
1179 length: 8,
1180 token: Token::Struct(Struct::Mention("hashtag".to_string())),
1181 },
1182 PositionalToken {
1183 source: uws,
1184 offset: 8,
1185 length: 9,
1186 token: Token::Struct(Struct::Mention("hashtag2".to_string())),
1187 },
1188 PositionalToken {
1189 source: uws,
1190 offset: 17,
1191 length: 1,
1192 token: Token::Special(Special::Separator(Separator::Space)),
1193 },
1194 PositionalToken {
1195 source: uws,
1196 offset: 18,
1197 length: 9,
1198 token: Token::Struct(Struct::Mention("hash_tag".to_string())),
1199 },
1200 ];
1201 let lib_res = uws
1202 .into_tokenizer(TokenizerParams::v1().add_option(TokenizerOptions::StructTokens))
1203 .collect::<Vec<_>>();
1204 check_results(&result, &lib_res, uws);
1205 }
1206
1207 #[test]
1208 fn apostrophe() {
1209 let uws = "l'oreal; l\u{0060}oreal";
1210 let result = vec![
1211 PositionalToken {
1212 source: uws,
1213 offset: 0,
1214 length: 7,
1215 token: Token::Word(Word::Word("l'oreal".to_string())),
1216 },
1217 PositionalToken {
1218 source: uws,
1219 offset: 7,
1220 length: 1,
1221 token: Token::Special(Special::Punctuation(';')),
1222 },
1223 PositionalToken {
1224 source: uws,
1225 offset: 8,
1226 length: 1,
1227 token: Token::Special(Special::Separator(Separator::Space)),
1228 },
1229 PositionalToken {
1230 source: uws,
1231 offset: 9,
1232 length: 7,
1233 token: Token::Word(Word::Word("l'oreal".to_string())),
1234 },
1235 ];
1236 let text = Text::new(uws.into_source()).unwrap();
1237 let lib_res = text
1238 .into_tokenizer(TokenizerParams::v1())
1239 .filter_map(|tt| tt.into_original_token_1())
1240 .collect::<Vec<_>>();
1241 check_results(&result, &lib_res, uws);
1242 }
1243
1244 #[test]
1245 fn char_tokens() {
1246 let uws = "[Oxana Putan|1712640565] shared the quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n+Done! Готово";
1247 let result = vec![
1248 CharToken {
1249 byte_offset: 0,
1250 byte_length: 1,
1251 char_offset: 0,
1252 char_length: 1,
1253 token: Token::Special(Special::Punctuation('[')),
1254 },
1255 CharToken {
1256 byte_offset: 1,
1257 byte_length: 5,
1258 char_offset: 1,
1259 char_length: 5,
1260 token: Token::Word(Word::Word("Oxana".to_string())),
1261 },
1262 CharToken {
1263 byte_offset: 6,
1264 byte_length: 1,
1265 char_offset: 6,
1266 char_length: 1,
1267 token: Token::Special(Special::Separator(Separator::Space)),
1268 },
1269 CharToken {
1270 byte_offset: 7,
1271 byte_length: 5,
1272 char_offset: 7,
1273 char_length: 5,
1274 token: Token::Word(Word::Word("Putan".to_string())),
1275 },
1276 CharToken {
1277 byte_offset: 12,
1278 byte_length: 1,
1279 char_offset: 12,
1280 char_length: 1,
1281 token: Token::Special(Special::Punctuation('|')),
1282 },
1283 CharToken {
1284 byte_offset: 13,
1285 byte_length: 10,
1286 char_offset: 13,
1287 char_length: 10,
1288 token: Token::Word(Word::Number(Number::Integer(1712640565))),
1289 },
1290 CharToken {
1291 byte_offset: 23,
1292 byte_length: 1,
1293 char_offset: 23,
1294 char_length: 1,
1295 token: Token::Special(Special::Punctuation(']')),
1296 },
1297 CharToken {
1305 byte_offset: 24,
1306 byte_length: 1,
1307 char_offset: 24,
1308 char_length: 1,
1309 token: Token::Special(Special::Separator(Separator::Space)),
1310 },
1311 CharToken {
1312 byte_offset: 25,
1313 byte_length: 6,
1314 char_offset: 25,
1315 char_length: 6,
1316 token: Token::Word(Word::Word("shared".to_string())),
1317 },
1318 CharToken {
1319 byte_offset: 31,
1320 byte_length: 1,
1321 char_offset: 31,
1322 char_length: 1,
1323 token: Token::Special(Special::Separator(Separator::Space)),
1324 },
1325 CharToken {
1326 byte_offset: 32,
1327 byte_length: 3,
1328 char_offset: 32,
1329 char_length: 3,
1330 token: Token::Word(Word::Word("the".to_string())),
1331 },
1332 CharToken {
1333 byte_offset: 35,
1334 byte_length: 1,
1335 char_offset: 35,
1336 char_length: 1,
1337 token: Token::Special(Special::Separator(Separator::Space)),
1338 },
1339 CharToken {
1340 byte_offset: 36,
1341 byte_length: 5,
1342 char_offset: 36,
1343 char_length: 5,
1344 token: Token::Word(Word::Word("quick".to_string())),
1345 },
1346 CharToken {
1347 byte_offset: 41,
1348 byte_length: 1,
1349 char_offset: 41,
1350 char_length: 1,
1351 token: Token::Special(Special::Separator(Separator::Space)),
1352 },
1353 CharToken {
1354 byte_offset: 42,
1355 byte_length: 1,
1356 char_offset: 42,
1357 char_length: 1,
1358 token: Token::Special(Special::Punctuation('(')),
1359 },
1360 CharToken {
1361 byte_offset: 43,
1362 byte_length: 1,
1363 char_offset: 43,
1364 char_length: 1,
1365 token: Token::Special(Special::Punctuation('"')),
1366 },
1367 CharToken {
1368 byte_offset: 44,
1369 byte_length: 5,
1370 char_offset: 44,
1371 char_length: 5,
1372 token: Token::Word(Word::Word("brown".to_string())),
1373 },
1374 CharToken {
1375 byte_offset: 49,
1376 byte_length: 1,
1377 char_offset: 49,
1378 char_length: 1,
1379 token: Token::Special(Special::Punctuation('"')),
1380 },
1381 CharToken {
1382 byte_offset: 50,
1383 byte_length: 1,
1384 char_offset: 50,
1385 char_length: 1,
1386 token: Token::Special(Special::Punctuation(')')),
1387 },
1388 CharToken {
1389 byte_offset: 51,
1390 byte_length: 1,
1391 char_offset: 51,
1392 char_length: 1,
1393 token: Token::Special(Special::Separator(Separator::Space)),
1394 },
1395 CharToken {
1396 byte_offset: 52,
1397 byte_length: 3,
1398 char_offset: 52,
1399 char_length: 3,
1400 token: Token::Word(Word::Word("fox".to_string())),
1401 },
1402 CharToken {
1403 byte_offset: 55,
1404 byte_length: 1,
1405 char_offset: 55,
1406 char_length: 1,
1407 token: Token::Special(Special::Separator(Separator::Space)),
1408 },
1409 CharToken {
1410 byte_offset: 56,
1411 byte_length: 5,
1412 char_offset: 56,
1413 char_length: 5,
1414 token: Token::Word(Word::Word("can\'t".to_string())),
1415 },
1416 CharToken {
1417 byte_offset: 61,
1418 byte_length: 1,
1419 char_offset: 61,
1420 char_length: 1,
1421 token: Token::Special(Special::Separator(Separator::Space)),
1422 },
1423 CharToken {
1424 byte_offset: 62,
1425 byte_length: 4,
1426 char_offset: 62,
1427 char_length: 4,
1428 token: Token::Word(Word::Word("jump".to_string())),
1429 },
1430 CharToken {
1431 byte_offset: 66,
1432 byte_length: 1,
1433 char_offset: 66,
1434 char_length: 1,
1435 token: Token::Special(Special::Separator(Separator::Space)),
1436 },
1437 CharToken {
1438 byte_offset: 67,
1439 byte_length: 4,
1440 char_offset: 67,
1441 char_length: 4,
1442 token: Token::Word(Word::Number(Number::Float(32.3))),
1443 },
1444 CharToken {
1445 byte_offset: 71,
1446 byte_length: 1,
1447 char_offset: 71,
1448 char_length: 1,
1449 token: Token::Special(Special::Separator(Separator::Space)),
1450 },
1451 CharToken {
1452 byte_offset: 72,
1453 byte_length: 4,
1454 char_offset: 72,
1455 char_length: 4,
1456 token: Token::Word(Word::Word("feet".to_string())),
1457 },
1458 CharToken {
1459 byte_offset: 76,
1460 byte_length: 1,
1461 char_offset: 76,
1462 char_length: 1,
1463 token: Token::Special(Special::Punctuation(',')),
1464 },
1465 CharToken {
1466 byte_offset: 77,
1467 byte_length: 1,
1468 char_offset: 77,
1469 char_length: 1,
1470 token: Token::Special(Special::Separator(Separator::Space)),
1471 },
1472 CharToken {
1473 byte_offset: 78,
1474 byte_length: 5,
1475 char_offset: 78,
1476 char_length: 5,
1477 token: Token::Word(Word::Word("right".to_string())),
1478 },
1479 CharToken {
1480 byte_offset: 83,
1481 byte_length: 1,
1482 char_offset: 83,
1483 char_length: 1,
1484 token: Token::Special(Special::Punctuation('?')),
1485 },
1486 CharToken {
1487 byte_offset: 84,
1488 byte_length: 1,
1489 char_offset: 84,
1490 char_length: 1,
1491 token: Token::Special(Special::Separator(Separator::Space)),
1492 },
1493 CharToken {
1494 byte_offset: 85,
1495 byte_length: 4,
1496 char_offset: 85,
1497 char_length: 4,
1498 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
1499 },
1500 CharToken {
1501 byte_offset: 89,
1502 byte_length: 1,
1503 char_offset: 89,
1504 char_length: 1,
1505 token: Token::Special(Special::Separator(Separator::Space)),
1506 },
1507 CharToken {
1508 byte_offset: 90,
1509 byte_length: 3,
1510 char_offset: 90,
1511 char_length: 3,
1512 token: Token::Word(Word::Word("etc".to_string())),
1513 },
1514 CharToken {
1515 byte_offset: 93,
1516 byte_length: 1,
1517 char_offset: 93,
1518 char_length: 1,
1519 token: Token::Special(Special::Punctuation('.')),
1520 },
1521 CharToken {
1522 byte_offset: 94,
1523 byte_length: 1,
1524 char_offset: 94,
1525 char_length: 1,
1526 token: Token::Special(Special::Separator(Separator::Space)),
1527 },
1528 CharToken {
1529 byte_offset: 95,
1530 byte_length: 3,
1531 char_offset: 95,
1532 char_length: 3,
1533 token: Token::Word(Word::Word("qeq".to_string())),
1534 },
1535 CharToken {
1536 byte_offset: 98,
1537 byte_length: 1,
1538 char_offset: 98,
1539 char_length: 1,
1540 token: Token::Special(Special::Separator(Separator::Space)),
1541 },
1542 CharToken {
1543 byte_offset: 99,
1544 byte_length: 5,
1545 char_offset: 99,
1546 char_length: 5,
1547 token: Token::Word(Word::Word("U.S.A".to_string())),
1548 },
1549 CharToken {
1550 byte_offset: 104,
1551 byte_length: 2,
1552 char_offset: 104,
1553 char_length: 2,
1554 token: Token::Special(Special::Separator(Separator::Space)),
1555 },
1556 CharToken {
1557 byte_offset: 106,
1558 byte_length: 3,
1559 char_offset: 106,
1560 char_length: 3,
1561 token: Token::Word(Word::Word("asd".to_string())),
1562 },
1563 CharToken {
1564 byte_offset: 109,
1565 byte_length: 3,
1566 char_offset: 109,
1567 char_length: 3,
1568 token: Token::Special(Special::Separator(Separator::Newline)),
1569 },
1570 CharToken {
1571 byte_offset: 112,
1572 byte_length: 3,
1573 char_offset: 112,
1574 char_length: 3,
1575 token: Token::Word(Word::Word("Brr".to_string())),
1576 },
1577 CharToken {
1578 byte_offset: 115,
1579 byte_length: 1,
1580 char_offset: 115,
1581 char_length: 1,
1582 token: Token::Special(Special::Punctuation(',')),
1583 },
1584 CharToken {
1585 byte_offset: 116,
1586 byte_length: 1,
1587 char_offset: 116,
1588 char_length: 1,
1589 token: Token::Special(Special::Separator(Separator::Space)),
1590 },
1591 CharToken {
1592 byte_offset: 117,
1593 byte_length: 4,
1594 char_offset: 117,
1595 char_length: 4,
1596 token: Token::Word(Word::Word("it\'s".to_string())),
1597 },
1598 CharToken {
1599 byte_offset: 121,
1600 byte_length: 1,
1601 char_offset: 121,
1602 char_length: 1,
1603 token: Token::Special(Special::Separator(Separator::Space)),
1604 },
1605 CharToken {
1606 byte_offset: 122,
1607 byte_length: 4,
1608 char_offset: 122,
1609 char_length: 4,
1610 token: Token::Word(Word::Number(Number::Float(29.3))),
1611 },
1612 CharToken {
1613 byte_offset: 126,
1614 byte_length: 2,
1615 char_offset: 126,
1616 char_length: 1,
1617 token: Token::Special(Special::Symbol('°')),
1618 },
1619 CharToken {
1620 byte_offset: 128,
1621 byte_length: 1,
1622 char_offset: 127,
1623 char_length: 1,
1624 token: Token::Word(Word::Word("F".to_string())),
1625 },
1626 CharToken {
1627 byte_offset: 129,
1628 byte_length: 1,
1629 char_offset: 128,
1630 char_length: 1,
1631 token: Token::Special(Special::Punctuation('!')),
1632 },
1633 CharToken {
1634 byte_offset: 130,
1635 byte_length: 1,
1636 char_offset: 129,
1637 char_length: 1,
1638 token: Token::Special(Special::Separator(Separator::Newline)),
1639 },
1640 CharToken {
1641 byte_offset: 131,
1642 byte_length: 1,
1643 char_offset: 130,
1644 char_length: 1,
1645 token: Token::Special(Special::Separator(Separator::Space)),
1646 },
1647 CharToken {
1648 byte_offset: 132,
1649 byte_length: 14,
1650 char_offset: 131,
1651 char_length: 7,
1652 token: Token::Word(Word::Word("Русское".to_string())),
1653 },
1654 CharToken {
1655 byte_offset: 146,
1656 byte_length: 1,
1657 char_offset: 138,
1658 char_length: 1,
1659 token: Token::Special(Special::Separator(Separator::Space)),
1660 },
1661 CharToken {
1662 byte_offset: 147,
1663 byte_length: 22,
1664 char_offset: 139,
1665 char_length: 11,
1666 token: Token::Word(Word::Word("предложение".to_string())),
1667 },
1668 CharToken {
1669 byte_offset: 169,
1670 byte_length: 1,
1671 char_offset: 150,
1672 char_length: 1,
1673 token: Token::Special(Special::Separator(Separator::Space)),
1674 },
1675 CharToken {
1676 byte_offset: 170,
1677 byte_length: 5,
1678 char_offset: 151,
1679 char_length: 5,
1680 token: Token::Struct(Struct::Hashtag("36.6".to_string())),
1681 },
1682 CharToken {
1683 byte_offset: 175,
1684 byte_length: 1,
1685 char_offset: 156,
1686 char_length: 1,
1687 token: Token::Special(Special::Separator(Separator::Space)),
1688 },
1689 CharToken {
1690 byte_offset: 176,
1691 byte_length: 6,
1692 char_offset: 157,
1693 char_length: 3,
1694 token: Token::Word(Word::Word("для".to_string())),
1695 },
1696 CharToken {
1697 byte_offset: 182,
1698 byte_length: 1,
1699 char_offset: 160,
1700 char_length: 1,
1701 token: Token::Special(Special::Separator(Separator::Space)),
1702 },
1703 CharToken {
1704 byte_offset: 183,
1705 byte_length: 24,
1706 char_offset: 161,
1707 char_length: 12,
1708 token: Token::Word(Word::Word("тестирования".to_string())),
1709 },
1710 CharToken {
1711 byte_offset: 207,
1712 byte_length: 1,
1713 char_offset: 173,
1714 char_length: 1,
1715 token: Token::Special(Special::Separator(Separator::Space)),
1716 },
1717 CharToken {
1718 byte_offset: 208,
1719 byte_length: 14,
1720 char_offset: 174,
1721 char_length: 7,
1722 token: Token::Word(Word::Word("деления".to_string())),
1723 },
1724 CharToken {
1725 byte_offset: 222,
1726 byte_length: 1,
1727 char_offset: 181,
1728 char_length: 1,
1729 token: Token::Special(Special::Separator(Separator::Space)),
1730 },
1731 CharToken {
1732 byte_offset: 223,
1733 byte_length: 4,
1734 char_offset: 182,
1735 char_length: 2,
1736 token: Token::Word(Word::Word("по".to_string())),
1737 },
1738 CharToken {
1739 byte_offset: 227,
1740 byte_length: 1,
1741 char_offset: 184,
1742 char_length: 1,
1743 token: Token::Special(Special::Separator(Separator::Space)),
1744 },
1745 CharToken {
1746 byte_offset: 228,
1747 byte_length: 12,
1748 char_offset: 185,
1749 char_length: 6,
1750 token: Token::Word(Word::Word("юникод".to_string())),
1751 },
1752 CharToken {
1753 byte_offset: 240,
1754 byte_length: 1,
1755 char_offset: 191,
1756 char_length: 1,
1757 token: Token::Special(Special::Punctuation('-')),
1758 },
1759 CharToken {
1760 byte_offset: 241,
1761 byte_length: 12,
1762 char_offset: 192,
1763 char_length: 6,
1764 token: Token::Word(Word::Word("словам".to_string())),
1765 },
1766 CharToken {
1767 byte_offset: 253,
1768 byte_length: 3,
1769 char_offset: 198,
1770 char_length: 3,
1771 token: Token::Special(Special::Punctuation('.')),
1772 },
1773 CharToken {
1774 byte_offset: 256,
1775 byte_length: 1,
1776 char_offset: 201,
1777 char_length: 1,
1778 token: Token::Special(Special::Separator(Separator::Newline)),
1779 },
1780 CharToken {
1781 byte_offset: 257,
1782 byte_length: 8,
1783 char_offset: 202,
1784 char_length: 2,
1785 token: Token::Word(Word::Emoji("russia")),
1786 },
1787 CharToken {
1788 byte_offset: 265,
1789 byte_length: 1,
1790 char_offset: 204,
1791 char_length: 1,
1792 token: Token::Special(Special::Separator(Separator::Space)),
1793 },
1794 CharToken {
1795 byte_offset: 266,
1796 byte_length: 8,
1797 char_offset: 205,
1798 char_length: 2,
1799 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
1800 },
1801 CharToken {
1802 byte_offset: 274,
1803 byte_length: 1,
1804 char_offset: 207,
1805 char_length: 1,
1806 token: Token::Special(Special::Separator(Separator::Newline)),
1807 },
1808 CharToken {
1809 byte_offset: 275,
1810 byte_length: 8,
1811 char_offset: 208,
1812 char_length: 2,
1813 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
1814 },
1815 CharToken {
1816 byte_offset: 283,
1817 byte_length: 8,
1818 char_offset: 210,
1819 char_length: 2,
1820 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
1821 },
1822 CharToken {
1823 byte_offset: 291,
1824 byte_length: 8,
1825 char_offset: 212,
1826 char_length: 2,
1827 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
1828 },
1829 CharToken {
1830 byte_offset: 299,
1831 byte_length: 1,
1832 char_offset: 214,
1833 char_length: 1,
1834 token: Token::Special(Special::Separator(Separator::Newline)),
1835 },
1836 CharToken {
1837 byte_offset: 300,
1838 byte_length: 1,
1839 char_offset: 215,
1840 char_length: 1,
1841 token: Token::Special(Special::Punctuation('+')),
1842 },
1843 CharToken {
1844 byte_offset: 301,
1845 byte_length: 4,
1846 char_offset: 216,
1847 char_length: 4,
1848 token: Token::Word(Word::Word("Done".to_string())),
1849 },
1850 CharToken {
1851 byte_offset: 305,
1852 byte_length: 1,
1853 char_offset: 220,
1854 char_length: 1,
1855 token: Token::Special(Special::Punctuation('!')),
1856 },
1857 CharToken {
1858 byte_offset: 306,
1859 byte_length: 1,
1860 char_offset: 221,
1861 char_length: 1,
1862 token: Token::Special(Special::Separator(Separator::Space)),
1863 },
1864 CharToken {
1865 byte_offset: 307,
1866 byte_length: 12,
1867 char_offset: 222,
1868 char_length: 6,
1869 token: Token::Word(Word::Word("Готово".to_string())),
1870 },
1871 ];
1872
1873 let lib_res = uws
1874 .into_tokenizer(TokenizerParams::complex())
1875 .collect::<Vec<_>>();
1876
1877 check_cresults(&result, &lib_res, uws);
1879 }
1880
1881 #[test]
1882 fn general_default() {
1883 let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
1884 let result = vec![
1885 PositionalToken {
1886 source: uws,
1887 offset: 0,
1888 length: 3,
1889 token: Token::Word(Word::Word("The".to_string())),
1890 },
1891 PositionalToken {
1892 source: uws,
1893 offset: 3,
1894 length: 1,
1895 token: Token::Special(Special::Separator(Separator::Space)),
1896 },
1897 PositionalToken {
1898 source: uws,
1899 offset: 4,
1900 length: 5,
1901 token: Token::Word(Word::Word("quick".to_string())),
1902 },
1903 PositionalToken {
1904 source: uws,
1905 offset: 9,
1906 length: 1,
1907 token: Token::Special(Special::Separator(Separator::Space)),
1908 },
1909 PositionalToken {
1910 source: uws,
1911 offset: 10,
1912 length: 1,
1913 token: Token::Special(Special::Punctuation('(')),
1914 },
1915 PositionalToken {
1916 source: uws,
1917 offset: 11,
1918 length: 1,
1919 token: Token::Special(Special::Punctuation('"')),
1920 },
1921 PositionalToken {
1922 source: uws,
1923 offset: 12,
1924 length: 5,
1925 token: Token::Word(Word::Word("brown".to_string())),
1926 },
1927 PositionalToken {
1928 source: uws,
1929 offset: 17,
1930 length: 1,
1931 token: Token::Special(Special::Punctuation('"')),
1932 },
1933 PositionalToken {
1934 source: uws,
1935 offset: 18,
1936 length: 1,
1937 token: Token::Special(Special::Punctuation(')')),
1938 },
1939 PositionalToken {
1940 source: uws,
1941 offset: 19,
1942 length: 1,
1943 token: Token::Special(Special::Separator(Separator::Space)),
1944 },
1945 PositionalToken {
1946 source: uws,
1947 offset: 20,
1948 length: 3,
1949 token: Token::Word(Word::Word("fox".to_string())),
1950 },
1951 PositionalToken {
1952 source: uws,
1953 offset: 23,
1954 length: 1,
1955 token: Token::Special(Special::Separator(Separator::Space)),
1956 },
1957 PositionalToken {
1958 source: uws,
1959 offset: 24,
1960 length: 5,
1961 token: Token::Word(Word::Word("can\'t".to_string())),
1962 },
1963 PositionalToken {
1964 source: uws,
1965 offset: 29,
1966 length: 1,
1967 token: Token::Special(Special::Separator(Separator::Space)),
1968 },
1969 PositionalToken {
1970 source: uws,
1971 offset: 30,
1972 length: 4,
1973 token: Token::Word(Word::Word("jump".to_string())),
1974 },
1975 PositionalToken {
1976 source: uws,
1977 offset: 34,
1978 length: 1,
1979 token: Token::Special(Special::Separator(Separator::Space)),
1980 },
1981 PositionalToken {
1982 source: uws,
1983 offset: 35,
1984 length: 4,
1985 token: Token::Word(Word::Number(Number::Float(32.3))),
1986 },
1987 PositionalToken {
1988 source: uws,
1989 offset: 39,
1990 length: 1,
1991 token: Token::Special(Special::Separator(Separator::Space)),
1992 },
1993 PositionalToken {
1994 source: uws,
1995 offset: 40,
1996 length: 4,
1997 token: Token::Word(Word::Word("feet".to_string())),
1998 },
1999 PositionalToken {
2000 source: uws,
2001 offset: 44,
2002 length: 1,
2003 token: Token::Special(Special::Punctuation(',')),
2004 },
2005 PositionalToken {
2006 source: uws,
2007 offset: 45,
2008 length: 1,
2009 token: Token::Special(Special::Separator(Separator::Space)),
2010 },
2011 PositionalToken {
2012 source: uws,
2013 offset: 46,
2014 length: 5,
2015 token: Token::Word(Word::Word("right".to_string())),
2016 },
2017 PositionalToken {
2018 source: uws,
2019 offset: 51,
2020 length: 1,
2021 token: Token::Special(Special::Punctuation('?')),
2022 },
2023 PositionalToken {
2024 source: uws,
2025 offset: 52,
2026 length: 1,
2027 token: Token::Special(Special::Separator(Separator::Space)),
2028 },
2029 PositionalToken {
2030 source: uws,
2031 offset: 53,
2032 length: 4,
2033 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
2034 }, PositionalToken {
2036 source: uws,
2037 offset: 57,
2038 length: 1,
2039 token: Token::Special(Special::Separator(Separator::Space)),
2040 },
2041 PositionalToken {
2042 source: uws,
2043 offset: 58,
2044 length: 3,
2045 token: Token::Word(Word::Word("etc".to_string())),
2046 },
2047 PositionalToken {
2048 source: uws,
2049 offset: 61,
2050 length: 1,
2051 token: Token::Special(Special::Punctuation('.')),
2052 },
2053 PositionalToken {
2054 source: uws,
2055 offset: 62,
2056 length: 1,
2057 token: Token::Special(Special::Separator(Separator::Space)),
2058 },
2059 PositionalToken {
2060 source: uws,
2061 offset: 63,
2062 length: 3,
2063 token: Token::Word(Word::Word("qeq".to_string())),
2064 },
2065 PositionalToken {
2066 source: uws,
2067 offset: 66,
2068 length: 1,
2069 token: Token::Special(Special::Separator(Separator::Space)),
2070 },
2071 PositionalToken {
2072 source: uws,
2073 offset: 67,
2074 length: 1,
2075 token: Token::Word(Word::Word("U".to_string())),
2076 },
2077 PositionalToken {
2078 source: uws,
2079 offset: 68,
2080 length: 1,
2081 token: Token::Special(Special::Punctuation('.')),
2082 },
2083 PositionalToken {
2084 source: uws,
2085 offset: 69,
2086 length: 1,
2087 token: Token::Word(Word::Word("S".to_string())),
2088 },
2089 PositionalToken {
2090 source: uws,
2091 offset: 70,
2092 length: 1,
2093 token: Token::Special(Special::Punctuation('.')),
2094 },
2095 PositionalToken {
2096 source: uws,
2097 offset: 71,
2098 length: 1,
2099 token: Token::Word(Word::Word("A".to_string())),
2100 },
2101 PositionalToken {
2102 source: uws,
2103 offset: 72,
2104 length: 2,
2105 token: Token::Special(Special::Separator(Separator::Space)),
2106 },
2107 PositionalToken {
2108 source: uws,
2109 offset: 74,
2110 length: 3,
2111 token: Token::Word(Word::Word("asd".to_string())),
2112 },
2113 PositionalToken {
2114 source: uws,
2115 offset: 77,
2116 length: 3,
2117 token: Token::Special(Special::Separator(Separator::Newline)),
2118 },
2119 PositionalToken {
2120 source: uws,
2121 offset: 80,
2122 length: 3,
2123 token: Token::Word(Word::Word("Brr".to_string())),
2124 },
2125 PositionalToken {
2126 source: uws,
2127 offset: 83,
2128 length: 1,
2129 token: Token::Special(Special::Punctuation(',')),
2130 },
2131 PositionalToken {
2132 source: uws,
2133 offset: 84,
2134 length: 1,
2135 token: Token::Special(Special::Separator(Separator::Space)),
2136 },
2137 PositionalToken {
2138 source: uws,
2139 offset: 85,
2140 length: 4,
2141 token: Token::Word(Word::Word("it\'s".to_string())),
2142 },
2143 PositionalToken {
2144 source: uws,
2145 offset: 89,
2146 length: 1,
2147 token: Token::Special(Special::Separator(Separator::Space)),
2148 },
2149 PositionalToken {
2150 source: uws,
2151 offset: 90,
2152 length: 4,
2153 token: Token::Word(Word::Number(Number::Float(29.3))),
2154 },
2155 PositionalToken {
2156 source: uws,
2157 offset: 94,
2158 length: 2,
2159 token: Token::Special(Special::Symbol('°')),
2160 },
2161 PositionalToken {
2162 source: uws,
2163 offset: 96,
2164 length: 1,
2165 token: Token::Word(Word::Word("F".to_string())),
2166 },
2167 PositionalToken {
2168 source: uws,
2169 offset: 97,
2170 length: 1,
2171 token: Token::Special(Special::Punctuation('!')),
2172 },
2173 PositionalToken {
2174 source: uws,
2175 offset: 98,
2176 length: 1,
2177 token: Token::Special(Special::Separator(Separator::Newline)),
2178 },
2179 PositionalToken {
2180 source: uws,
2181 offset: 99,
2182 length: 1,
2183 token: Token::Special(Special::Separator(Separator::Space)),
2184 },
2185 PositionalToken {
2186 source: uws,
2187 offset: 100,
2188 length: 14,
2189 token: Token::Word(Word::Word("Русское".to_string())),
2190 },
2191 PositionalToken {
2192 source: uws,
2193 offset: 114,
2194 length: 1,
2195 token: Token::Special(Special::Separator(Separator::Space)),
2196 },
2197 PositionalToken {
2198 source: uws,
2199 offset: 115,
2200 length: 22,
2201 token: Token::Word(Word::Word("предложение".to_string())),
2202 },
2203 PositionalToken {
2204 source: uws,
2205 offset: 137,
2206 length: 1,
2207 token: Token::Special(Special::Separator(Separator::Space)),
2208 },
2209 PositionalToken {
2210 source: uws,
2211 offset: 138,
2212 length: 5,
2213 token: Token::Struct(Struct::Hashtag("36.6".to_string())),
2214 },
2215 PositionalToken {
2228 source: uws,
2229 offset: 143,
2230 length: 1,
2231 token: Token::Special(Special::Separator(Separator::Space)),
2232 },
2233 PositionalToken {
2234 source: uws,
2235 offset: 144,
2236 length: 6,
2237 token: Token::Word(Word::Word("для".to_string())),
2238 },
2239 PositionalToken {
2240 source: uws,
2241 offset: 150,
2242 length: 1,
2243 token: Token::Special(Special::Separator(Separator::Space)),
2244 },
2245 PositionalToken {
2246 source: uws,
2247 offset: 151,
2248 length: 24,
2249 token: Token::Word(Word::Word("тестирования".to_string())),
2250 },
2251 PositionalToken {
2252 source: uws,
2253 offset: 175,
2254 length: 1,
2255 token: Token::Special(Special::Separator(Separator::Space)),
2256 },
2257 PositionalToken {
2258 source: uws,
2259 offset: 176,
2260 length: 14,
2261 token: Token::Word(Word::Word("деления".to_string())),
2262 },
2263 PositionalToken {
2264 source: uws,
2265 offset: 190,
2266 length: 1,
2267 token: Token::Special(Special::Separator(Separator::Space)),
2268 },
2269 PositionalToken {
2270 source: uws,
2271 offset: 191,
2272 length: 4,
2273 token: Token::Word(Word::Word("по".to_string())),
2274 },
2275 PositionalToken {
2276 source: uws,
2277 offset: 195,
2278 length: 1,
2279 token: Token::Special(Special::Separator(Separator::Space)),
2280 },
2281 PositionalToken {
2282 source: uws,
2283 offset: 196,
2284 length: 12,
2285 token: Token::Word(Word::Word("юникод".to_string())),
2286 },
2287 PositionalToken {
2288 source: uws,
2289 offset: 208,
2290 length: 1,
2291 token: Token::Special(Special::Punctuation('-')),
2292 },
2293 PositionalToken {
2294 source: uws,
2295 offset: 209,
2296 length: 12,
2297 token: Token::Word(Word::Word("словам".to_string())),
2298 },
2299 PositionalToken {
2300 source: uws,
2301 offset: 221,
2302 length: 3,
2303 token: Token::Special(Special::Punctuation('.')),
2304 },
2305 PositionalToken {
2306 source: uws,
2307 offset: 224,
2308 length: 1,
2309 token: Token::Special(Special::Separator(Separator::Newline)),
2310 },
2311 ];
2312 let lib_res = uws
2313 .into_tokenizer(TokenizerParams::v1())
2314 .collect::<Vec<_>>();
2315 check_results(&result, &lib_res, uws);
2316 }
2317
2318 #[test]
2319 fn general_no_split() {
2320 let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
2321 let result = vec![
2322 PositionalToken {
2323 source: uws,
2324 offset: 0,
2325 length: 3,
2326 token: Token::Word(Word::Word("The".to_string())),
2327 },
2328 PositionalToken {
2329 source: uws,
2330 offset: 3,
2331 length: 1,
2332 token: Token::Special(Special::Separator(Separator::Space)),
2333 },
2334 PositionalToken {
2335 source: uws,
2336 offset: 4,
2337 length: 5,
2338 token: Token::Word(Word::Word("quick".to_string())),
2339 },
2340 PositionalToken {
2341 source: uws,
2342 offset: 9,
2343 length: 1,
2344 token: Token::Special(Special::Separator(Separator::Space)),
2345 },
2346 PositionalToken {
2347 source: uws,
2348 offset: 10,
2349 length: 1,
2350 token: Token::Special(Special::Punctuation('(')),
2351 },
2352 PositionalToken {
2353 source: uws,
2354 offset: 11,
2355 length: 1,
2356 token: Token::Special(Special::Punctuation('"')),
2357 },
2358 PositionalToken {
2359 source: uws,
2360 offset: 12,
2361 length: 5,
2362 token: Token::Word(Word::Word("brown".to_string())),
2363 },
2364 PositionalToken {
2365 source: uws,
2366 offset: 17,
2367 length: 1,
2368 token: Token::Special(Special::Punctuation('"')),
2369 },
2370 PositionalToken {
2371 source: uws,
2372 offset: 18,
2373 length: 1,
2374 token: Token::Special(Special::Punctuation(')')),
2375 },
2376 PositionalToken {
2377 source: uws,
2378 offset: 19,
2379 length: 1,
2380 token: Token::Special(Special::Separator(Separator::Space)),
2381 },
2382 PositionalToken {
2383 source: uws,
2384 offset: 20,
2385 length: 3,
2386 token: Token::Word(Word::Word("fox".to_string())),
2387 },
2388 PositionalToken {
2389 source: uws,
2390 offset: 23,
2391 length: 1,
2392 token: Token::Special(Special::Separator(Separator::Space)),
2393 },
2394 PositionalToken {
2395 source: uws,
2396 offset: 24,
2397 length: 5,
2398 token: Token::Word(Word::Word("can\'t".to_string())),
2399 },
2400 PositionalToken {
2401 source: uws,
2402 offset: 29,
2403 length: 1,
2404 token: Token::Special(Special::Separator(Separator::Space)),
2405 },
2406 PositionalToken {
2407 source: uws,
2408 offset: 30,
2409 length: 4,
2410 token: Token::Word(Word::Word("jump".to_string())),
2411 },
2412 PositionalToken {
2413 source: uws,
2414 offset: 34,
2415 length: 1,
2416 token: Token::Special(Special::Separator(Separator::Space)),
2417 },
2418 PositionalToken {
2419 source: uws,
2420 offset: 35,
2421 length: 4,
2422 token: Token::Word(Word::Number(Number::Float(32.3))),
2423 },
2424 PositionalToken {
2425 source: uws,
2426 offset: 39,
2427 length: 1,
2428 token: Token::Special(Special::Separator(Separator::Space)),
2429 },
2430 PositionalToken {
2431 source: uws,
2432 offset: 40,
2433 length: 4,
2434 token: Token::Word(Word::Word("feet".to_string())),
2435 },
2436 PositionalToken {
2437 source: uws,
2438 offset: 44,
2439 length: 1,
2440 token: Token::Special(Special::Punctuation(',')),
2441 },
2442 PositionalToken {
2443 source: uws,
2444 offset: 45,
2445 length: 1,
2446 token: Token::Special(Special::Separator(Separator::Space)),
2447 },
2448 PositionalToken {
2449 source: uws,
2450 offset: 46,
2451 length: 5,
2452 token: Token::Word(Word::Word("right".to_string())),
2453 },
2454 PositionalToken {
2455 source: uws,
2456 offset: 51,
2457 length: 1,
2458 token: Token::Special(Special::Punctuation('?')),
2459 },
2460 PositionalToken {
2461 source: uws,
2462 offset: 52,
2463 length: 1,
2464 token: Token::Special(Special::Separator(Separator::Space)),
2465 },
2466 PositionalToken {
2467 source: uws,
2468 offset: 53,
2469 length: 4,
2470 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
2471 }, PositionalToken {
2473 source: uws,
2474 offset: 57,
2475 length: 1,
2476 token: Token::Special(Special::Separator(Separator::Space)),
2477 },
2478 PositionalToken {
2479 source: uws,
2480 offset: 58,
2481 length: 3,
2482 token: Token::Word(Word::Word("etc".to_string())),
2483 },
2484 PositionalToken {
2485 source: uws,
2486 offset: 61,
2487 length: 1,
2488 token: Token::Special(Special::Punctuation('.')),
2489 },
2490 PositionalToken {
2491 source: uws,
2492 offset: 62,
2493 length: 1,
2494 token: Token::Special(Special::Separator(Separator::Space)),
2495 },
2496 PositionalToken {
2497 source: uws,
2498 offset: 63,
2499 length: 3,
2500 token: Token::Word(Word::Word("qeq".to_string())),
2501 },
2502 PositionalToken {
2503 source: uws,
2504 offset: 66,
2505 length: 1,
2506 token: Token::Special(Special::Separator(Separator::Space)),
2507 },
2508 PositionalToken {
2509 source: uws,
2510 offset: 67,
2511 length: 5,
2512 token: Token::Word(Word::Word("U.S.A".to_string())),
2513 },
2514 PositionalToken {
2515 source: uws,
2516 offset: 72,
2517 length: 1,
2518 token: Token::Special(Special::Separator(Separator::Space)),
2519 },
2520 PositionalToken {
2521 source: uws,
2522 offset: 73,
2523 length: 1,
2524 token: Token::Special(Special::Separator(Separator::Space)),
2525 },
2526 PositionalToken {
2527 source: uws,
2528 offset: 74,
2529 length: 3,
2530 token: Token::Word(Word::Word("asd".to_string())),
2531 },
2532 PositionalToken {
2533 source: uws,
2534 offset: 77,
2535 length: 1,
2536 token: Token::Special(Special::Separator(Separator::Newline)),
2537 },
2538 PositionalToken {
2539 source: uws,
2540 offset: 78,
2541 length: 1,
2542 token: Token::Special(Special::Separator(Separator::Newline)),
2543 },
2544 PositionalToken {
2545 source: uws,
2546 offset: 79,
2547 length: 1,
2548 token: Token::Special(Special::Separator(Separator::Newline)),
2549 },
2550 PositionalToken {
2551 source: uws,
2552 offset: 80,
2553 length: 3,
2554 token: Token::Word(Word::Word("Brr".to_string())),
2555 },
2556 PositionalToken {
2557 source: uws,
2558 offset: 83,
2559 length: 1,
2560 token: Token::Special(Special::Punctuation(',')),
2561 },
2562 PositionalToken {
2563 source: uws,
2564 offset: 84,
2565 length: 1,
2566 token: Token::Special(Special::Separator(Separator::Space)),
2567 },
2568 PositionalToken {
2569 source: uws,
2570 offset: 85,
2571 length: 4,
2572 token: Token::Word(Word::Word("it\'s".to_string())),
2573 },
2574 PositionalToken {
2575 source: uws,
2576 offset: 89,
2577 length: 1,
2578 token: Token::Special(Special::Separator(Separator::Space)),
2579 },
2580 PositionalToken {
2581 source: uws,
2582 offset: 90,
2583 length: 4,
2584 token: Token::Word(Word::Number(Number::Float(29.3))),
2585 },
2586 PositionalToken {
2587 source: uws,
2588 offset: 94,
2589 length: 2,
2590 token: Token::Special(Special::Symbol('°')),
2591 },
2592 PositionalToken {
2593 source: uws,
2594 offset: 96,
2595 length: 1,
2596 token: Token::Word(Word::Word("F".to_string())),
2597 },
2598 PositionalToken {
2599 source: uws,
2600 offset: 97,
2601 length: 1,
2602 token: Token::Special(Special::Punctuation('!')),
2603 },
2604 PositionalToken {
2605 source: uws,
2606 offset: 98,
2607 length: 1,
2608 token: Token::Special(Special::Separator(Separator::Newline)),
2609 },
2610 PositionalToken {
2611 source: uws,
2612 offset: 99,
2613 length: 1,
2614 token: Token::Special(Special::Separator(Separator::Space)),
2615 },
2616 PositionalToken {
2617 source: uws,
2618 offset: 100,
2619 length: 14,
2620 token: Token::Word(Word::Word("Русское".to_string())),
2621 },
2622 PositionalToken {
2623 source: uws,
2624 offset: 114,
2625 length: 1,
2626 token: Token::Special(Special::Separator(Separator::Space)),
2627 },
2628 PositionalToken {
2629 source: uws,
2630 offset: 115,
2631 length: 22,
2632 token: Token::Word(Word::Word("предложение".to_string())),
2633 },
2634 PositionalToken {
2635 source: uws,
2636 offset: 137,
2637 length: 1,
2638 token: Token::Special(Special::Separator(Separator::Space)),
2639 },
2640 PositionalToken {
2641 source: uws,
2642 offset: 138,
2643 length: 1,
2644 token: Token::Special(Special::Punctuation('#')),
2645 },
2646 PositionalToken {
2647 source: uws,
2648 offset: 139,
2649 length: 4,
2650 token: Token::Word(Word::Number(Number::Float(36.6))),
2651 },
2652 PositionalToken {
2653 source: uws,
2654 offset: 143,
2655 length: 1,
2656 token: Token::Special(Special::Separator(Separator::Space)),
2657 },
2658 PositionalToken {
2659 source: uws,
2660 offset: 144,
2661 length: 6,
2662 token: Token::Word(Word::Word("для".to_string())),
2663 },
2664 PositionalToken {
2665 source: uws,
2666 offset: 150,
2667 length: 1,
2668 token: Token::Special(Special::Separator(Separator::Space)),
2669 },
2670 PositionalToken {
2671 source: uws,
2672 offset: 151,
2673 length: 24,
2674 token: Token::Word(Word::Word("тестирования".to_string())),
2675 },
2676 PositionalToken {
2677 source: uws,
2678 offset: 175,
2679 length: 1,
2680 token: Token::Special(Special::Separator(Separator::Space)),
2681 },
2682 PositionalToken {
2683 source: uws,
2684 offset: 176,
2685 length: 14,
2686 token: Token::Word(Word::Word("деления".to_string())),
2687 },
2688 PositionalToken {
2689 source: uws,
2690 offset: 190,
2691 length: 1,
2692 token: Token::Special(Special::Separator(Separator::Space)),
2693 },
2694 PositionalToken {
2695 source: uws,
2696 offset: 191,
2697 length: 4,
2698 token: Token::Word(Word::Word("по".to_string())),
2699 },
2700 PositionalToken {
2701 source: uws,
2702 offset: 195,
2703 length: 1,
2704 token: Token::Special(Special::Separator(Separator::Space)),
2705 },
2706 PositionalToken {
2707 source: uws,
2708 offset: 196,
2709 length: 12,
2710 token: Token::Word(Word::Word("юникод".to_string())),
2711 },
2712 PositionalToken {
2713 source: uws,
2714 offset: 208,
2715 length: 1,
2716 token: Token::Special(Special::Punctuation('-')),
2717 },
2718 PositionalToken {
2719 source: uws,
2720 offset: 209,
2721 length: 12,
2722 token: Token::Word(Word::Word("словам".to_string())),
2723 },
2724 PositionalToken {
2725 source: uws,
2726 offset: 221,
2727 length: 1,
2728 token: Token::Special(Special::Punctuation('.')),
2729 },
2730 PositionalToken {
2731 source: uws,
2732 offset: 222,
2733 length: 1,
2734 token: Token::Special(Special::Punctuation('.')),
2735 },
2736 PositionalToken {
2737 source: uws,
2738 offset: 223,
2739 length: 1,
2740 token: Token::Special(Special::Punctuation('.')),
2741 },
2742 PositionalToken {
2743 source: uws,
2744 offset: 224,
2745 length: 1,
2746 token: Token::Special(Special::Separator(Separator::Newline)),
2747 },
2748 ];
2749 let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
2750 check_results(&result, &lib_res, uws);
2751 }
2752
2753 #[test]
2754 fn general_complex() {
2755 let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right? 4pda etc. qeq U.S.A asd\n\n\nBrr, it's 29.3°F!\n Русское предложение #36.6 для тестирования деления по юникод-словам...\n";
2756 let result = vec![
2757 PositionalToken {
2758 source: uws,
2759 offset: 0,
2760 length: 3,
2761 token: Token::Word(Word::Word("The".to_string())),
2762 },
2763 PositionalToken {
2764 source: uws,
2765 offset: 3,
2766 length: 1,
2767 token: Token::Special(Special::Separator(Separator::Space)),
2768 },
2769 PositionalToken {
2770 source: uws,
2771 offset: 4,
2772 length: 5,
2773 token: Token::Word(Word::Word("quick".to_string())),
2774 },
2775 PositionalToken {
2776 source: uws,
2777 offset: 9,
2778 length: 1,
2779 token: Token::Special(Special::Separator(Separator::Space)),
2780 },
2781 PositionalToken {
2782 source: uws,
2783 offset: 10,
2784 length: 1,
2785 token: Token::Special(Special::Punctuation('(')),
2786 },
2787 PositionalToken {
2788 source: uws,
2789 offset: 11,
2790 length: 1,
2791 token: Token::Special(Special::Punctuation('"')),
2792 },
2793 PositionalToken {
2794 source: uws,
2795 offset: 12,
2796 length: 5,
2797 token: Token::Word(Word::Word("brown".to_string())),
2798 },
2799 PositionalToken {
2800 source: uws,
2801 offset: 17,
2802 length: 1,
2803 token: Token::Special(Special::Punctuation('"')),
2804 },
2805 PositionalToken {
2806 source: uws,
2807 offset: 18,
2808 length: 1,
2809 token: Token::Special(Special::Punctuation(')')),
2810 },
2811 PositionalToken {
2812 source: uws,
2813 offset: 19,
2814 length: 1,
2815 token: Token::Special(Special::Separator(Separator::Space)),
2816 },
2817 PositionalToken {
2818 source: uws,
2819 offset: 20,
2820 length: 3,
2821 token: Token::Word(Word::Word("fox".to_string())),
2822 },
2823 PositionalToken {
2824 source: uws,
2825 offset: 23,
2826 length: 1,
2827 token: Token::Special(Special::Separator(Separator::Space)),
2828 },
2829 PositionalToken {
2830 source: uws,
2831 offset: 24,
2832 length: 5,
2833 token: Token::Word(Word::Word("can\'t".to_string())),
2834 },
2835 PositionalToken {
2836 source: uws,
2837 offset: 29,
2838 length: 1,
2839 token: Token::Special(Special::Separator(Separator::Space)),
2840 },
2841 PositionalToken {
2842 source: uws,
2843 offset: 30,
2844 length: 4,
2845 token: Token::Word(Word::Word("jump".to_string())),
2846 },
2847 PositionalToken {
2848 source: uws,
2849 offset: 34,
2850 length: 1,
2851 token: Token::Special(Special::Separator(Separator::Space)),
2852 },
2853 PositionalToken {
2854 source: uws,
2855 offset: 35,
2856 length: 4,
2857 token: Token::Word(Word::Number(Number::Float(32.3))),
2858 },
2859 PositionalToken {
2860 source: uws,
2861 offset: 39,
2862 length: 1,
2863 token: Token::Special(Special::Separator(Separator::Space)),
2864 },
2865 PositionalToken {
2866 source: uws,
2867 offset: 40,
2868 length: 4,
2869 token: Token::Word(Word::Word("feet".to_string())),
2870 },
2871 PositionalToken {
2872 source: uws,
2873 offset: 44,
2874 length: 1,
2875 token: Token::Special(Special::Punctuation(',')),
2876 },
2877 PositionalToken {
2878 source: uws,
2879 offset: 45,
2880 length: 1,
2881 token: Token::Special(Special::Separator(Separator::Space)),
2882 },
2883 PositionalToken {
2884 source: uws,
2885 offset: 46,
2886 length: 5,
2887 token: Token::Word(Word::Word("right".to_string())),
2888 },
2889 PositionalToken {
2890 source: uws,
2891 offset: 51,
2892 length: 1,
2893 token: Token::Special(Special::Punctuation('?')),
2894 },
2895 PositionalToken {
2896 source: uws,
2897 offset: 52,
2898 length: 1,
2899 token: Token::Special(Special::Separator(Separator::Space)),
2900 },
2901 PositionalToken {
2902 source: uws,
2903 offset: 53,
2904 length: 4,
2905 token: Token::Word(Word::Numerical(Numerical::Measures("4pda".to_string()))),
2906 }, PositionalToken {
2908 source: uws,
2909 offset: 57,
2910 length: 1,
2911 token: Token::Special(Special::Separator(Separator::Space)),
2912 },
2913 PositionalToken {
2914 source: uws,
2915 offset: 58,
2916 length: 3,
2917 token: Token::Word(Word::Word("etc".to_string())),
2918 },
2919 PositionalToken {
2920 source: uws,
2921 offset: 61,
2922 length: 1,
2923 token: Token::Special(Special::Punctuation('.')),
2924 },
2925 PositionalToken {
2926 source: uws,
2927 offset: 62,
2928 length: 1,
2929 token: Token::Special(Special::Separator(Separator::Space)),
2930 },
2931 PositionalToken {
2932 source: uws,
2933 offset: 63,
2934 length: 3,
2935 token: Token::Word(Word::Word("qeq".to_string())),
2936 },
2937 PositionalToken {
2938 source: uws,
2939 offset: 66,
2940 length: 1,
2941 token: Token::Special(Special::Separator(Separator::Space)),
2942 },
2943 PositionalToken {
2944 source: uws,
2945 offset: 67,
2946 length: 5,
2947 token: Token::Word(Word::Word("U.S.A".to_string())),
2948 },
2949 PositionalToken {
2950 source: uws,
2951 offset: 72,
2952 length: 2,
2953 token: Token::Special(Special::Separator(Separator::Space)),
2954 },
2955 PositionalToken {
2956 source: uws,
2957 offset: 74,
2958 length: 3,
2959 token: Token::Word(Word::Word("asd".to_string())),
2960 },
2961 PositionalToken {
2962 source: uws,
2963 offset: 77,
2964 length: 3,
2965 token: Token::Special(Special::Separator(Separator::Newline)),
2966 },
2967 PositionalToken {
2968 source: uws,
2969 offset: 80,
2970 length: 3,
2971 token: Token::Word(Word::Word("Brr".to_string())),
2972 },
2973 PositionalToken {
2974 source: uws,
2975 offset: 83,
2976 length: 1,
2977 token: Token::Special(Special::Punctuation(',')),
2978 },
2979 PositionalToken {
2980 source: uws,
2981 offset: 84,
2982 length: 1,
2983 token: Token::Special(Special::Separator(Separator::Space)),
2984 },
2985 PositionalToken {
2986 source: uws,
2987 offset: 85,
2988 length: 4,
2989 token: Token::Word(Word::Word("it\'s".to_string())),
2990 },
2991 PositionalToken {
2992 source: uws,
2993 offset: 89,
2994 length: 1,
2995 token: Token::Special(Special::Separator(Separator::Space)),
2996 },
2997 PositionalToken {
2998 source: uws,
2999 offset: 90,
3000 length: 4,
3001 token: Token::Word(Word::Number(Number::Float(29.3))),
3002 },
3003 PositionalToken {
3004 source: uws,
3005 offset: 94,
3006 length: 2,
3007 token: Token::Special(Special::Symbol('°')),
3008 },
3009 PositionalToken {
3010 source: uws,
3011 offset: 96,
3012 length: 1,
3013 token: Token::Word(Word::Word("F".to_string())),
3014 },
3015 PositionalToken {
3016 source: uws,
3017 offset: 97,
3018 length: 1,
3019 token: Token::Special(Special::Punctuation('!')),
3020 },
3021 PositionalToken {
3022 source: uws,
3023 offset: 98,
3024 length: 1,
3025 token: Token::Special(Special::Separator(Separator::Newline)),
3026 },
3027 PositionalToken {
3028 source: uws,
3029 offset: 99,
3030 length: 1,
3031 token: Token::Special(Special::Separator(Separator::Space)),
3032 },
3033 PositionalToken {
3034 source: uws,
3035 offset: 100,
3036 length: 14,
3037 token: Token::Word(Word::Word("Русское".to_string())),
3038 },
3039 PositionalToken {
3040 source: uws,
3041 offset: 114,
3042 length: 1,
3043 token: Token::Special(Special::Separator(Separator::Space)),
3044 },
3045 PositionalToken {
3046 source: uws,
3047 offset: 115,
3048 length: 22,
3049 token: Token::Word(Word::Word("предложение".to_string())),
3050 },
3051 PositionalToken {
3052 source: uws,
3053 offset: 137,
3054 length: 1,
3055 token: Token::Special(Special::Separator(Separator::Space)),
3056 },
3057 PositionalToken {
3058 source: uws,
3059 offset: 138,
3060 length: 5,
3061 token: Token::Struct(Struct::Hashtag("36.6".to_string())),
3062 },
3063 PositionalToken {
3064 source: uws,
3065 offset: 143,
3066 length: 1,
3067 token: Token::Special(Special::Separator(Separator::Space)),
3068 },
3069 PositionalToken {
3070 source: uws,
3071 offset: 144,
3072 length: 6,
3073 token: Token::Word(Word::Word("для".to_string())),
3074 },
3075 PositionalToken {
3076 source: uws,
3077 offset: 150,
3078 length: 1,
3079 token: Token::Special(Special::Separator(Separator::Space)),
3080 },
3081 PositionalToken {
3082 source: uws,
3083 offset: 151,
3084 length: 24,
3085 token: Token::Word(Word::Word("тестирования".to_string())),
3086 },
3087 PositionalToken {
3088 source: uws,
3089 offset: 175,
3090 length: 1,
3091 token: Token::Special(Special::Separator(Separator::Space)),
3092 },
3093 PositionalToken {
3094 source: uws,
3095 offset: 176,
3096 length: 14,
3097 token: Token::Word(Word::Word("деления".to_string())),
3098 },
3099 PositionalToken {
3100 source: uws,
3101 offset: 190,
3102 length: 1,
3103 token: Token::Special(Special::Separator(Separator::Space)),
3104 },
3105 PositionalToken {
3106 source: uws,
3107 offset: 191,
3108 length: 4,
3109 token: Token::Word(Word::Word("по".to_string())),
3110 },
3111 PositionalToken {
3112 source: uws,
3113 offset: 195,
3114 length: 1,
3115 token: Token::Special(Special::Separator(Separator::Space)),
3116 },
3117 PositionalToken {
3118 source: uws,
3119 offset: 196,
3120 length: 12,
3121 token: Token::Word(Word::Word("юникод".to_string())),
3122 },
3123 PositionalToken {
3124 source: uws,
3125 offset: 208,
3126 length: 1,
3127 token: Token::Special(Special::Punctuation('-')),
3128 },
3129 PositionalToken {
3130 source: uws,
3131 offset: 209,
3132 length: 12,
3133 token: Token::Word(Word::Word("словам".to_string())),
3134 },
3135 PositionalToken {
3136 source: uws,
3137 offset: 221,
3138 length: 3,
3139 token: Token::Special(Special::Punctuation('.')),
3140 },
3141 PositionalToken {
3142 source: uws,
3143 offset: 224,
3144 length: 1,
3145 token: Token::Special(Special::Separator(Separator::Newline)),
3146 },
3147 ];
3148 let lib_res = uws
3149 .into_tokenizer(TokenizerParams::complex())
3150 .collect::<Vec<_>>();
3151 check_results(&result, &lib_res, uws);
3152 }
3153
3154 #[test]
3155 fn plus_minus() {
3156 let uws = "+23 -4.5 -34 +25.7 - 2 + 5.6";
3157 let result = vec![
3158 PositionalToken {
3159 source: uws,
3160 offset: 0,
3161 length: 3,
3162 token: Token::Word(Word::Number(Number::Integer(23))),
3163 },
3164 PositionalToken {
3165 source: uws,
3166 offset: 3,
3167 length: 1,
3168 token: Token::Special(Special::Separator(Separator::Space)),
3169 },
3170 PositionalToken {
3171 source: uws,
3172 offset: 4,
3173 length: 4,
3174 token: Token::Word(Word::Number(Number::Float(-4.5))),
3175 },
3176 PositionalToken {
3177 source: uws,
3178 offset: 8,
3179 length: 1,
3180 token: Token::Special(Special::Separator(Separator::Space)),
3181 },
3182 PositionalToken {
3183 source: uws,
3184 offset: 9,
3185 length: 3,
3186 token: Token::Word(Word::Number(Number::Integer(-34))),
3187 },
3188 PositionalToken {
3189 source: uws,
3190 offset: 12,
3191 length: 1,
3192 token: Token::Special(Special::Separator(Separator::Space)),
3193 },
3194 PositionalToken {
3195 source: uws,
3196 offset: 13,
3197 length: 5,
3198 token: Token::Word(Word::Number(Number::Float(25.7))),
3199 },
3200 PositionalToken {
3201 source: uws,
3202 offset: 18,
3203 length: 1,
3204 token: Token::Special(Special::Separator(Separator::Space)),
3205 },
3206 PositionalToken {
3207 source: uws,
3208 offset: 19,
3209 length: 1,
3210 token: Token::Special(Special::Punctuation('-')),
3211 },
3212 PositionalToken {
3213 source: uws,
3214 offset: 20,
3215 length: 1,
3216 token: Token::Special(Special::Separator(Separator::Space)),
3217 },
3218 PositionalToken {
3219 source: uws,
3220 offset: 21,
3221 length: 1,
3222 token: Token::Word(Word::Number(Number::Integer(2))),
3223 },
3224 PositionalToken {
3225 source: uws,
3226 offset: 22,
3227 length: 1,
3228 token: Token::Special(Special::Separator(Separator::Space)),
3229 },
3230 PositionalToken {
3231 source: uws,
3232 offset: 23,
3233 length: 1,
3234 token: Token::Special(Special::Punctuation('+')),
3235 },
3236 PositionalToken {
3237 source: uws,
3238 offset: 24,
3239 length: 1,
3240 token: Token::Special(Special::Separator(Separator::Space)),
3241 },
3242 PositionalToken {
3243 source: uws,
3244 offset: 25,
3245 length: 3,
3246 token: Token::Word(Word::Number(Number::Float(5.6))),
3247 },
3248 ];
3249 let lib_res = uws
3250 .into_tokenizer(TokenizerParams::v1())
3251 .collect::<Vec<_>>();
3252 check(&result, &lib_res, uws);
3253 }
3255
3256 #[test]
3257 #[ignore]
3258 fn woman_bouncing_ball() {
3259 let uws = "\u{26f9}\u{200d}\u{2640}";
3260 let result = vec![PositionalToken {
3261 source: uws,
3262 offset: 0,
3263 length: 9,
3264 token: Token::Word(Word::Emoji("woman_bouncing_ball")),
3265 }];
3266 let lib_res = uws
3267 .into_tokenizer(TokenizerParams::v1())
3268 .collect::<Vec<_>>();
3269 check_results(&result, &lib_res, uws);
3270 }
3272
3273 #[test]
3274 fn emoji_and_rusabbr_default() {
3275 let uws = "🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n👱\nС.С.С.Р.\n👨👩👦👦\n🧠\n";
3276 let result = vec![
3277 PositionalToken {
3278 source: uws,
3279 offset: 0,
3280 length: 8,
3281 token: Token::Word(Word::Emoji("russia")),
3282 },
3283 PositionalToken {
3284 source: uws,
3285 offset: 8,
3286 length: 1,
3287 token: Token::Special(Special::Separator(Separator::Space)),
3288 },
3289 PositionalToken {
3290 source: uws,
3291 offset: 9,
3292 length: 8,
3293 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
3294 },
3295 PositionalToken {
3296 source: uws,
3297 offset: 17,
3298 length: 1,
3299 token: Token::Special(Special::Separator(Separator::Newline)),
3300 },
3301 PositionalToken {
3302 source: uws,
3303 offset: 18,
3304 length: 8,
3305 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
3306 },
3307 PositionalToken {
3308 source: uws,
3309 offset: 26,
3310 length: 8,
3311 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
3312 },
3313 PositionalToken {
3314 source: uws,
3315 offset: 34,
3316 length: 8,
3317 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
3318 },
3319 PositionalToken {
3320 source: uws,
3321 offset: 42,
3322 length: 1,
3323 token: Token::Special(Special::Separator(Separator::Newline)),
3324 },
3325 PositionalToken {
3326 source: uws,
3327 offset: 43,
3328 length: 4,
3329 token: Token::Word(Word::Emoji("blond_haired_person")),
3330 },
3331 PositionalToken {
3332 source: uws,
3333 offset: 47,
3334 length: 1,
3335 token: Token::Special(Special::Separator(Separator::Newline)),
3336 },
3337 PositionalToken {
3338 source: uws,
3339 offset: 48,
3340 length: 2,
3341 token: Token::Word(Word::Word("С".to_string())),
3342 },
3343 PositionalToken {
3344 source: uws,
3345 offset: 50,
3346 length: 1,
3347 token: Token::Special(Special::Punctuation('.')),
3348 },
3349 PositionalToken {
3350 source: uws,
3351 offset: 51,
3352 length: 2,
3353 token: Token::Word(Word::Word("С".to_string())),
3354 },
3355 PositionalToken {
3356 source: uws,
3357 offset: 53,
3358 length: 1,
3359 token: Token::Special(Special::Punctuation('.')),
3360 },
3361 PositionalToken {
3362 source: uws,
3363 offset: 54,
3364 length: 2,
3365 token: Token::Word(Word::Word("С".to_string())),
3366 },
3367 PositionalToken {
3368 source: uws,
3369 offset: 56,
3370 length: 1,
3371 token: Token::Special(Special::Punctuation('.')),
3372 },
3373 PositionalToken {
3374 source: uws,
3375 offset: 57,
3376 length: 2,
3377 token: Token::Word(Word::Word("Р".to_string())),
3378 },
3379 PositionalToken {
3380 source: uws,
3381 offset: 59,
3382 length: 1,
3383 token: Token::Special(Special::Punctuation('.')),
3384 },
3385 PositionalToken {
3386 source: uws,
3387 offset: 60,
3388 length: 1,
3389 token: Token::Special(Special::Separator(Separator::Newline)),
3390 },
3391 PositionalToken {
3392 source: uws,
3393 offset: 61,
3394 length: 25,
3395 token: Token::Word(Word::Emoji("family_man_woman_boy_boy")),
3396 },
3397 PositionalToken {
3398 source: uws,
3399 offset: 86,
3400 length: 1,
3401 token: Token::Special(Special::Separator(Separator::Newline)),
3402 },
3403 PositionalToken {
3404 source: uws,
3405 offset: 87,
3406 length: 4,
3407 token: Token::Word(Word::Emoji("brain")),
3408 },
3409 PositionalToken {
3410 source: uws,
3411 offset: 91,
3412 length: 1,
3413 token: Token::Special(Special::Separator(Separator::Newline)),
3414 },
3415 ];
3416
3417 let lib_res = uws
3418 .into_tokenizer(TokenizerParams::v1())
3419 .collect::<Vec<_>>();
3420 check_results(&result, &lib_res, uws);
3421 }
3423
3424 #[test]
3425 fn emoji_and_rusabbr_no_split() {
3426 let uws = "🇷🇺 🇸🇹\n👱🏿👶🏽👨🏽\n👱\nС.С.С.Р.\n👨👩👦👦\n🧠\n";
3427 let result = vec![
3428 PositionalToken {
3429 source: uws,
3430 offset: 0,
3431 length: 8,
3432 token: Token::Word(Word::Emoji("russia")),
3433 },
3434 PositionalToken {
3435 source: uws,
3436 offset: 8,
3437 length: 1,
3438 token: Token::Special(Special::Separator(Separator::Space)),
3439 },
3440 PositionalToken {
3441 source: uws,
3442 offset: 9,
3443 length: 8,
3444 token: Token::Word(Word::Emoji("sao_tome_and_principe")),
3445 },
3446 PositionalToken {
3447 source: uws,
3448 offset: 17,
3449 length: 1,
3450 token: Token::Special(Special::Separator(Separator::Newline)),
3451 },
3452 PositionalToken {
3453 source: uws,
3454 offset: 18,
3455 length: 8,
3456 token: Token::Word(Word::Emoji("blond_haired_person_dark_skin_tone")),
3457 },
3458 PositionalToken {
3459 source: uws,
3460 offset: 26,
3461 length: 8,
3462 token: Token::Word(Word::Emoji("baby_medium_skin_tone")),
3463 },
3464 PositionalToken {
3465 source: uws,
3466 offset: 34,
3467 length: 8,
3468 token: Token::Word(Word::Emoji("man_medium_skin_tone")),
3469 },
3470 PositionalToken {
3471 source: uws,
3472 offset: 42,
3473 length: 1,
3474 token: Token::Special(Special::Separator(Separator::Newline)),
3475 },
3476 PositionalToken {
3477 source: uws,
3478 offset: 43,
3479 length: 4,
3480 token: Token::Word(Word::Emoji("blond_haired_person")),
3481 },
3482 PositionalToken {
3483 source: uws,
3484 offset: 47,
3485 length: 1,
3486 token: Token::Special(Special::Separator(Separator::Newline)),
3487 },
3488 PositionalToken {
3489 source: uws,
3490 offset: 48,
3491 length: 11,
3492 token: Token::Word(Word::Word("С.С.С.Р".to_string())),
3493 },
3494 PositionalToken {
3495 source: uws,
3496 offset: 59,
3497 length: 1,
3498 token: Token::Special(Special::Punctuation('.')),
3499 },
3500 PositionalToken {
3501 source: uws,
3502 offset: 60,
3503 length: 1,
3504 token: Token::Special(Special::Separator(Separator::Newline)),
3505 },
3506 PositionalToken {
3507 source: uws,
3508 offset: 61,
3509 length: 25,
3510 token: Token::Word(Word::Emoji("family_man_woman_boy_boy")),
3511 },
3512 PositionalToken {
3513 source: uws,
3514 offset: 86,
3515 length: 1,
3516 token: Token::Special(Special::Separator(Separator::Newline)),
3517 },
3518 PositionalToken {
3519 source: uws,
3520 offset: 87,
3521 length: 4,
3522 token: Token::Word(Word::Emoji("brain")),
3523 },
3524 PositionalToken {
3525 source: uws,
3526 offset: 91,
3527 length: 1,
3528 token: Token::Special(Special::Separator(Separator::Newline)),
3529 },
3530 ];
3531
3532 let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
3533 check_results(&result, &lib_res, uws);
3534 }
3536
3537 #[test]
3761 fn html() {
3762 let uws = "<div class=\"article article_view \" id=\"article_view_-113039156_9551\" data-article-url=\"/@chaibuket-o-chem-ne-zabyt-25-noyabrya\" data-audio-context=\"article:-113039156_9551\"><h1 class=\"article_decoration_first article_decoration_last\" >День Мамы </h1><p class=\"article_decoration_first article_decoration_last\" >День, когда поздравляют мам, бабушек, сестер и жён — это всемирный праздник, называемый «День Мамы». В настоящее время его отмечают почти в каждой стране, просто везде разные даты и способы празднования. </p><h3 class=\"article_decoration_first article_decoration_last\" ><span class='article_anchor_title'>\n <span class='article_anchor_button' id='pochemu-my-ego-prazdnuem'></span>\n <span class='article_anchor_fsymbol'>П</span>\n</span>ПОЧЕМУ МЫ ЕГО ПРАЗДНУЕМ</h3><p class=\"article_decoration_first article_decoration_last article_decoration_before\" >В 1987 году комитет госдумы по делам женщин, семьи и молодежи выступил с предложением учредить «День мамы», а сам приказ был подписан уже 30 января 1988 года Борисом Ельциным. Было решено, что ежегодно в России празднество дня мамы будет выпадать на последнее воскресенье ноября. </p><figure data-type=\"101\" data-mode=\"\" class=\"article_decoration_first article_decoration_last\" >\n <div class=\"article_figure_content\" style=\"width: 1125px\">\n <div class=\"article_figure_sizer_content\"><div class=\"article_object_sizer_wrap\" data-sizes=\"[{"s":["https://pp.userapi.com/c849128/v849128704/c0ffd/pcNJaBH3NDo.jpg",75,50],"m":["https://pp.userapi.com/c849128/v849128704/c0ffe/ozCLs2kHtRY.jpg",130,87],"x":["https://pp.userapi.com/c849128/v849128704/c0fff/E4KtTNDydzE.jpg",604,403],"y":["https://pp.userapi.com/c849128/v849128704/c1000/1nLxpYKavzU.jpg",807,538],"z":["https://pp.userapi.com/c849128/v849128704/c1001/IgEODe90yEk.jpg",1125,750],"o":["https://pp.userapi.com/c849128/v849128704/c1002/01faNwVZ2_E.jpg",130,87],"p":["https://pp.userapi.com/c849128/v849128704/c1003/baDFzbdRP2s.jpg",200,133],"q":["https://pp.userapi.com/c849128/v849128704/c1004/CY4khI6KJKA.jpg",320,213],"r":["https://pp.userapi.com/c849128/v849128704/c1005/NOvAJ6-VltY.jpg",510,340]}]\">\n <img class=\"article_object_sizer_inner article_object_photo__image_blur\" src=\"https://pp.userapi.com/c849128/v849128704/c0ffd/pcNJaBH3NDo.jpg\" data-baseurl=\"\"/>\n \n</div></div>\n <div class=\"article_figure_sizer\" style=\"padding-bottom: 66.666666666667%\"></div>";
3763 let result = vec![
3764 PositionalToken {
3765 source: uws,
3766 offset: 236,
3767 length: 8,
3768 token: Token::Word(Word::Word("День".to_string())),
3769 },
3770 PositionalToken {
3771 source: uws,
3772 offset: 244,
3773 length: 1,
3774 token: Token::Special(Special::Separator(Separator::Space)),
3775 },
3776 PositionalToken {
3777 source: uws,
3778 offset: 245,
3779 length: 8,
3780 token: Token::Word(Word::Word("Мамы".to_string())),
3781 },
3782 PositionalToken {
3783 source: uws,
3784 offset: 253,
3785 length: 1,
3786 token: Token::Special(Special::Separator(Separator::Space)),
3787 },
3788 PositionalToken {
3789 source: uws,
3790 offset: 321,
3791 length: 8,
3792 token: Token::Word(Word::Word("День".to_string())),
3793 },
3794 PositionalToken {
3795 source: uws,
3796 offset: 329,
3797 length: 1,
3798 token: Token::Special(Special::Punctuation(',')),
3799 },
3800 PositionalToken {
3801 source: uws,
3802 offset: 330,
3803 length: 1,
3804 token: Token::Special(Special::Separator(Separator::Space)),
3805 },
3806 PositionalToken {
3807 source: uws,
3808 offset: 331,
3809 length: 10,
3810 token: Token::Word(Word::Word("когда".to_string())),
3811 },
3812 PositionalToken {
3813 source: uws,
3814 offset: 341,
3815 length: 1,
3816 token: Token::Special(Special::Separator(Separator::Space)),
3817 },
3818 PositionalToken {
3819 source: uws,
3820 offset: 342,
3821 length: 22,
3822 token: Token::Word(Word::Word("поздравляют".to_string())),
3823 },
3824 PositionalToken {
3825 source: uws,
3826 offset: 364,
3827 length: 1,
3828 token: Token::Special(Special::Separator(Separator::Space)),
3829 },
3830 PositionalToken {
3831 source: uws,
3832 offset: 365,
3833 length: 6,
3834 token: Token::Word(Word::Word("мам".to_string())),
3835 },
3836 PositionalToken {
3837 source: uws,
3838 offset: 371,
3839 length: 1,
3840 token: Token::Special(Special::Punctuation(',')),
3841 },
3842 PositionalToken {
3843 source: uws,
3844 offset: 372,
3845 length: 1,
3846 token: Token::Special(Special::Separator(Separator::Space)),
3847 },
3848 PositionalToken {
3849 source: uws,
3850 offset: 373,
3851 length: 14,
3852 token: Token::Word(Word::Word("бабушек".to_string())),
3853 },
3854 PositionalToken {
3855 source: uws,
3856 offset: 387,
3857 length: 1,
3858 token: Token::Special(Special::Punctuation(',')),
3859 },
3860 PositionalToken {
3861 source: uws,
3862 offset: 388,
3863 length: 1,
3864 token: Token::Special(Special::Separator(Separator::Space)),
3865 },
3866 PositionalToken {
3867 source: uws,
3868 offset: 389,
3869 length: 12,
3870 token: Token::Word(Word::Word("сестер".to_string())),
3871 },
3872 PositionalToken {
3873 source: uws,
3874 offset: 401,
3875 length: 1,
3876 token: Token::Special(Special::Separator(Separator::Space)),
3877 },
3878 PositionalToken {
3879 source: uws,
3880 offset: 402,
3881 length: 2,
3882 token: Token::Word(Word::Word("и".to_string())),
3883 },
3884 PositionalToken {
3885 source: uws,
3886 offset: 404,
3887 length: 1,
3888 token: Token::Special(Special::Separator(Separator::Space)),
3889 },
3890 PositionalToken {
3891 source: uws,
3892 offset: 405,
3893 length: 6,
3894 token: Token::Word(Word::Word("жён".to_string())),
3895 },
3896 PositionalToken {
3897 source: uws,
3898 offset: 411,
3899 length: 1,
3900 token: Token::Special(Special::Separator(Separator::Space)),
3901 },
3902 PositionalToken {
3903 source: uws,
3904 offset: 412,
3905 length: 3,
3906 token: Token::Special(Special::Punctuation('—')),
3907 },
3908 PositionalToken {
3909 source: uws,
3910 offset: 415,
3911 length: 1,
3912 token: Token::Special(Special::Separator(Separator::Space)),
3913 },
3914 PositionalToken {
3915 source: uws,
3916 offset: 416,
3917 length: 6,
3918 token: Token::Word(Word::Word("это".to_string())),
3919 },
3920 PositionalToken {
3921 source: uws,
3922 offset: 422,
3923 length: 1,
3924 token: Token::Special(Special::Separator(Separator::Space)),
3925 },
3926 PositionalToken {
3927 source: uws,
3928 offset: 423,
3929 length: 18,
3930 token: Token::Word(Word::Word("всемирный".to_string())),
3931 },
3932 PositionalToken {
3933 source: uws,
3934 offset: 441,
3935 length: 1,
3936 token: Token::Special(Special::Separator(Separator::Space)),
3937 },
3938 PositionalToken {
3939 source: uws,
3940 offset: 442,
3941 length: 16,
3942 token: Token::Word(Word::Word("праздник".to_string())),
3943 },
3944 PositionalToken {
3945 source: uws,
3946 offset: 458,
3947 length: 1,
3948 token: Token::Special(Special::Punctuation(',')),
3949 },
3950 PositionalToken {
3951 source: uws,
3952 offset: 459,
3953 length: 1,
3954 token: Token::Special(Special::Separator(Separator::Space)),
3955 },
3956 PositionalToken {
3957 source: uws,
3958 offset: 460,
3959 length: 20,
3960 token: Token::Word(Word::Word("называемый".to_string())),
3961 },
3962 PositionalToken {
3963 source: uws,
3964 offset: 480,
3965 length: 1,
3966 token: Token::Special(Special::Separator(Separator::Space)),
3967 },
3968 PositionalToken {
3969 source: uws,
3970 offset: 481,
3971 length: 2,
3972 token: Token::Special(Special::Punctuation('«')),
3973 },
3974 PositionalToken {
3975 source: uws,
3976 offset: 483,
3977 length: 8,
3978 token: Token::Word(Word::Word("День".to_string())),
3979 },
3980 PositionalToken {
3981 source: uws,
3982 offset: 491,
3983 length: 1,
3984 token: Token::Special(Special::Separator(Separator::Space)),
3985 },
3986 PositionalToken {
3987 source: uws,
3988 offset: 492,
3989 length: 8,
3990 token: Token::Word(Word::Word("Мамы".to_string())),
3991 },
3992 PositionalToken {
3993 source: uws,
3994 offset: 500,
3995 length: 2,
3996 token: Token::Special(Special::Punctuation('»')),
3997 },
3998 PositionalToken {
3999 source: uws,
4000 offset: 502,
4001 length: 1,
4002 token: Token::Special(Special::Punctuation('.')),
4003 },
4004 PositionalToken {
4005 source: uws,
4006 offset: 503,
4007 length: 1,
4008 token: Token::Special(Special::Separator(Separator::Space)),
4009 },
4010 PositionalToken {
4011 source: uws,
4012 offset: 504,
4013 length: 2,
4014 token: Token::Word(Word::Word("В".to_string())),
4015 },
4016 PositionalToken {
4017 source: uws,
4018 offset: 506,
4019 length: 1,
4020 token: Token::Special(Special::Separator(Separator::Space)),
4021 },
4022 PositionalToken {
4023 source: uws,
4024 offset: 507,
4025 length: 18,
4026 token: Token::Word(Word::Word("настоящее".to_string())),
4027 },
4028 PositionalToken {
4029 source: uws,
4030 offset: 525,
4031 length: 1,
4032 token: Token::Special(Special::Separator(Separator::Space)),
4033 },
4034 PositionalToken {
4035 source: uws,
4036 offset: 526,
4037 length: 10,
4038 token: Token::Word(Word::Word("время".to_string())),
4039 },
4040 PositionalToken {
4041 source: uws,
4042 offset: 536,
4043 length: 1,
4044 token: Token::Special(Special::Separator(Separator::Space)),
4045 },
4046 PositionalToken {
4047 source: uws,
4048 offset: 537,
4049 length: 6,
4050 token: Token::Word(Word::Word("его".to_string())),
4051 },
4052 PositionalToken {
4053 source: uws,
4054 offset: 543,
4055 length: 1,
4056 token: Token::Special(Special::Separator(Separator::Space)),
4057 },
4058 PositionalToken {
4059 source: uws,
4060 offset: 544,
4061 length: 16,
4062 token: Token::Word(Word::Word("отмечают".to_string())),
4063 },
4064 PositionalToken {
4065 source: uws,
4066 offset: 560,
4067 length: 1,
4068 token: Token::Special(Special::Separator(Separator::Space)),
4069 },
4070 PositionalToken {
4071 source: uws,
4072 offset: 561,
4073 length: 10,
4074 token: Token::Word(Word::Word("почти".to_string())),
4075 },
4076 PositionalToken {
4077 source: uws,
4078 offset: 571,
4079 length: 1,
4080 token: Token::Special(Special::Separator(Separator::Space)),
4081 },
4082 PositionalToken {
4083 source: uws,
4084 offset: 572,
4085 length: 2,
4086 token: Token::Word(Word::Word("в".to_string())),
4087 },
4088 PositionalToken {
4089 source: uws,
4090 offset: 574,
4091 length: 1,
4092 token: Token::Special(Special::Separator(Separator::Space)),
4093 },
4094 PositionalToken {
4095 source: uws,
4096 offset: 575,
4097 length: 12,
4098 token: Token::Word(Word::Word("каждой".to_string())),
4099 },
4100 PositionalToken {
4101 source: uws,
4102 offset: 587,
4103 length: 1,
4104 token: Token::Special(Special::Separator(Separator::Space)),
4105 },
4106 PositionalToken {
4107 source: uws,
4108 offset: 588,
4109 length: 12,
4110 token: Token::Word(Word::Word("стране".to_string())),
4111 },
4112 PositionalToken {
4113 source: uws,
4114 offset: 600,
4115 length: 1,
4116 token: Token::Special(Special::Punctuation(',')),
4117 },
4118 PositionalToken {
4119 source: uws,
4120 offset: 601,
4121 length: 1,
4122 token: Token::Special(Special::Separator(Separator::Space)),
4123 },
4124 PositionalToken {
4125 source: uws,
4126 offset: 602,
4127 length: 12,
4128 token: Token::Word(Word::Word("просто".to_string())),
4129 },
4130 PositionalToken {
4131 source: uws,
4132 offset: 614,
4133 length: 1,
4134 token: Token::Special(Special::Separator(Separator::Space)),
4135 },
4136 PositionalToken {
4137 source: uws,
4138 offset: 615,
4139 length: 10,
4140 token: Token::Word(Word::Word("везде".to_string())),
4141 },
4142 PositionalToken {
4143 source: uws,
4144 offset: 625,
4145 length: 1,
4146 token: Token::Special(Special::Separator(Separator::Space)),
4147 },
4148 PositionalToken {
4149 source: uws,
4150 offset: 626,
4151 length: 12,
4152 token: Token::Word(Word::Word("разные".to_string())),
4153 },
4154 PositionalToken {
4155 source: uws,
4156 offset: 638,
4157 length: 1,
4158 token: Token::Special(Special::Separator(Separator::Space)),
4159 },
4160 PositionalToken {
4161 source: uws,
4162 offset: 639,
4163 length: 8,
4164 token: Token::Word(Word::Word("даты".to_string())),
4165 },
4166 PositionalToken {
4167 source: uws,
4168 offset: 647,
4169 length: 1,
4170 token: Token::Special(Special::Separator(Separator::Space)),
4171 },
4172 PositionalToken {
4173 source: uws,
4174 offset: 648,
4175 length: 2,
4176 token: Token::Word(Word::Word("и".to_string())),
4177 },
4178 PositionalToken {
4179 source: uws,
4180 offset: 650,
4181 length: 1,
4182 token: Token::Special(Special::Separator(Separator::Space)),
4183 },
4184 PositionalToken {
4185 source: uws,
4186 offset: 651,
4187 length: 14,
4188 token: Token::Word(Word::Word("способы".to_string())),
4189 },
4190 PositionalToken {
4191 source: uws,
4192 offset: 665,
4193 length: 1,
4194 token: Token::Special(Special::Separator(Separator::Space)),
4195 },
4196 PositionalToken {
4197 source: uws,
4198 offset: 666,
4199 length: 24,
4200 token: Token::Word(Word::Word("празднования".to_string())),
4201 },
4202 PositionalToken {
4203 source: uws,
4204 offset: 690,
4205 length: 1,
4206 token: Token::Special(Special::Punctuation('.')),
4207 },
4208 PositionalToken {
4209 source: uws,
4210 offset: 691,
4211 length: 1,
4212 token: Token::Special(Special::Separator(Separator::Space)),
4213 },
4214 PositionalToken {
4215 source: uws,
4216 offset: 794,
4217 length: 1,
4218 token: Token::Special(Special::Separator(Separator::Newline)),
4219 },
4220 PositionalToken {
4221 source: uws,
4222 offset: 795,
4223 length: 2,
4224 token: Token::Special(Special::Separator(Separator::Space)),
4225 },
4226 PositionalToken {
4227 source: uws,
4228 offset: 870,
4229 length: 1,
4230 token: Token::Special(Special::Separator(Separator::Newline)),
4231 },
4232 PositionalToken {
4233 source: uws,
4234 offset: 871,
4235 length: 2,
4236 token: Token::Special(Special::Separator(Separator::Space)),
4237 },
4238 PositionalToken {
4239 source: uws,
4240 offset: 910,
4241 length: 2,
4242 token: Token::Word(Word::Word("П".to_string())),
4243 },
4244 PositionalToken {
4245 source: uws,
4246 offset: 919,
4247 length: 1,
4248 token: Token::Special(Special::Separator(Separator::Newline)),
4249 },
4250 PositionalToken {
4251 source: uws,
4252 offset: 927,
4253 length: 12,
4254 token: Token::Word(Word::Word("ПОЧЕМУ".to_string())),
4255 },
4256 PositionalToken {
4257 source: uws,
4258 offset: 939,
4259 length: 1,
4260 token: Token::Special(Special::Separator(Separator::Space)),
4261 },
4262 PositionalToken {
4263 source: uws,
4264 offset: 940,
4265 length: 4,
4266 token: Token::Word(Word::Word("МЫ".to_string())),
4267 },
4268 PositionalToken {
4269 source: uws,
4270 offset: 944,
4271 length: 1,
4272 token: Token::Special(Special::Separator(Separator::Space)),
4273 },
4274 PositionalToken {
4275 source: uws,
4276 offset: 945,
4277 length: 6,
4278 token: Token::Word(Word::Word("ЕГО".to_string())),
4279 },
4280 PositionalToken {
4281 source: uws,
4282 offset: 951,
4283 length: 1,
4284 token: Token::Special(Special::Separator(Separator::Space)),
4285 },
4286 PositionalToken {
4287 source: uws,
4288 offset: 952,
4289 length: 18,
4290 token: Token::Word(Word::Word("ПРАЗДНУЕМ".to_string())),
4291 },
4292 PositionalToken {
4293 source: uws,
4294 offset: 1063,
4295 length: 2,
4296 token: Token::Word(Word::Word("В".to_string())),
4297 },
4298 PositionalToken {
4299 source: uws,
4300 offset: 1065,
4301 length: 1,
4302 token: Token::Special(Special::Separator(Separator::Space)),
4303 },
4304 PositionalToken {
4305 source: uws,
4306 offset: 1066,
4307 length: 4,
4308 token: Token::Word(Word::Number(Number::Integer(1987))),
4309 },
4310 PositionalToken {
4311 source: uws,
4312 offset: 1070,
4313 length: 1,
4314 token: Token::Special(Special::Separator(Separator::Space)),
4315 },
4316 PositionalToken {
4317 source: uws,
4318 offset: 1071,
4319 length: 8,
4320 token: Token::Word(Word::Word("году".to_string())),
4321 },
4322 PositionalToken {
4323 source: uws,
4324 offset: 1079,
4325 length: 1,
4326 token: Token::Special(Special::Separator(Separator::Space)),
4327 },
4328 PositionalToken {
4329 source: uws,
4330 offset: 1080,
4331 length: 14,
4332 token: Token::Word(Word::Word("комитет".to_string())),
4333 },
4334 PositionalToken {
4335 source: uws,
4336 offset: 1094,
4337 length: 1,
4338 token: Token::Special(Special::Separator(Separator::Space)),
4339 },
4340 PositionalToken {
4341 source: uws,
4342 offset: 1095,
4343 length: 14,
4344 token: Token::Word(Word::Word("госдумы".to_string())),
4345 },
4346 PositionalToken {
4347 source: uws,
4348 offset: 1109,
4349 length: 1,
4350 token: Token::Special(Special::Separator(Separator::Space)),
4351 },
4352 PositionalToken {
4353 source: uws,
4354 offset: 1110,
4355 length: 4,
4356 token: Token::Word(Word::Word("по".to_string())),
4357 },
4358 PositionalToken {
4359 source: uws,
4360 offset: 1114,
4361 length: 1,
4362 token: Token::Special(Special::Separator(Separator::Space)),
4363 },
4364 PositionalToken {
4365 source: uws,
4366 offset: 1115,
4367 length: 10,
4368 token: Token::Word(Word::Word("делам".to_string())),
4369 },
4370 PositionalToken {
4371 source: uws,
4372 offset: 1125,
4373 length: 1,
4374 token: Token::Special(Special::Separator(Separator::Space)),
4375 },
4376 PositionalToken {
4377 source: uws,
4378 offset: 1126,
4379 length: 12,
4380 token: Token::Word(Word::Word("женщин".to_string())),
4381 },
4382 PositionalToken {
4383 source: uws,
4384 offset: 1138,
4385 length: 1,
4386 token: Token::Special(Special::Punctuation(',')),
4387 },
4388 PositionalToken {
4389 source: uws,
4390 offset: 1139,
4391 length: 1,
4392 token: Token::Special(Special::Separator(Separator::Space)),
4393 },
4394 PositionalToken {
4395 source: uws,
4396 offset: 1140,
4397 length: 10,
4398 token: Token::Word(Word::Word("семьи".to_string())),
4399 },
4400 PositionalToken {
4401 source: uws,
4402 offset: 1150,
4403 length: 1,
4404 token: Token::Special(Special::Separator(Separator::Space)),
4405 },
4406 PositionalToken {
4407 source: uws,
4408 offset: 1151,
4409 length: 2,
4410 token: Token::Word(Word::Word("и".to_string())),
4411 },
4412 PositionalToken {
4413 source: uws,
4414 offset: 1153,
4415 length: 1,
4416 token: Token::Special(Special::Separator(Separator::Space)),
4417 },
4418 PositionalToken {
4419 source: uws,
4420 offset: 1154,
4421 length: 16,
4422 token: Token::Word(Word::Word("молодежи".to_string())),
4423 },
4424 PositionalToken {
4425 source: uws,
4426 offset: 1170,
4427 length: 1,
4428 token: Token::Special(Special::Separator(Separator::Space)),
4429 },
4430 PositionalToken {
4431 source: uws,
4432 offset: 1171,
4433 length: 16,
4434 token: Token::Word(Word::Word("выступил".to_string())),
4435 },
4436 PositionalToken {
4437 source: uws,
4438 offset: 1187,
4439 length: 1,
4440 token: Token::Special(Special::Separator(Separator::Space)),
4441 },
4442 PositionalToken {
4443 source: uws,
4444 offset: 1188,
4445 length: 2,
4446 token: Token::Word(Word::Word("с".to_string())),
4447 },
4448 PositionalToken {
4449 source: uws,
4450 offset: 1190,
4451 length: 1,
4452 token: Token::Special(Special::Separator(Separator::Space)),
4453 },
4454 PositionalToken {
4455 source: uws,
4456 offset: 1191,
4457 length: 24,
4458 token: Token::Word(Word::Word("предложением".to_string())),
4459 },
4460 PositionalToken {
4461 source: uws,
4462 offset: 1215,
4463 length: 1,
4464 token: Token::Special(Special::Separator(Separator::Space)),
4465 },
4466 PositionalToken {
4467 source: uws,
4468 offset: 1216,
4469 length: 16,
4470 token: Token::Word(Word::Word("учредить".to_string())),
4471 },
4472 PositionalToken {
4473 source: uws,
4474 offset: 1232,
4475 length: 1,
4476 token: Token::Special(Special::Separator(Separator::Space)),
4477 },
4478 PositionalToken {
4479 source: uws,
4480 offset: 1233,
4481 length: 2,
4482 token: Token::Special(Special::Punctuation('«')),
4483 },
4484 PositionalToken {
4485 source: uws,
4486 offset: 1235,
4487 length: 8,
4488 token: Token::Word(Word::Word("День".to_string())),
4489 },
4490 PositionalToken {
4491 source: uws,
4492 offset: 1243,
4493 length: 1,
4494 token: Token::Special(Special::Separator(Separator::Space)),
4495 },
4496 PositionalToken {
4497 source: uws,
4498 offset: 1244,
4499 length: 8,
4500 token: Token::Word(Word::Word("мамы".to_string())),
4501 },
4502 PositionalToken {
4503 source: uws,
4504 offset: 1252,
4505 length: 2,
4506 token: Token::Special(Special::Punctuation('»')),
4507 },
4508 PositionalToken {
4509 source: uws,
4510 offset: 1254,
4511 length: 1,
4512 token: Token::Special(Special::Punctuation(',')),
4513 },
4514 PositionalToken {
4515 source: uws,
4516 offset: 1255,
4517 length: 1,
4518 token: Token::Special(Special::Separator(Separator::Space)),
4519 },
4520 PositionalToken {
4521 source: uws,
4522 offset: 1256,
4523 length: 2,
4524 token: Token::Word(Word::Word("а".to_string())),
4525 },
4526 PositionalToken {
4527 source: uws,
4528 offset: 1258,
4529 length: 1,
4530 token: Token::Special(Special::Separator(Separator::Space)),
4531 },
4532 PositionalToken {
4533 source: uws,
4534 offset: 1259,
4535 length: 6,
4536 token: Token::Word(Word::Word("сам".to_string())),
4537 },
4538 PositionalToken {
4539 source: uws,
4540 offset: 1265,
4541 length: 1,
4542 token: Token::Special(Special::Separator(Separator::Space)),
4543 },
4544 PositionalToken {
4545 source: uws,
4546 offset: 1266,
4547 length: 12,
4548 token: Token::Word(Word::Word("приказ".to_string())),
4549 },
4550 PositionalToken {
4551 source: uws,
4552 offset: 1278,
4553 length: 1,
4554 token: Token::Special(Special::Separator(Separator::Space)),
4555 },
4556 PositionalToken {
4557 source: uws,
4558 offset: 1279,
4559 length: 6,
4560 token: Token::Word(Word::Word("был".to_string())),
4561 },
4562 PositionalToken {
4563 source: uws,
4564 offset: 1285,
4565 length: 1,
4566 token: Token::Special(Special::Separator(Separator::Space)),
4567 },
4568 PositionalToken {
4569 source: uws,
4570 offset: 1286,
4571 length: 16,
4572 token: Token::Word(Word::Word("подписан".to_string())),
4573 },
4574 PositionalToken {
4575 source: uws,
4576 offset: 1302,
4577 length: 1,
4578 token: Token::Special(Special::Separator(Separator::Space)),
4579 },
4580 PositionalToken {
4581 source: uws,
4582 offset: 1303,
4583 length: 6,
4584 token: Token::Word(Word::Word("уже".to_string())),
4585 },
4586 PositionalToken {
4587 source: uws,
4588 offset: 1309,
4589 length: 1,
4590 token: Token::Special(Special::Separator(Separator::Space)),
4591 },
4592 PositionalToken {
4593 source: uws,
4594 offset: 1310,
4595 length: 2,
4596 token: Token::Word(Word::Number(Number::Integer(30))),
4597 },
4598 PositionalToken {
4599 source: uws,
4600 offset: 1312,
4601 length: 1,
4602 token: Token::Special(Special::Separator(Separator::Space)),
4603 },
4604 PositionalToken {
4605 source: uws,
4606 offset: 1313,
4607 length: 12,
4608 token: Token::Word(Word::Word("января".to_string())),
4609 },
4610 PositionalToken {
4611 source: uws,
4612 offset: 1325,
4613 length: 1,
4614 token: Token::Special(Special::Separator(Separator::Space)),
4615 },
4616 PositionalToken {
4617 source: uws,
4618 offset: 1326,
4619 length: 4,
4620 token: Token::Word(Word::Number(Number::Integer(1988))),
4621 },
4622 PositionalToken {
4623 source: uws,
4624 offset: 1330,
4625 length: 1,
4626 token: Token::Special(Special::Separator(Separator::Space)),
4627 },
4628 PositionalToken {
4629 source: uws,
4630 offset: 1331,
4631 length: 8,
4632 token: Token::Word(Word::Word("года".to_string())),
4633 },
4634 PositionalToken {
4635 source: uws,
4636 offset: 1339,
4637 length: 1,
4638 token: Token::Special(Special::Separator(Separator::Space)),
4639 },
4640 PositionalToken {
4641 source: uws,
4642 offset: 1340,
4643 length: 14,
4644 token: Token::Word(Word::Word("Борисом".to_string())),
4645 },
4646 PositionalToken {
4647 source: uws,
4648 offset: 1354,
4649 length: 1,
4650 token: Token::Special(Special::Separator(Separator::Space)),
4651 },
4652 PositionalToken {
4653 source: uws,
4654 offset: 1355,
4655 length: 16,
4656 token: Token::Word(Word::Word("Ельциным".to_string())),
4657 },
4658 PositionalToken {
4659 source: uws,
4660 offset: 1371,
4661 length: 1,
4662 token: Token::Special(Special::Punctuation('.')),
4663 },
4664 PositionalToken {
4665 source: uws,
4666 offset: 1372,
4667 length: 1,
4668 token: Token::Special(Special::Separator(Separator::Space)),
4669 },
4670 PositionalToken {
4671 source: uws,
4672 offset: 1373,
4673 length: 8,
4674 token: Token::Word(Word::Word("Было".to_string())),
4675 },
4676 PositionalToken {
4677 source: uws,
4678 offset: 1381,
4679 length: 1,
4680 token: Token::Special(Special::Separator(Separator::Space)),
4681 },
4682 PositionalToken {
4683 source: uws,
4684 offset: 1382,
4685 length: 12,
4686 token: Token::Word(Word::Word("решено".to_string())),
4687 },
4688 PositionalToken {
4689 source: uws,
4690 offset: 1394,
4691 length: 1,
4692 token: Token::Special(Special::Punctuation(',')),
4693 },
4694 PositionalToken {
4695 source: uws,
4696 offset: 1395,
4697 length: 1,
4698 token: Token::Special(Special::Separator(Separator::Space)),
4699 },
4700 PositionalToken {
4701 source: uws,
4702 offset: 1396,
4703 length: 6,
4704 token: Token::Word(Word::Word("что".to_string())),
4705 },
4706 PositionalToken {
4707 source: uws,
4708 offset: 1402,
4709 length: 1,
4710 token: Token::Special(Special::Separator(Separator::Space)),
4711 },
4712 PositionalToken {
4713 source: uws,
4714 offset: 1403,
4715 length: 16,
4716 token: Token::Word(Word::Word("ежегодно".to_string())),
4717 },
4718 PositionalToken {
4719 source: uws,
4720 offset: 1419,
4721 length: 1,
4722 token: Token::Special(Special::Separator(Separator::Space)),
4723 },
4724 PositionalToken {
4725 source: uws,
4726 offset: 1420,
4727 length: 2,
4728 token: Token::Word(Word::Word("в".to_string())),
4729 },
4730 PositionalToken {
4731 source: uws,
4732 offset: 1422,
4733 length: 1,
4734 token: Token::Special(Special::Separator(Separator::Space)),
4735 },
4736 PositionalToken {
4737 source: uws,
4738 offset: 1423,
4739 length: 12,
4740 token: Token::Word(Word::Word("России".to_string())),
4741 },
4742 PositionalToken {
4743 source: uws,
4744 offset: 1435,
4745 length: 1,
4746 token: Token::Special(Special::Separator(Separator::Space)),
4747 },
4748 PositionalToken {
4749 source: uws,
4750 offset: 1436,
4751 length: 22,
4752 token: Token::Word(Word::Word("празднество".to_string())),
4753 },
4754 PositionalToken {
4755 source: uws,
4756 offset: 1458,
4757 length: 1,
4758 token: Token::Special(Special::Separator(Separator::Space)),
4759 },
4760 PositionalToken {
4761 source: uws,
4762 offset: 1459,
4763 length: 6,
4764 token: Token::Word(Word::Word("дня".to_string())),
4765 },
4766 PositionalToken {
4767 source: uws,
4768 offset: 1465,
4769 length: 1,
4770 token: Token::Special(Special::Separator(Separator::Space)),
4771 },
4772 PositionalToken {
4773 source: uws,
4774 offset: 1466,
4775 length: 8,
4776 token: Token::Word(Word::Word("мамы".to_string())),
4777 },
4778 PositionalToken {
4779 source: uws,
4780 offset: 1474,
4781 length: 1,
4782 token: Token::Special(Special::Separator(Separator::Space)),
4783 },
4784 PositionalToken {
4785 source: uws,
4786 offset: 1475,
4787 length: 10,
4788 token: Token::Word(Word::Word("будет".to_string())),
4789 },
4790 PositionalToken {
4791 source: uws,
4792 offset: 1485,
4793 length: 1,
4794 token: Token::Special(Special::Separator(Separator::Space)),
4795 },
4796 PositionalToken {
4797 source: uws,
4798 offset: 1486,
4799 length: 16,
4800 token: Token::Word(Word::Word("выпадать".to_string())),
4801 },
4802 PositionalToken {
4803 source: uws,
4804 offset: 1502,
4805 length: 1,
4806 token: Token::Special(Special::Separator(Separator::Space)),
4807 },
4808 PositionalToken {
4809 source: uws,
4810 offset: 1503,
4811 length: 4,
4812 token: Token::Word(Word::Word("на".to_string())),
4813 },
4814 PositionalToken {
4815 source: uws,
4816 offset: 1507,
4817 length: 1,
4818 token: Token::Special(Special::Separator(Separator::Space)),
4819 },
4820 PositionalToken {
4821 source: uws,
4822 offset: 1508,
4823 length: 18,
4824 token: Token::Word(Word::Word("последнее".to_string())),
4825 },
4826 PositionalToken {
4827 source: uws,
4828 offset: 1526,
4829 length: 1,
4830 token: Token::Special(Special::Separator(Separator::Space)),
4831 },
4832 PositionalToken {
4833 source: uws,
4834 offset: 1527,
4835 length: 22,
4836 token: Token::Word(Word::Word("воскресенье".to_string())),
4837 },
4838 PositionalToken {
4839 source: uws,
4840 offset: 1549,
4841 length: 1,
4842 token: Token::Special(Special::Separator(Separator::Space)),
4843 },
4844 PositionalToken {
4845 source: uws,
4846 offset: 1550,
4847 length: 12,
4848 token: Token::Word(Word::Word("ноября".to_string())),
4849 },
4850 PositionalToken {
4851 source: uws,
4852 offset: 1562,
4853 length: 1,
4854 token: Token::Special(Special::Punctuation('.')),
4855 },
4856 PositionalToken {
4857 source: uws,
4858 offset: 1563,
4859 length: 1,
4860 token: Token::Special(Special::Separator(Separator::Space)),
4861 },
4862 PositionalToken {
4863 source: uws,
4864 offset: 1664,
4865 length: 1,
4866 token: Token::Special(Special::Separator(Separator::Newline)),
4867 },
4868 PositionalToken {
4869 source: uws,
4870 offset: 1665,
4871 length: 2,
4872 token: Token::Special(Special::Separator(Separator::Space)),
4873 },
4874 PositionalToken {
4875 source: uws,
4876 offset: 1725,
4877 length: 1,
4878 token: Token::Special(Special::Separator(Separator::Newline)),
4879 },
4880 PositionalToken {
4881 source: uws,
4882 offset: 1726,
4883 length: 4,
4884 token: Token::Special(Special::Separator(Separator::Space)),
4885 },
4886 PositionalToken {
4887 source: uws,
4888 offset: 2725,
4889 length: 1,
4890 token: Token::Special(Special::Separator(Separator::Newline)),
4891 },
4892 PositionalToken {
4893 source: uws,
4894 offset: 2726,
4895 length: 2,
4896 token: Token::Special(Special::Separator(Separator::Space)),
4897 },
4898 PositionalToken {
4899 source: uws,
4900 offset: 2888,
4901 length: 1,
4902 token: Token::Special(Special::Separator(Separator::Newline)),
4903 },
4904 PositionalToken {
4905 source: uws,
4906 offset: 2889,
4907 length: 2,
4908 token: Token::Special(Special::Separator(Separator::Space)),
4909 },
4910 PositionalToken {
4911 source: uws,
4912 offset: 2891,
4913 length: 1,
4914 token: Token::Special(Special::Separator(Separator::Newline)),
4915 },
4916 PositionalToken {
4917 source: uws,
4918 offset: 2904,
4919 length: 1,
4920 token: Token::Special(Special::Separator(Separator::Newline)),
4921 },
4922 PositionalToken {
4923 source: uws,
4924 offset: 2905,
4925 length: 4,
4926 token: Token::Special(Special::Separator(Separator::Space)),
4927 },
4928 ];
4929
4930 let text = Text::new({
4931 uws.into_source()
4932 .pipe(tagger::Builder::new().create().into_breaker())
4933 .pipe(entities::Builder::new().create().into_piped())
4934 .into_separator()
4935 })
4936 .unwrap();
4937
4938 let lib_res = text
4939 .into_tokenizer(TokenizerParams::v1())
4940 .filter_map(|tt| tt.into_original_token_1())
4941 .collect::<Vec<_>>();
4942
4943 check_results(&result, &lib_res, uws);
4944 }
4945
4946 #[test]
4997 fn numerical_no_split() {
4998 let uws = "12.02.18 31.28.34 23.11.2018 123.568.365.234.578 127.0.0.1 1st 1кг 123123афываыв 12321фвафыов234выалфо 12_123_343.4234_4234";
4999 let lib_res = uws.into_tokenizer(Default::default()).collect::<Vec<_>>();
5000 let result = vec![
5002 PositionalToken {
5003 source: uws,
5004 offset: 0,
5005 length: 8,
5006 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
5007 "12.02.18".to_string(),
5008 ))),
5009 },
5010 PositionalToken {
5011 source: uws,
5012 offset: 8,
5013 length: 1,
5014 token: Token::Special(Special::Separator(Separator::Space)),
5015 },
5016 PositionalToken {
5017 source: uws,
5018 offset: 9,
5019 length: 8,
5020 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
5021 "31.28.34".to_string(),
5022 ))),
5023 },
5024 PositionalToken {
5025 source: uws,
5026 offset: 17,
5027 length: 1,
5028 token: Token::Special(Special::Separator(Separator::Space)),
5029 },
5030 PositionalToken {
5031 source: uws,
5032 offset: 18,
5033 length: 10,
5034 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
5035 "23.11.2018".to_string(),
5036 ))),
5037 },
5038 PositionalToken {
5039 source: uws,
5040 offset: 28,
5041 length: 1,
5042 token: Token::Special(Special::Separator(Separator::Space)),
5043 },
5044 PositionalToken {
5045 source: uws,
5046 offset: 29,
5047 length: 19,
5048 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
5049 "123.568.365.234.578".to_string(),
5050 ))),
5051 },
5052 PositionalToken {
5053 source: uws,
5054 offset: 48,
5055 length: 1,
5056 token: Token::Special(Special::Separator(Separator::Space)),
5057 },
5058 PositionalToken {
5059 source: uws,
5060 offset: 49,
5061 length: 9,
5062 token: Token::Word(Word::Numerical(Numerical::DotSeparated(
5063 "127.0.0.1".to_string(),
5064 ))),
5065 },
5066 PositionalToken {
5067 source: uws,
5068 offset: 58,
5069 length: 1,
5070 token: Token::Special(Special::Separator(Separator::Space)),
5071 },
5072 PositionalToken {
5073 source: uws,
5074 offset: 59,
5075 length: 3,
5076 token: Token::Word(Word::Numerical(Numerical::Measures("1st".to_string()))),
5077 },
5078 PositionalToken {
5079 source: uws,
5080 offset: 62,
5081 length: 1,
5082 token: Token::Special(Special::Separator(Separator::Space)),
5083 },
5084 PositionalToken {
5085 source: uws,
5086 offset: 63,
5087 length: 5,
5088 token: Token::Word(Word::Numerical(Numerical::Measures("1кг".to_string()))),
5089 },
5090 PositionalToken {
5091 source: uws,
5092 offset: 68,
5093 length: 1,
5094 token: Token::Special(Special::Separator(Separator::Space)),
5095 },
5096 PositionalToken {
5097 source: uws,
5098 offset: 69,
5099 length: 20,
5100 token: Token::Word(Word::Numerical(Numerical::Measures(
5101 "123123афываыв".to_string(),
5102 ))),
5103 },
5104 PositionalToken {
5105 source: uws,
5106 offset: 89,
5107 length: 1,
5108 token: Token::Special(Special::Separator(Separator::Space)),
5109 },
5110 PositionalToken {
5111 source: uws,
5112 offset: 90,
5113 length: 34,
5114 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
5115 "12321фвафыов234выалфо".to_string(),
5116 ))),
5117 },
5118 PositionalToken {
5119 source: uws,
5120 offset: 124,
5121 length: 1,
5122 token: Token::Special(Special::Separator(Separator::Space)),
5123 },
5124 PositionalToken {
5125 source: uws,
5126 offset: 125,
5127 length: 20,
5128 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
5129 "12_123_343.4234_4234".to_string(),
5130 ))),
5131 },
5132 ];
5133 check_results(&result, &lib_res, uws);
5134 }
5135
5136 #[test]
5137 fn numerical_default() {
5138 let uws = "12.02.18 31.28.34 23.11.2018 123.568.365.234.578 127.0.0.1 1st 1кг 123123афываыв 12321фвафыов234выалфо 12_123_343.4234_4234";
5139 let lib_res = uws
5140 .into_tokenizer(TokenizerParams::v1())
5141 .collect::<Vec<_>>();
5142 let result = vec![
5144 PositionalToken {
5145 source: uws,
5146 offset: 0,
5147 length: 2,
5148 token: Token::Word(Word::Number(Number::Integer(12))),
5149 },
5150 PositionalToken {
5151 source: uws,
5152 offset: 2,
5153 length: 1,
5154 token: Token::Special(Special::Punctuation('.')),
5155 },
5156 PositionalToken {
5157 source: uws,
5158 offset: 3,
5159 length: 2,
5160 token: Token::Word(Word::Number(Number::ZeroInteger {
5161 i: 2,
5162 s: "02".to_string(),
5163 })),
5164 },
5165 PositionalToken {
5166 source: uws,
5167 offset: 5,
5168 length: 1,
5169 token: Token::Special(Special::Punctuation('.')),
5170 },
5171 PositionalToken {
5172 source: uws,
5173 offset: 6,
5174 length: 2,
5175 token: Token::Word(Word::Number(Number::Integer(18))),
5176 },
5177 PositionalToken {
5178 source: uws,
5179 offset: 8,
5180 length: 1,
5181 token: Token::Special(Special::Separator(Separator::Space)),
5182 },
5183 PositionalToken {
5184 source: uws,
5185 offset: 9,
5186 length: 2,
5187 token: Token::Word(Word::Number(Number::Integer(31))),
5188 },
5189 PositionalToken {
5190 source: uws,
5191 offset: 11,
5192 length: 1,
5193 token: Token::Special(Special::Punctuation('.')),
5194 },
5195 PositionalToken {
5196 source: uws,
5197 offset: 12,
5198 length: 2,
5199 token: Token::Word(Word::Number(Number::Integer(28))),
5200 },
5201 PositionalToken {
5202 source: uws,
5203 offset: 14,
5204 length: 1,
5205 token: Token::Special(Special::Punctuation('.')),
5206 },
5207 PositionalToken {
5208 source: uws,
5209 offset: 15,
5210 length: 2,
5211 token: Token::Word(Word::Number(Number::Integer(34))),
5212 },
5213 PositionalToken {
5214 source: uws,
5215 offset: 17,
5216 length: 1,
5217 token: Token::Special(Special::Separator(Separator::Space)),
5218 },
5219 PositionalToken {
5220 source: uws,
5221 offset: 18,
5222 length: 2,
5223 token: Token::Word(Word::Number(Number::Integer(23))),
5224 },
5225 PositionalToken {
5226 source: uws,
5227 offset: 20,
5228 length: 1,
5229 token: Token::Special(Special::Punctuation('.')),
5230 },
5231 PositionalToken {
5232 source: uws,
5233 offset: 21,
5234 length: 2,
5235 token: Token::Word(Word::Number(Number::Integer(11))),
5236 },
5237 PositionalToken {
5238 source: uws,
5239 offset: 23,
5240 length: 1,
5241 token: Token::Special(Special::Punctuation('.')),
5242 },
5243 PositionalToken {
5244 source: uws,
5245 offset: 24,
5246 length: 4,
5247 token: Token::Word(Word::Number(Number::Integer(2018))),
5248 },
5249 PositionalToken {
5250 source: uws,
5251 offset: 28,
5252 length: 1,
5253 token: Token::Special(Special::Separator(Separator::Space)),
5254 },
5255 PositionalToken {
5256 source: uws,
5257 offset: 29,
5258 length: 3,
5259 token: Token::Word(Word::Number(Number::Integer(123))),
5260 },
5261 PositionalToken {
5262 source: uws,
5263 offset: 32,
5264 length: 1,
5265 token: Token::Special(Special::Punctuation('.')),
5266 },
5267 PositionalToken {
5268 source: uws,
5269 offset: 33,
5270 length: 3,
5271 token: Token::Word(Word::Number(Number::Integer(568))),
5272 },
5273 PositionalToken {
5274 source: uws,
5275 offset: 36,
5276 length: 1,
5277 token: Token::Special(Special::Punctuation('.')),
5278 },
5279 PositionalToken {
5280 source: uws,
5281 offset: 37,
5282 length: 3,
5283 token: Token::Word(Word::Number(Number::Integer(365))),
5284 },
5285 PositionalToken {
5286 source: uws,
5287 offset: 40,
5288 length: 1,
5289 token: Token::Special(Special::Punctuation('.')),
5290 },
5291 PositionalToken {
5292 source: uws,
5293 offset: 41,
5294 length: 3,
5295 token: Token::Word(Word::Number(Number::Integer(234))),
5296 },
5297 PositionalToken {
5298 source: uws,
5299 offset: 44,
5300 length: 1,
5301 token: Token::Special(Special::Punctuation('.')),
5302 },
5303 PositionalToken {
5304 source: uws,
5305 offset: 45,
5306 length: 3,
5307 token: Token::Word(Word::Number(Number::Integer(578))),
5308 },
5309 PositionalToken {
5310 source: uws,
5311 offset: 48,
5312 length: 1,
5313 token: Token::Special(Special::Separator(Separator::Space)),
5314 },
5315 PositionalToken {
5316 source: uws,
5317 offset: 49,
5318 length: 3,
5319 token: Token::Word(Word::Number(Number::Integer(127))),
5320 },
5321 PositionalToken {
5322 source: uws,
5323 offset: 52,
5324 length: 1,
5325 token: Token::Special(Special::Punctuation('.')),
5326 },
5327 PositionalToken {
5328 source: uws,
5329 offset: 53,
5330 length: 1,
5331 token: Token::Word(Word::Number(Number::ZeroInteger {
5332 i: 0,
5333 s: "0".to_string(),
5334 })),
5335 },
5336 PositionalToken {
5337 source: uws,
5338 offset: 54,
5339 length: 1,
5340 token: Token::Special(Special::Punctuation('.')),
5341 },
5342 PositionalToken {
5343 source: uws,
5344 offset: 55,
5345 length: 1,
5346 token: Token::Word(Word::Number(Number::ZeroInteger {
5347 i: 0,
5348 s: "0".to_string(),
5349 })),
5350 },
5351 PositionalToken {
5352 source: uws,
5353 offset: 56,
5354 length: 1,
5355 token: Token::Special(Special::Punctuation('.')),
5356 },
5357 PositionalToken {
5358 source: uws,
5359 offset: 57,
5360 length: 1,
5361 token: Token::Word(Word::Number(Number::Integer(1))),
5362 },
5363 PositionalToken {
5364 source: uws,
5365 offset: 58,
5366 length: 1,
5367 token: Token::Special(Special::Separator(Separator::Space)),
5368 },
5369 PositionalToken {
5370 source: uws,
5371 offset: 59,
5372 length: 3,
5373 token: Token::Word(Word::Numerical(Numerical::Measures("1st".to_string()))),
5374 },
5375 PositionalToken {
5376 source: uws,
5377 offset: 62,
5378 length: 1,
5379 token: Token::Special(Special::Separator(Separator::Space)),
5380 },
5381 PositionalToken {
5382 source: uws,
5383 offset: 63,
5384 length: 5,
5385 token: Token::Word(Word::Numerical(Numerical::Measures("1кг".to_string()))),
5386 },
5387 PositionalToken {
5388 source: uws,
5389 offset: 68,
5390 length: 1,
5391 token: Token::Special(Special::Separator(Separator::Space)),
5392 },
5393 PositionalToken {
5394 source: uws,
5395 offset: 69,
5396 length: 20,
5397 token: Token::Word(Word::Numerical(Numerical::Measures(
5398 "123123афываыв".to_string(),
5399 ))),
5400 },
5401 PositionalToken {
5402 source: uws,
5403 offset: 89,
5404 length: 1,
5405 token: Token::Special(Special::Separator(Separator::Space)),
5406 },
5407 PositionalToken {
5408 source: uws,
5409 offset: 90,
5410 length: 34,
5411 token: Token::Word(Word::Numerical(Numerical::Alphanumeric(
5412 "12321фвафыов234выалфо".to_string(),
5413 ))),
5414 },
5415 PositionalToken {
5416 source: uws,
5417 offset: 124,
5418 length: 1,
5419 token: Token::Special(Special::Separator(Separator::Space)),
5420 },
5421 PositionalToken {
5422 source: uws,
5423 offset: 125,
5424 length: 2,
5425 token: Token::Word(Word::Number(Number::Integer(12))),
5426 },
5427 PositionalToken {
5428 source: uws,
5429 offset: 127,
5430 length: 1,
5431 token: Token::Special(Special::Punctuation('_')),
5432 },
5433 PositionalToken {
5434 source: uws,
5435 offset: 128,
5436 length: 3,
5437 token: Token::Word(Word::Number(Number::Integer(123))),
5438 },
5439 PositionalToken {
5440 source: uws,
5441 offset: 131,
5442 length: 1,
5443 token: Token::Special(Special::Punctuation('_')),
5444 },
5445 PositionalToken {
5446 source: uws,
5447 offset: 132,
5448 length: 3,
5449 token: Token::Word(Word::Number(Number::Integer(343))),
5450 },
5451 PositionalToken {
5452 source: uws,
5453 offset: 135,
5454 length: 1,
5455 token: Token::Special(Special::Punctuation('.')),
5456 },
5457 PositionalToken {
5458 source: uws,
5459 offset: 136,
5460 length: 4,
5461 token: Token::Word(Word::Number(Number::Integer(4234))),
5462 },
5463 PositionalToken {
5464 source: uws,
5465 offset: 140,
5466 length: 1,
5467 token: Token::Special(Special::Punctuation('_')),
5468 },
5469 PositionalToken {
5470 source: uws,
5471 offset: 141,
5472 length: 4,
5473 token: Token::Word(Word::Number(Number::Integer(4234))),
5474 },
5475 ];
5476 check_results(&result, &lib_res, uws);
5477 }
5478
5479 enum Lang {
5492 Zho,
5493 Jpn,
5494 Kor,
5495 Ara,
5496 Ell,
5497 }
5498
5499 #[test]
5500 fn test_lang_zho() {
5501 let (uws, result) = get_lang_test(Lang::Zho);
5502 let lib_res = uws
5503 .into_tokenizer(TokenizerParams::v1())
5504 .collect::<Vec<_>>();
5505 check_results(&result, &lib_res, &uws);
5506 }
5507
5508 #[test]
5509 fn test_lang_jpn() {
5510 let (uws, result) = get_lang_test(Lang::Jpn);
5511 let lib_res = uws
5512 .into_tokenizer(TokenizerParams::v1())
5513 .collect::<Vec<_>>();
5514 check_results(&result, &lib_res, &uws);
5515 }
5516
5517 #[test]
5518 fn test_lang_kor() {
5519 let (uws, result) = get_lang_test(Lang::Kor);
5520 let lib_res = uws
5521 .into_tokenizer(TokenizerParams::v1())
5522 .collect::<Vec<_>>();
5523 check_results(&result, &lib_res, &uws);
5524 }
5525
5526 #[test]
5527 fn test_lang_ara() {
5528 let (uws, result) = get_lang_test(Lang::Ara);
5529 let lib_res = uws
5530 .into_tokenizer(TokenizerParams::v1())
5531 .collect::<Vec<_>>();
5532 check_results(&result, &lib_res, &uws);
5533 }
5534
5535 #[test]
5536 fn test_lang_ell() {
5537 let (uws, result) = get_lang_test(Lang::Ell);
5538 let lib_res = uws
5539 .into_tokenizer(TokenizerParams::v1())
5540 .collect::<Vec<_>>();
5541 check_results(&result, &lib_res, &uws);
5542 }
5543
5544 fn get_lang_test(lng: Lang) -> (String, Vec<PositionalToken>) {
5545 let uws = match lng {
5546 Lang::Zho => {
5547 "美国电视连续剧《超人前传》的第一集《试播集》于2001年10月16日在電視網首播,剧集主创人阿尔弗雷德·高夫和迈尔斯·米勒編劇,大卫·努特尔执导。这一试播首次向观众引荐了克拉克·肯特一角,他是位拥有超能力的外星孤儿,与家人和朋友一起在堪薩斯州虚构小镇斯莫维尔生活。在这一集里,肯特首度得知自己的来历,同时还需要阻止一位学生试图杀死镇上高中多名学生的报复之举。本集节目里引入了多个之后将贯穿全季甚至整部剧集的主题元素,例如几位主要角色之间的三角恋情。电视剧在加拿大溫哥華取景,旨在选用其“美国中产阶级”景观,主创人花了5个月的时间专门用于为主角物色合适的演员。试播集在所有演员选好4天后正式开拍。由于时间上的限制,剧组无法搭建好实体外景,因此只能使用计算机绘图技术将数字化的外景插入到镜头中。节目一经上映就打破了电视网的多项收视纪录,并且获得了评论员的普遍好评和多个奖项提名,并在其中两项上胜出"
5548 }
5549 Lang::Kor => {
5550 "플레이스테이션 은 소니 컴퓨터 엔터테인먼트가 개발한 세 번째 가정용 게임기이다. 마이크로소프트의 엑스박스 360, 닌텐도의 Wii와 경쟁하고 있다. 이전 제품에서 온라인 플레이 기능을 비디오 게임 개발사에 전적으로 의존하던 것과 달리 통합 온라인 게임 서비스인 플레이스테이션 네트워크 서비스를 발매와 함께 시작해 제공하고 있으며, 탄탄한 멀티미디어 재생 기능, 플레이스테이션 포터블과의 연결, 고화질 광학 디스크 포맷인 블루레이 디스크 재생 기능 등의 기능을 갖추고 있다. 2006년 11월 11일에 일본에서 처음으로 출시했으며, 11월 17일에는 북미 지역, 2007년 3월 23일에는 유럽과 오세아니아 지역에서, 대한민국의 경우 6월 5일부터 일주일간 예약판매를 실시해, 매일 준비한 수량이 동이 나는 등 많은 관심을 받았으며 6월 16일에 정식 출시 행사를 열었다"
5551 }
5552 Lang::Jpn => {
5553 "熊野三山本願所は、15世紀末以降における熊野三山(熊野本宮、熊野新宮、熊野那智)の造営・修造のための勧進を担った組織の総称。 熊野三山を含めて、日本における古代から中世前半にかけての寺社の造営は、寺社領経営のような恒常的財源、幕府や朝廷などからの一時的な造営料所の寄進、あるいは公権力からの臨時の保護によって行われていた。しかしながら、熊野三山では、これらの財源はすべて15世紀半ばまでに実効性を失った"
5554 }
5555 Lang::Ara => {
5556 "لشکرکشیهای روسهای وارنگی به دریای خزر مجموعهای از حملات نظامی در بین سالهای ۸۶۴ تا ۱۰۴۱ میلادی به سواحل دریای خزر بودهاست. روسهای وارنگی ابتدا در قرن نهم میلادی به عنوان بازرگانان پوست، عسل و برده در سرزمینهای اسلامی(سرکلند) ظاهر شدند. این بازرگانان در مسیر تجاری ولگا به خرید و فروش میپرداختند. نخستین حملهٔ آنان در فاصله سالهای ۸۶۴ تا ۸۸۴ میلادی در مقیاسی کوچک علیه علویان طبرستان رخ داد. نخستین یورش بزرگ روسها در سال ۹۱۳ رخ داد و آنان با ۵۰۰ فروند درازکشتی شهر گرگان و اطراف آن را غارت کردند. آنها در این حمله مقداری کالا و برده را به تاراج بردند و در راه بازگشتن به سمت شمال، در دلتای ولگا، مورد حملهٔ خزرهای مسلمان قرار گرفتند و بعضی از آنان موفق به فرار شدند، ولی در میانهٔ ولگا به قتل رسیدند. دومین هجوم بزرگ روسها به دریای خزر در سال ۹۴۳ به وقوع پیوست. در این دوره ایگور یکم، حاکم روس کیف، رهبری روسها را در دست داشت. روسها پس از توافق با دولت خزرها برای عبور امن از منطقه، تا رود کورا و اعماق قفقاز پیش رفتند و در سال ۹۴۳ موفق شدند بندر بردعه، پایتخت اران (جمهوری آذربایجان کنونی)، را تصرف کنند. روسها در آنجا به مدت چند ماه ماندند و بسیاری از ساکنان شهر را کشتند و از راه غارتگری اموالی را به تاراج بردند. تنها دلیل بازگشت آنان "
5557 }
5558 Lang::Ell => {
5559 "Το Πρόγραμμα υλοποιείται εξ ολοκλήρου από απόσταση και μπορεί να συμμετέχει κάθε εμπλεκόμενος στη ή/και ενδιαφερόμενος για τη διδασκαλία της Ελληνικής ως δεύτερης/ξένης γλώσσας στην Ελλάδα και στο εξωτερικό, αρκεί να είναι απόφοιτος ελληνικής φιλολογίας, ξένων φιλολογιών, παιδαγωγικών τμημάτων, θεολογικών σχολών ή άλλων πανεπιστημιακών τμημάτων ελληνικών ή ισότιμων ξένων πανεπιστημίων. Υπό όρους γίνονται δεκτοί υποψήφιοι που δεν έχουν ολοκληρώσει σπουδές τριτοβάθμιας εκπαίδευσης."
5560 }
5561 };
5562 let tokens = match lng {
5563 Lang::Zho => vec![
5564 PositionalToken {
5565 source: uws,
5566 offset: 0,
5567 length: 3,
5568 token: Token::Word(Word::Word("美".to_string())),
5569 },
5570 PositionalToken {
5571 source: uws,
5572 offset: 3,
5573 length: 3,
5574 token: Token::Word(Word::Word("国".to_string())),
5575 },
5576 PositionalToken {
5577 source: uws,
5578 offset: 6,
5579 length: 3,
5580 token: Token::Word(Word::Word("电".to_string())),
5581 },
5582 PositionalToken {
5583 source: uws,
5584 offset: 9,
5585 length: 3,
5586 token: Token::Word(Word::Word("视".to_string())),
5587 },
5588 PositionalToken {
5589 source: uws,
5590 offset: 12,
5591 length: 3,
5592 token: Token::Word(Word::Word("连".to_string())),
5593 },
5594 PositionalToken {
5595 source: uws,
5596 offset: 15,
5597 length: 3,
5598 token: Token::Word(Word::Word("续".to_string())),
5599 },
5600 PositionalToken {
5601 source: uws,
5602 offset: 18,
5603 length: 3,
5604 token: Token::Word(Word::Word("剧".to_string())),
5605 },
5606 PositionalToken {
5607 source: uws,
5608 offset: 21,
5609 length: 3,
5610 token: Token::Special(Special::Punctuation('《')),
5611 },
5612 PositionalToken {
5613 source: uws,
5614 offset: 24,
5615 length: 3,
5616 token: Token::Word(Word::Word("超".to_string())),
5617 },
5618 PositionalToken {
5619 source: uws,
5620 offset: 27,
5621 length: 3,
5622 token: Token::Word(Word::Word("人".to_string())),
5623 },
5624 PositionalToken {
5625 source: uws,
5626 offset: 30,
5627 length: 3,
5628 token: Token::Word(Word::Word("前".to_string())),
5629 },
5630 PositionalToken {
5631 source: uws,
5632 offset: 33,
5633 length: 3,
5634 token: Token::Word(Word::Word("传".to_string())),
5635 },
5636 PositionalToken {
5637 source: uws,
5638 offset: 36,
5639 length: 3,
5640 token: Token::Special(Special::Punctuation('》')),
5641 },
5642 PositionalToken {
5643 source: uws,
5644 offset: 39,
5645 length: 3,
5646 token: Token::Word(Word::Word("的".to_string())),
5647 },
5648 PositionalToken {
5649 source: uws,
5650 offset: 42,
5651 length: 3,
5652 token: Token::Word(Word::Word("第".to_string())),
5653 },
5654 PositionalToken {
5655 source: uws,
5656 offset: 45,
5657 length: 3,
5658 token: Token::Word(Word::Word("一".to_string())),
5659 },
5660 PositionalToken {
5661 source: uws,
5662 offset: 48,
5663 length: 3,
5664 token: Token::Word(Word::Word("集".to_string())),
5665 },
5666 PositionalToken {
5667 source: uws,
5668 offset: 51,
5669 length: 3,
5670 token: Token::Special(Special::Punctuation('《')),
5671 },
5672 PositionalToken {
5673 source: uws,
5674 offset: 54,
5675 length: 3,
5676 token: Token::Word(Word::Word("试".to_string())),
5677 },
5678 PositionalToken {
5679 source: uws,
5680 offset: 57,
5681 length: 3,
5682 token: Token::Word(Word::Word("播".to_string())),
5683 },
5684 PositionalToken {
5685 source: uws,
5686 offset: 60,
5687 length: 3,
5688 token: Token::Word(Word::Word("集".to_string())),
5689 },
5690 PositionalToken {
5691 source: uws,
5692 offset: 63,
5693 length: 3,
5694 token: Token::Special(Special::Punctuation('》')),
5695 },
5696 PositionalToken {
5697 source: uws,
5698 offset: 66,
5699 length: 3,
5700 token: Token::Word(Word::Word("于".to_string())),
5701 },
5702 PositionalToken {
5703 source: uws,
5704 offset: 69,
5705 length: 4,
5706 token: Token::Word(Word::Number(Number::Integer(2001))),
5707 },
5708 PositionalToken {
5709 source: uws,
5710 offset: 73,
5711 length: 3,
5712 token: Token::Word(Word::Word("年".to_string())),
5713 },
5714 PositionalToken {
5715 source: uws,
5716 offset: 76,
5717 length: 2,
5718 token: Token::Word(Word::Number(Number::Integer(10))),
5719 },
5720 PositionalToken {
5721 source: uws,
5722 offset: 78,
5723 length: 3,
5724 token: Token::Word(Word::Word("月".to_string())),
5725 },
5726 PositionalToken {
5727 source: uws,
5728 offset: 81,
5729 length: 2,
5730 token: Token::Word(Word::Number(Number::Integer(16))),
5731 },
5732 PositionalToken {
5733 source: uws,
5734 offset: 83,
5735 length: 3,
5736 token: Token::Word(Word::Word("日".to_string())),
5737 },
5738 PositionalToken {
5739 source: uws,
5740 offset: 86,
5741 length: 3,
5742 token: Token::Word(Word::Word("在".to_string())),
5743 },
5744 PositionalToken {
5745 source: uws,
5746 offset: 89,
5747 length: 3,
5748 token: Token::Word(Word::Word("電".to_string())),
5749 },
5750 PositionalToken {
5751 source: uws,
5752 offset: 92,
5753 length: 3,
5754 token: Token::Word(Word::Word("視".to_string())),
5755 },
5756 PositionalToken {
5757 source: uws,
5758 offset: 95,
5759 length: 3,
5760 token: Token::Word(Word::Word("網".to_string())),
5761 },
5762 PositionalToken {
5763 source: uws,
5764 offset: 98,
5765 length: 3,
5766 token: Token::Word(Word::Word("首".to_string())),
5767 },
5768 PositionalToken {
5769 source: uws,
5770 offset: 101,
5771 length: 3,
5772 token: Token::Word(Word::Word("播".to_string())),
5773 },
5774 PositionalToken {
5775 source: uws,
5776 offset: 104,
5777 length: 3,
5778 token: Token::Special(Special::Punctuation(',')),
5779 },
5780 PositionalToken {
5781 source: uws,
5782 offset: 107,
5783 length: 3,
5784 token: Token::Word(Word::Word("剧".to_string())),
5785 },
5786 PositionalToken {
5787 source: uws,
5788 offset: 110,
5789 length: 3,
5790 token: Token::Word(Word::Word("集".to_string())),
5791 },
5792 PositionalToken {
5793 source: uws,
5794 offset: 113,
5795 length: 3,
5796 token: Token::Word(Word::Word("主".to_string())),
5797 },
5798 PositionalToken {
5799 source: uws,
5800 offset: 116,
5801 length: 3,
5802 token: Token::Word(Word::Word("创".to_string())),
5803 },
5804 PositionalToken {
5805 source: uws,
5806 offset: 119,
5807 length: 3,
5808 token: Token::Word(Word::Word("人".to_string())),
5809 },
5810 PositionalToken {
5811 source: uws,
5812 offset: 122,
5813 length: 3,
5814 token: Token::Word(Word::Word("阿".to_string())),
5815 },
5816 PositionalToken {
5817 source: uws,
5818 offset: 125,
5819 length: 3,
5820 token: Token::Word(Word::Word("尔".to_string())),
5821 },
5822 PositionalToken {
5823 source: uws,
5824 offset: 128,
5825 length: 3,
5826 token: Token::Word(Word::Word("弗".to_string())),
5827 },
5828 PositionalToken {
5829 source: uws,
5830 offset: 131,
5831 length: 3,
5832 token: Token::Word(Word::Word("雷".to_string())),
5833 },
5834 PositionalToken {
5835 source: uws,
5836 offset: 134,
5837 length: 3,
5838 token: Token::Word(Word::Word("德".to_string())),
5839 },
5840 PositionalToken {
5841 source: uws,
5842 offset: 137,
5843 length: 2,
5844 token: Token::Special(Special::Punctuation('·')),
5845 },
5846 PositionalToken {
5847 source: uws,
5848 offset: 139,
5849 length: 3,
5850 token: Token::Word(Word::Word("高".to_string())),
5851 },
5852 PositionalToken {
5853 source: uws,
5854 offset: 142,
5855 length: 3,
5856 token: Token::Word(Word::Word("夫".to_string())),
5857 },
5858 PositionalToken {
5859 source: uws,
5860 offset: 145,
5861 length: 3,
5862 token: Token::Word(Word::Word("和".to_string())),
5863 },
5864 PositionalToken {
5865 source: uws,
5866 offset: 148,
5867 length: 3,
5868 token: Token::Word(Word::Word("迈".to_string())),
5869 },
5870 PositionalToken {
5871 source: uws,
5872 offset: 151,
5873 length: 3,
5874 token: Token::Word(Word::Word("尔".to_string())),
5875 },
5876 PositionalToken {
5877 source: uws,
5878 offset: 154,
5879 length: 3,
5880 token: Token::Word(Word::Word("斯".to_string())),
5881 },
5882 PositionalToken {
5883 source: uws,
5884 offset: 157,
5885 length: 2,
5886 token: Token::Special(Special::Punctuation('·')),
5887 },
5888 PositionalToken {
5889 source: uws,
5890 offset: 159,
5891 length: 3,
5892 token: Token::Word(Word::Word("米".to_string())),
5893 },
5894 PositionalToken {
5895 source: uws,
5896 offset: 162,
5897 length: 3,
5898 token: Token::Word(Word::Word("勒".to_string())),
5899 },
5900 PositionalToken {
5901 source: uws,
5902 offset: 165,
5903 length: 3,
5904 token: Token::Word(Word::Word("編".to_string())),
5905 },
5906 PositionalToken {
5907 source: uws,
5908 offset: 168,
5909 length: 3,
5910 token: Token::Word(Word::Word("劇".to_string())),
5911 },
5912 PositionalToken {
5913 source: uws,
5914 offset: 171,
5915 length: 3,
5916 token: Token::Special(Special::Punctuation(',')),
5917 },
5918 PositionalToken {
5919 source: uws,
5920 offset: 174,
5921 length: 3,
5922 token: Token::Word(Word::Word("大".to_string())),
5923 },
5924 PositionalToken {
5925 source: uws,
5926 offset: 177,
5927 length: 3,
5928 token: Token::Word(Word::Word("卫".to_string())),
5929 },
5930 PositionalToken {
5931 source: uws,
5932 offset: 180,
5933 length: 2,
5934 token: Token::Special(Special::Punctuation('·')),
5935 },
5936 PositionalToken {
5937 source: uws,
5938 offset: 182,
5939 length: 3,
5940 token: Token::Word(Word::Word("努".to_string())),
5941 },
5942 PositionalToken {
5943 source: uws,
5944 offset: 185,
5945 length: 3,
5946 token: Token::Word(Word::Word("特".to_string())),
5947 },
5948 PositionalToken {
5949 source: uws,
5950 offset: 188,
5951 length: 3,
5952 token: Token::Word(Word::Word("尔".to_string())),
5953 },
5954 PositionalToken {
5955 source: uws,
5956 offset: 191,
5957 length: 3,
5958 token: Token::Word(Word::Word("执".to_string())),
5959 },
5960 PositionalToken {
5961 source: uws,
5962 offset: 194,
5963 length: 3,
5964 token: Token::Word(Word::Word("导".to_string())),
5965 },
5966 PositionalToken {
5967 source: uws,
5968 offset: 197,
5969 length: 3,
5970 token: Token::Special(Special::Punctuation('。')),
5971 },
5972 PositionalToken {
5973 source: uws,
5974 offset: 200,
5975 length: 3,
5976 token: Token::Word(Word::Word("这".to_string())),
5977 },
5978 PositionalToken {
5979 source: uws,
5980 offset: 203,
5981 length: 3,
5982 token: Token::Word(Word::Word("一".to_string())),
5983 },
5984 PositionalToken {
5985 source: uws,
5986 offset: 206,
5987 length: 3,
5988 token: Token::Word(Word::Word("试".to_string())),
5989 },
5990 PositionalToken {
5991 source: uws,
5992 offset: 209,
5993 length: 3,
5994 token: Token::Word(Word::Word("播".to_string())),
5995 },
5996 PositionalToken {
5997 source: uws,
5998 offset: 212,
5999 length: 3,
6000 token: Token::Word(Word::Word("首".to_string())),
6001 },
6002 PositionalToken {
6003 source: uws,
6004 offset: 215,
6005 length: 3,
6006 token: Token::Word(Word::Word("次".to_string())),
6007 },
6008 PositionalToken {
6009 source: uws,
6010 offset: 218,
6011 length: 3,
6012 token: Token::Word(Word::Word("向".to_string())),
6013 },
6014 PositionalToken {
6015 source: uws,
6016 offset: 221,
6017 length: 3,
6018 token: Token::Word(Word::Word("观".to_string())),
6019 },
6020 PositionalToken {
6021 source: uws,
6022 offset: 224,
6023 length: 3,
6024 token: Token::Word(Word::Word("众".to_string())),
6025 },
6026 PositionalToken {
6027 source: uws,
6028 offset: 227,
6029 length: 3,
6030 token: Token::Word(Word::Word("引".to_string())),
6031 },
6032 PositionalToken {
6033 source: uws,
6034 offset: 230,
6035 length: 3,
6036 token: Token::Word(Word::Word("荐".to_string())),
6037 },
6038 PositionalToken {
6039 source: uws,
6040 offset: 233,
6041 length: 3,
6042 token: Token::Word(Word::Word("了".to_string())),
6043 },
6044 PositionalToken {
6045 source: uws,
6046 offset: 236,
6047 length: 3,
6048 token: Token::Word(Word::Word("克".to_string())),
6049 },
6050 PositionalToken {
6051 source: uws,
6052 offset: 239,
6053 length: 3,
6054 token: Token::Word(Word::Word("拉".to_string())),
6055 },
6056 PositionalToken {
6057 source: uws,
6058 offset: 242,
6059 length: 3,
6060 token: Token::Word(Word::Word("克".to_string())),
6061 },
6062 PositionalToken {
6063 source: uws,
6064 offset: 245,
6065 length: 2,
6066 token: Token::Special(Special::Punctuation('·')),
6067 },
6068 PositionalToken {
6069 source: uws,
6070 offset: 247,
6071 length: 3,
6072 token: Token::Word(Word::Word("肯".to_string())),
6073 },
6074 PositionalToken {
6075 source: uws,
6076 offset: 250,
6077 length: 3,
6078 token: Token::Word(Word::Word("特".to_string())),
6079 },
6080 PositionalToken {
6081 source: uws,
6082 offset: 253,
6083 length: 3,
6084 token: Token::Word(Word::Word("一".to_string())),
6085 },
6086 PositionalToken {
6087 source: uws,
6088 offset: 256,
6089 length: 3,
6090 token: Token::Word(Word::Word("角".to_string())),
6091 },
6092 PositionalToken {
6093 source: uws,
6094 offset: 259,
6095 length: 3,
6096 token: Token::Special(Special::Punctuation(',')),
6097 },
6098 PositionalToken {
6099 source: uws,
6100 offset: 262,
6101 length: 3,
6102 token: Token::Word(Word::Word("他".to_string())),
6103 },
6104 PositionalToken {
6105 source: uws,
6106 offset: 265,
6107 length: 3,
6108 token: Token::Word(Word::Word("是".to_string())),
6109 },
6110 PositionalToken {
6111 source: uws,
6112 offset: 268,
6113 length: 3,
6114 token: Token::Word(Word::Word("位".to_string())),
6115 },
6116 PositionalToken {
6117 source: uws,
6118 offset: 271,
6119 length: 3,
6120 token: Token::Word(Word::Word("拥".to_string())),
6121 },
6122 PositionalToken {
6123 source: uws,
6124 offset: 274,
6125 length: 3,
6126 token: Token::Word(Word::Word("有".to_string())),
6127 },
6128 PositionalToken {
6129 source: uws,
6130 offset: 277,
6131 length: 3,
6132 token: Token::Word(Word::Word("超".to_string())),
6133 },
6134 ],
6135 Lang::Jpn => vec![
6136 PositionalToken {
6137 source: uws,
6138 offset: 0,
6139 length: 3,
6140 token: Token::Word(Word::Word("熊".to_string())),
6141 },
6142 PositionalToken {
6143 source: uws,
6144 offset: 3,
6145 length: 3,
6146 token: Token::Word(Word::Word("野".to_string())),
6147 },
6148 PositionalToken {
6149 source: uws,
6150 offset: 6,
6151 length: 3,
6152 token: Token::Word(Word::Word("三".to_string())),
6153 },
6154 PositionalToken {
6155 source: uws,
6156 offset: 9,
6157 length: 3,
6158 token: Token::Word(Word::Word("山".to_string())),
6159 },
6160 PositionalToken {
6161 source: uws,
6162 offset: 12,
6163 length: 3,
6164 token: Token::Word(Word::Word("本".to_string())),
6165 },
6166 PositionalToken {
6167 source: uws,
6168 offset: 15,
6169 length: 3,
6170 token: Token::Word(Word::Word("願".to_string())),
6171 },
6172 PositionalToken {
6173 source: uws,
6174 offset: 18,
6175 length: 3,
6176 token: Token::Word(Word::Word("所".to_string())),
6177 },
6178 PositionalToken {
6179 source: uws,
6180 offset: 21,
6181 length: 3,
6182 token: Token::Word(Word::Word("は".to_string())),
6183 },
6184 PositionalToken {
6185 source: uws,
6186 offset: 24,
6187 length: 3,
6188 token: Token::Special(Special::Punctuation('、')),
6189 },
6190 PositionalToken {
6191 source: uws,
6192 offset: 27,
6193 length: 2,
6194 token: Token::Word(Word::Number(Number::Integer(15))),
6195 },
6196 PositionalToken {
6197 source: uws,
6198 offset: 29,
6199 length: 3,
6200 token: Token::Word(Word::Word("世".to_string())),
6201 },
6202 PositionalToken {
6203 source: uws,
6204 offset: 32,
6205 length: 3,
6206 token: Token::Word(Word::Word("紀".to_string())),
6207 },
6208 PositionalToken {
6209 source: uws,
6210 offset: 35,
6211 length: 3,
6212 token: Token::Word(Word::Word("末".to_string())),
6213 },
6214 PositionalToken {
6215 source: uws,
6216 offset: 38,
6217 length: 3,
6218 token: Token::Word(Word::Word("以".to_string())),
6219 },
6220 PositionalToken {
6221 source: uws,
6222 offset: 41,
6223 length: 3,
6224 token: Token::Word(Word::Word("降".to_string())),
6225 },
6226 PositionalToken {
6227 source: uws,
6228 offset: 44,
6229 length: 3,
6230 token: Token::Word(Word::Word("に".to_string())),
6231 },
6232 PositionalToken {
6233 source: uws,
6234 offset: 47,
6235 length: 3,
6236 token: Token::Word(Word::Word("お".to_string())),
6237 },
6238 PositionalToken {
6239 source: uws,
6240 offset: 50,
6241 length: 3,
6242 token: Token::Word(Word::Word("け".to_string())),
6243 },
6244 PositionalToken {
6245 source: uws,
6246 offset: 53,
6247 length: 3,
6248 token: Token::Word(Word::Word("る".to_string())),
6249 },
6250 PositionalToken {
6251 source: uws,
6252 offset: 56,
6253 length: 3,
6254 token: Token::Word(Word::Word("熊".to_string())),
6255 },
6256 PositionalToken {
6257 source: uws,
6258 offset: 59,
6259 length: 3,
6260 token: Token::Word(Word::Word("野".to_string())),
6261 },
6262 PositionalToken {
6263 source: uws,
6264 offset: 62,
6265 length: 3,
6266 token: Token::Word(Word::Word("三".to_string())),
6267 },
6268 PositionalToken {
6269 source: uws,
6270 offset: 65,
6271 length: 3,
6272 token: Token::Word(Word::Word("山".to_string())),
6273 },
6274 PositionalToken {
6275 source: uws,
6276 offset: 68,
6277 length: 3,
6278 token: Token::Special(Special::Punctuation('(')),
6279 },
6280 PositionalToken {
6281 source: uws,
6282 offset: 71,
6283 length: 3,
6284 token: Token::Word(Word::Word("熊".to_string())),
6285 },
6286 PositionalToken {
6287 source: uws,
6288 offset: 74,
6289 length: 3,
6290 token: Token::Word(Word::Word("野".to_string())),
6291 },
6292 PositionalToken {
6293 source: uws,
6294 offset: 77,
6295 length: 3,
6296 token: Token::Word(Word::Word("本".to_string())),
6297 },
6298 PositionalToken {
6299 source: uws,
6300 offset: 80,
6301 length: 3,
6302 token: Token::Word(Word::Word("宮".to_string())),
6303 },
6304 PositionalToken {
6305 source: uws,
6306 offset: 83,
6307 length: 3,
6308 token: Token::Special(Special::Punctuation('、')),
6309 },
6310 PositionalToken {
6311 source: uws,
6312 offset: 86,
6313 length: 3,
6314 token: Token::Word(Word::Word("熊".to_string())),
6315 },
6316 PositionalToken {
6317 source: uws,
6318 offset: 89,
6319 length: 3,
6320 token: Token::Word(Word::Word("野".to_string())),
6321 },
6322 PositionalToken {
6323 source: uws,
6324 offset: 92,
6325 length: 3,
6326 token: Token::Word(Word::Word("新".to_string())),
6327 },
6328 PositionalToken {
6329 source: uws,
6330 offset: 95,
6331 length: 3,
6332 token: Token::Word(Word::Word("宮".to_string())),
6333 },
6334 PositionalToken {
6335 source: uws,
6336 offset: 98,
6337 length: 3,
6338 token: Token::Special(Special::Punctuation('、')),
6339 },
6340 PositionalToken {
6341 source: uws,
6342 offset: 101,
6343 length: 3,
6344 token: Token::Word(Word::Word("熊".to_string())),
6345 },
6346 PositionalToken {
6347 source: uws,
6348 offset: 104,
6349 length: 3,
6350 token: Token::Word(Word::Word("野".to_string())),
6351 },
6352 PositionalToken {
6353 source: uws,
6354 offset: 107,
6355 length: 3,
6356 token: Token::Word(Word::Word("那".to_string())),
6357 },
6358 PositionalToken {
6359 source: uws,
6360 offset: 110,
6361 length: 3,
6362 token: Token::Word(Word::Word("智".to_string())),
6363 },
6364 PositionalToken {
6365 source: uws,
6366 offset: 113,
6367 length: 3,
6368 token: Token::Special(Special::Punctuation(')')),
6369 },
6370 PositionalToken {
6371 source: uws,
6372 offset: 116,
6373 length: 3,
6374 token: Token::Word(Word::Word("の".to_string())),
6375 },
6376 PositionalToken {
6377 source: uws,
6378 offset: 119,
6379 length: 3,
6380 token: Token::Word(Word::Word("造".to_string())),
6381 },
6382 PositionalToken {
6383 source: uws,
6384 offset: 122,
6385 length: 3,
6386 token: Token::Word(Word::Word("営".to_string())),
6387 },
6388 PositionalToken {
6389 source: uws,
6390 offset: 125,
6391 length: 3,
6392 token: Token::Special(Special::Punctuation('・')),
6393 },
6394 PositionalToken {
6395 source: uws,
6396 offset: 128,
6397 length: 3,
6398 token: Token::Word(Word::Word("修".to_string())),
6399 },
6400 PositionalToken {
6401 source: uws,
6402 offset: 131,
6403 length: 3,
6404 token: Token::Word(Word::Word("造".to_string())),
6405 },
6406 PositionalToken {
6407 source: uws,
6408 offset: 134,
6409 length: 3,
6410 token: Token::Word(Word::Word("の".to_string())),
6411 },
6412 PositionalToken {
6413 source: uws,
6414 offset: 137,
6415 length: 3,
6416 token: Token::Word(Word::Word("た".to_string())),
6417 },
6418 PositionalToken {
6419 source: uws,
6420 offset: 140,
6421 length: 3,
6422 token: Token::Word(Word::Word("め".to_string())),
6423 },
6424 PositionalToken {
6425 source: uws,
6426 offset: 143,
6427 length: 3,
6428 token: Token::Word(Word::Word("の".to_string())),
6429 },
6430 PositionalToken {
6431 source: uws,
6432 offset: 146,
6433 length: 3,
6434 token: Token::Word(Word::Word("勧".to_string())),
6435 },
6436 PositionalToken {
6437 source: uws,
6438 offset: 149,
6439 length: 3,
6440 token: Token::Word(Word::Word("進".to_string())),
6441 },
6442 PositionalToken {
6443 source: uws,
6444 offset: 152,
6445 length: 3,
6446 token: Token::Word(Word::Word("を".to_string())),
6447 },
6448 PositionalToken {
6449 source: uws,
6450 offset: 155,
6451 length: 3,
6452 token: Token::Word(Word::Word("担".to_string())),
6453 },
6454 PositionalToken {
6455 source: uws,
6456 offset: 158,
6457 length: 3,
6458 token: Token::Word(Word::Word("っ".to_string())),
6459 },
6460 PositionalToken {
6461 source: uws,
6462 offset: 161,
6463 length: 3,
6464 token: Token::Word(Word::Word("た".to_string())),
6465 },
6466 PositionalToken {
6467 source: uws,
6468 offset: 164,
6469 length: 3,
6470 token: Token::Word(Word::Word("組".to_string())),
6471 },
6472 PositionalToken {
6473 source: uws,
6474 offset: 167,
6475 length: 3,
6476 token: Token::Word(Word::Word("織".to_string())),
6477 },
6478 PositionalToken {
6479 source: uws,
6480 offset: 170,
6481 length: 3,
6482 token: Token::Word(Word::Word("の".to_string())),
6483 },
6484 PositionalToken {
6485 source: uws,
6486 offset: 173,
6487 length: 3,
6488 token: Token::Word(Word::Word("総".to_string())),
6489 },
6490 PositionalToken {
6491 source: uws,
6492 offset: 176,
6493 length: 3,
6494 token: Token::Word(Word::Word("称".to_string())),
6495 },
6496 PositionalToken {
6497 source: uws,
6498 offset: 179,
6499 length: 3,
6500 token: Token::Special(Special::Punctuation('。')),
6501 },
6502 PositionalToken {
6503 source: uws,
6504 offset: 182,
6505 length: 1,
6506 token: Token::Special(Special::Separator(Separator::Space)),
6507 },
6508 PositionalToken {
6509 source: uws,
6510 offset: 183,
6511 length: 3,
6512 token: Token::Word(Word::Word("熊".to_string())),
6513 },
6514 PositionalToken {
6515 source: uws,
6516 offset: 186,
6517 length: 3,
6518 token: Token::Word(Word::Word("野".to_string())),
6519 },
6520 PositionalToken {
6521 source: uws,
6522 offset: 189,
6523 length: 3,
6524 token: Token::Word(Word::Word("三".to_string())),
6525 },
6526 PositionalToken {
6527 source: uws,
6528 offset: 192,
6529 length: 3,
6530 token: Token::Word(Word::Word("山".to_string())),
6531 },
6532 PositionalToken {
6533 source: uws,
6534 offset: 195,
6535 length: 3,
6536 token: Token::Word(Word::Word("を".to_string())),
6537 },
6538 PositionalToken {
6539 source: uws,
6540 offset: 198,
6541 length: 3,
6542 token: Token::Word(Word::Word("含".to_string())),
6543 },
6544 PositionalToken {
6545 source: uws,
6546 offset: 201,
6547 length: 3,
6548 token: Token::Word(Word::Word("め".to_string())),
6549 },
6550 PositionalToken {
6551 source: uws,
6552 offset: 204,
6553 length: 3,
6554 token: Token::Word(Word::Word("て".to_string())),
6555 },
6556 PositionalToken {
6557 source: uws,
6558 offset: 207,
6559 length: 3,
6560 token: Token::Special(Special::Punctuation('、')),
6561 },
6562 PositionalToken {
6563 source: uws,
6564 offset: 210,
6565 length: 3,
6566 token: Token::Word(Word::Word("日".to_string())),
6567 },
6568 PositionalToken {
6569 source: uws,
6570 offset: 213,
6571 length: 3,
6572 token: Token::Word(Word::Word("本".to_string())),
6573 },
6574 PositionalToken {
6575 source: uws,
6576 offset: 216,
6577 length: 3,
6578 token: Token::Word(Word::Word("に".to_string())),
6579 },
6580 PositionalToken {
6581 source: uws,
6582 offset: 219,
6583 length: 3,
6584 token: Token::Word(Word::Word("お".to_string())),
6585 },
6586 PositionalToken {
6587 source: uws,
6588 offset: 222,
6589 length: 3,
6590 token: Token::Word(Word::Word("け".to_string())),
6591 },
6592 PositionalToken {
6593 source: uws,
6594 offset: 225,
6595 length: 3,
6596 token: Token::Word(Word::Word("る".to_string())),
6597 },
6598 PositionalToken {
6599 source: uws,
6600 offset: 228,
6601 length: 3,
6602 token: Token::Word(Word::Word("古".to_string())),
6603 },
6604 PositionalToken {
6605 source: uws,
6606 offset: 231,
6607 length: 3,
6608 token: Token::Word(Word::Word("代".to_string())),
6609 },
6610 PositionalToken {
6611 source: uws,
6612 offset: 234,
6613 length: 3,
6614 token: Token::Word(Word::Word("か".to_string())),
6615 },
6616 PositionalToken {
6617 source: uws,
6618 offset: 237,
6619 length: 3,
6620 token: Token::Word(Word::Word("ら".to_string())),
6621 },
6622 PositionalToken {
6623 source: uws,
6624 offset: 240,
6625 length: 3,
6626 token: Token::Word(Word::Word("中".to_string())),
6627 },
6628 PositionalToken {
6629 source: uws,
6630 offset: 243,
6631 length: 3,
6632 token: Token::Word(Word::Word("世".to_string())),
6633 },
6634 PositionalToken {
6635 source: uws,
6636 offset: 246,
6637 length: 3,
6638 token: Token::Word(Word::Word("前".to_string())),
6639 },
6640 PositionalToken {
6641 source: uws,
6642 offset: 249,
6643 length: 3,
6644 token: Token::Word(Word::Word("半".to_string())),
6645 },
6646 PositionalToken {
6647 source: uws,
6648 offset: 252,
6649 length: 3,
6650 token: Token::Word(Word::Word("に".to_string())),
6651 },
6652 PositionalToken {
6653 source: uws,
6654 offset: 255,
6655 length: 3,
6656 token: Token::Word(Word::Word("か".to_string())),
6657 },
6658 PositionalToken {
6659 source: uws,
6660 offset: 258,
6661 length: 3,
6662 token: Token::Word(Word::Word("け".to_string())),
6663 },
6664 PositionalToken {
6665 source: uws,
6666 offset: 261,
6667 length: 3,
6668 token: Token::Word(Word::Word("て".to_string())),
6669 },
6670 PositionalToken {
6671 source: uws,
6672 offset: 264,
6673 length: 3,
6674 token: Token::Word(Word::Word("の".to_string())),
6675 },
6676 PositionalToken {
6677 source: uws,
6678 offset: 267,
6679 length: 3,
6680 token: Token::Word(Word::Word("寺".to_string())),
6681 },
6682 PositionalToken {
6683 source: uws,
6684 offset: 270,
6685 length: 3,
6686 token: Token::Word(Word::Word("社".to_string())),
6687 },
6688 PositionalToken {
6689 source: uws,
6690 offset: 273,
6691 length: 3,
6692 token: Token::Word(Word::Word("の".to_string())),
6693 },
6694 PositionalToken {
6695 source: uws,
6696 offset: 276,
6697 length: 3,
6698 token: Token::Word(Word::Word("造".to_string())),
6699 },
6700 PositionalToken {
6701 source: uws,
6702 offset: 279,
6703 length: 3,
6704 token: Token::Word(Word::Word("営".to_string())),
6705 },
6706 PositionalToken {
6707 source: uws,
6708 offset: 282,
6709 length: 3,
6710 token: Token::Word(Word::Word("は".to_string())),
6711 },
6712 PositionalToken {
6713 source: uws,
6714 offset: 285,
6715 length: 3,
6716 token: Token::Special(Special::Punctuation('、')),
6717 },
6718 PositionalToken {
6719 source: uws,
6720 offset: 288,
6721 length: 3,
6722 token: Token::Word(Word::Word("寺".to_string())),
6723 },
6724 PositionalToken {
6725 source: uws,
6726 offset: 291,
6727 length: 3,
6728 token: Token::Word(Word::Word("社".to_string())),
6729 },
6730 ],
6731 Lang::Kor => vec![
6732 PositionalToken {
6733 source: uws,
6734 offset: 0,
6735 length: 21,
6736 token: Token::Word(Word::Word("플레이스테이션".to_string())),
6737 },
6738 PositionalToken {
6739 source: uws,
6740 offset: 21,
6741 length: 1,
6742 token: Token::Special(Special::Separator(Separator::Space)),
6743 },
6744 PositionalToken {
6745 source: uws,
6746 offset: 22,
6747 length: 3,
6748 token: Token::Word(Word::Word("은".to_string())),
6749 },
6750 PositionalToken {
6751 source: uws,
6752 offset: 25,
6753 length: 1,
6754 token: Token::Special(Special::Separator(Separator::Space)),
6755 },
6756 PositionalToken {
6757 source: uws,
6758 offset: 26,
6759 length: 6,
6760 token: Token::Word(Word::Word("소니".to_string())),
6761 },
6762 PositionalToken {
6763 source: uws,
6764 offset: 32,
6765 length: 1,
6766 token: Token::Special(Special::Separator(Separator::Space)),
6767 },
6768 PositionalToken {
6769 source: uws,
6770 offset: 33,
6771 length: 9,
6772 token: Token::Word(Word::Word("컴퓨터".to_string())),
6773 },
6774 PositionalToken {
6775 source: uws,
6776 offset: 42,
6777 length: 1,
6778 token: Token::Special(Special::Separator(Separator::Space)),
6779 },
6780 PositionalToken {
6781 source: uws,
6782 offset: 43,
6783 length: 21,
6784 token: Token::Word(Word::Word("엔터테인먼트가".to_string())),
6785 },
6786 PositionalToken {
6787 source: uws,
6788 offset: 64,
6789 length: 1,
6790 token: Token::Special(Special::Separator(Separator::Space)),
6791 },
6792 PositionalToken {
6793 source: uws,
6794 offset: 65,
6795 length: 9,
6796 token: Token::Word(Word::Word("개발한".to_string())),
6797 },
6798 PositionalToken {
6799 source: uws,
6800 offset: 74,
6801 length: 1,
6802 token: Token::Special(Special::Separator(Separator::Space)),
6803 },
6804 PositionalToken {
6805 source: uws,
6806 offset: 75,
6807 length: 3,
6808 token: Token::Word(Word::Word("세".to_string())),
6809 },
6810 PositionalToken {
6811 source: uws,
6812 offset: 78,
6813 length: 1,
6814 token: Token::Special(Special::Separator(Separator::Space)),
6815 },
6816 PositionalToken {
6817 source: uws,
6818 offset: 79,
6819 length: 6,
6820 token: Token::Word(Word::Word("번째".to_string())),
6821 },
6822 PositionalToken {
6823 source: uws,
6824 offset: 85,
6825 length: 1,
6826 token: Token::Special(Special::Separator(Separator::Space)),
6827 },
6828 PositionalToken {
6829 source: uws,
6830 offset: 86,
6831 length: 9,
6832 token: Token::Word(Word::Word("가정용".to_string())),
6833 },
6834 PositionalToken {
6835 source: uws,
6836 offset: 95,
6837 length: 1,
6838 token: Token::Special(Special::Separator(Separator::Space)),
6839 },
6840 PositionalToken {
6841 source: uws,
6842 offset: 96,
6843 length: 15,
6844 token: Token::Word(Word::Word("게임기이다".to_string())),
6845 },
6846 PositionalToken {
6847 source: uws,
6848 offset: 111,
6849 length: 1,
6850 token: Token::Special(Special::Punctuation('.')),
6851 },
6852 PositionalToken {
6853 source: uws,
6854 offset: 112,
6855 length: 1,
6856 token: Token::Special(Special::Separator(Separator::Space)),
6857 },
6858 PositionalToken {
6859 source: uws,
6860 offset: 113,
6861 length: 24,
6862 token: Token::Word(Word::Word("마이크로소프트의".to_string())),
6863 },
6864 PositionalToken {
6865 source: uws,
6866 offset: 137,
6867 length: 1,
6868 token: Token::Special(Special::Separator(Separator::Space)),
6869 },
6870 PositionalToken {
6871 source: uws,
6872 offset: 138,
6873 length: 12,
6874 token: Token::Word(Word::Word("엑스박스".to_string())),
6875 },
6876 PositionalToken {
6877 source: uws,
6878 offset: 150,
6879 length: 1,
6880 token: Token::Special(Special::Separator(Separator::Space)),
6881 },
6882 PositionalToken {
6883 source: uws,
6884 offset: 151,
6885 length: 3,
6886 token: Token::Word(Word::Number(Number::Integer(360))),
6887 },
6888 PositionalToken {
6889 source: uws,
6890 offset: 154,
6891 length: 1,
6892 token: Token::Special(Special::Punctuation(',')),
6893 },
6894 PositionalToken {
6895 source: uws,
6896 offset: 155,
6897 length: 1,
6898 token: Token::Special(Special::Separator(Separator::Space)),
6899 },
6900 PositionalToken {
6901 source: uws,
6902 offset: 156,
6903 length: 12,
6904 token: Token::Word(Word::Word("닌텐도의".to_string())),
6905 },
6906 PositionalToken {
6907 source: uws,
6908 offset: 168,
6909 length: 1,
6910 token: Token::Special(Special::Separator(Separator::Space)),
6911 },
6912 PositionalToken {
6913 source: uws,
6914 offset: 169,
6915 length: 6,
6916 token: Token::Word(Word::Word("Wii와".to_string())),
6917 },
6918 PositionalToken {
6919 source: uws,
6920 offset: 175,
6921 length: 1,
6922 token: Token::Special(Special::Separator(Separator::Space)),
6923 },
6924 PositionalToken {
6925 source: uws,
6926 offset: 176,
6927 length: 12,
6928 token: Token::Word(Word::Word("경쟁하고".to_string())),
6929 },
6930 PositionalToken {
6931 source: uws,
6932 offset: 188,
6933 length: 1,
6934 token: Token::Special(Special::Separator(Separator::Space)),
6935 },
6936 PositionalToken {
6937 source: uws,
6938 offset: 189,
6939 length: 6,
6940 token: Token::Word(Word::Word("있다".to_string())),
6941 },
6942 PositionalToken {
6943 source: uws,
6944 offset: 195,
6945 length: 1,
6946 token: Token::Special(Special::Punctuation('.')),
6947 },
6948 PositionalToken {
6949 source: uws,
6950 offset: 196,
6951 length: 1,
6952 token: Token::Special(Special::Separator(Separator::Space)),
6953 },
6954 PositionalToken {
6955 source: uws,
6956 offset: 197,
6957 length: 6,
6958 token: Token::Word(Word::Word("이전".to_string())),
6959 },
6960 PositionalToken {
6961 source: uws,
6962 offset: 203,
6963 length: 1,
6964 token: Token::Special(Special::Separator(Separator::Space)),
6965 },
6966 PositionalToken {
6967 source: uws,
6968 offset: 204,
6969 length: 12,
6970 token: Token::Word(Word::Word("제품에서".to_string())),
6971 },
6972 PositionalToken {
6973 source: uws,
6974 offset: 216,
6975 length: 1,
6976 token: Token::Special(Special::Separator(Separator::Space)),
6977 },
6978 PositionalToken {
6979 source: uws,
6980 offset: 217,
6981 length: 9,
6982 token: Token::Word(Word::Word("온라인".to_string())),
6983 },
6984 PositionalToken {
6985 source: uws,
6986 offset: 226,
6987 length: 1,
6988 token: Token::Special(Special::Separator(Separator::Space)),
6989 },
6990 PositionalToken {
6991 source: uws,
6992 offset: 227,
6993 length: 9,
6994 token: Token::Word(Word::Word("플레이".to_string())),
6995 },
6996 PositionalToken {
6997 source: uws,
6998 offset: 236,
6999 length: 1,
7000 token: Token::Special(Special::Separator(Separator::Space)),
7001 },
7002 PositionalToken {
7003 source: uws,
7004 offset: 237,
7005 length: 3,
7006 token: Token::Word(Word::Word("기".to_string())),
7007 },
7008 ],
7009 Lang::Ara => vec![
7010 PositionalToken {
7011 source: uws,
7012 offset: 0,
7013 length: 14,
7014 token: Token::Word(Word::Word("لشکرکشی".to_string())),
7015 },
7016 PositionalToken {
7017 source: uws,
7018 offset: 14,
7019 length: 3,
7020 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
7021 },
7022 PositionalToken {
7023 source: uws,
7024 offset: 17,
7025 length: 6,
7026 token: Token::Word(Word::Word("های".to_string())),
7027 },
7028 PositionalToken {
7029 source: uws,
7030 offset: 23,
7031 length: 1,
7032 token: Token::Special(Special::Separator(Separator::Space)),
7033 },
7034 PositionalToken {
7035 source: uws,
7036 offset: 24,
7037 length: 6,
7038 token: Token::Word(Word::Word("روس".to_string())),
7039 },
7040 PositionalToken {
7041 source: uws,
7042 offset: 30,
7043 length: 3,
7044 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
7045 },
7046 PositionalToken {
7047 source: uws,
7048 offset: 33,
7049 length: 6,
7050 token: Token::Word(Word::Word("های".to_string())),
7051 },
7052 PositionalToken {
7053 source: uws,
7054 offset: 39,
7055 length: 1,
7056 token: Token::Special(Special::Separator(Separator::Space)),
7057 },
7058 PositionalToken {
7059 source: uws,
7060 offset: 40,
7061 length: 12,
7062 token: Token::Word(Word::Word("وارنگی".to_string())),
7063 },
7064 PositionalToken {
7065 source: uws,
7066 offset: 52,
7067 length: 1,
7068 token: Token::Special(Special::Separator(Separator::Space)),
7069 },
7070 PositionalToken {
7071 source: uws,
7072 offset: 53,
7073 length: 4,
7074 token: Token::Word(Word::Word("به".to_string())),
7075 },
7076 PositionalToken {
7077 source: uws,
7078 offset: 57,
7079 length: 1,
7080 token: Token::Special(Special::Separator(Separator::Space)),
7081 },
7082 PositionalToken {
7083 source: uws,
7084 offset: 58,
7085 length: 10,
7086 token: Token::Word(Word::Word("دریای".to_string())),
7087 },
7088 PositionalToken {
7089 source: uws,
7090 offset: 68,
7091 length: 1,
7092 token: Token::Special(Special::Separator(Separator::Space)),
7093 },
7094 PositionalToken {
7095 source: uws,
7096 offset: 69,
7097 length: 6,
7098 token: Token::Word(Word::Word("خزر".to_string())),
7099 },
7100 PositionalToken {
7101 source: uws,
7102 offset: 75,
7103 length: 1,
7104 token: Token::Special(Special::Separator(Separator::Space)),
7105 },
7106 PositionalToken {
7107 source: uws,
7108 offset: 76,
7109 length: 12,
7110 token: Token::Word(Word::Word("مجموعه".to_string())),
7111 },
7112 PositionalToken {
7113 source: uws,
7114 offset: 88,
7115 length: 3,
7116 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
7117 },
7118 PositionalToken {
7119 source: uws,
7120 offset: 91,
7121 length: 4,
7122 token: Token::Word(Word::Word("ای".to_string())),
7123 },
7124 PositionalToken {
7125 source: uws,
7126 offset: 95,
7127 length: 1,
7128 token: Token::Special(Special::Separator(Separator::Space)),
7129 },
7130 PositionalToken {
7131 source: uws,
7132 offset: 96,
7133 length: 4,
7134 token: Token::Word(Word::Word("از".to_string())),
7135 },
7136 PositionalToken {
7137 source: uws,
7138 offset: 100,
7139 length: 1,
7140 token: Token::Special(Special::Separator(Separator::Space)),
7141 },
7142 PositionalToken {
7143 source: uws,
7144 offset: 101,
7145 length: 10,
7146 token: Token::Word(Word::Word("حملات".to_string())),
7147 },
7148 PositionalToken {
7149 source: uws,
7150 offset: 111,
7151 length: 1,
7152 token: Token::Special(Special::Separator(Separator::Space)),
7153 },
7154 PositionalToken {
7155 source: uws,
7156 offset: 112,
7157 length: 10,
7158 token: Token::Word(Word::Word("نظامی".to_string())),
7159 },
7160 PositionalToken {
7161 source: uws,
7162 offset: 122,
7163 length: 1,
7164 token: Token::Special(Special::Separator(Separator::Space)),
7165 },
7166 PositionalToken {
7167 source: uws,
7168 offset: 123,
7169 length: 4,
7170 token: Token::Word(Word::Word("در".to_string())),
7171 },
7172 PositionalToken {
7173 source: uws,
7174 offset: 127,
7175 length: 1,
7176 token: Token::Special(Special::Separator(Separator::Space)),
7177 },
7178 PositionalToken {
7179 source: uws,
7180 offset: 128,
7181 length: 6,
7182 token: Token::Word(Word::Word("بین".to_string())),
7183 },
7184 PositionalToken {
7185 source: uws,
7186 offset: 134,
7187 length: 1,
7188 token: Token::Special(Special::Separator(Separator::Space)),
7189 },
7190 PositionalToken {
7191 source: uws,
7192 offset: 135,
7193 length: 6,
7194 token: Token::Word(Word::Word("سال".to_string())),
7195 },
7196 PositionalToken {
7197 source: uws,
7198 offset: 141,
7199 length: 3,
7200 token: Token::Unicode(Unicode::Formatter(Formatter::Char('\u{200c}'))),
7201 },
7202 PositionalToken {
7203 source: uws,
7204 offset: 144,
7205 length: 6,
7206 token: Token::Word(Word::Word("های".to_string())),
7207 },
7208 PositionalToken {
7209 source: uws,
7210 offset: 150,
7211 length: 1,
7212 token: Token::Special(Special::Separator(Separator::Space)),
7213 },
7214 PositionalToken {
7215 source: uws,
7216 offset: 151,
7217 length: 6,
7218 token: Token::Word(Word::StrangeWord("۸۶۴".to_string())),
7219 },
7220 PositionalToken {
7221 source: uws,
7222 offset: 157,
7223 length: 1,
7224 token: Token::Special(Special::Separator(Separator::Space)),
7225 },
7226 PositionalToken {
7227 source: uws,
7228 offset: 158,
7229 length: 4,
7230 token: Token::Word(Word::Word("تا".to_string())),
7231 },
7232 PositionalToken {
7233 source: uws,
7234 offset: 162,
7235 length: 1,
7236 token: Token::Special(Special::Separator(Separator::Space)),
7237 },
7238 PositionalToken {
7239 source: uws,
7240 offset: 163,
7241 length: 8,
7242 token: Token::Word(Word::StrangeWord("۱۰۴۱".to_string())),
7243 },
7244 PositionalToken {
7245 source: uws,
7246 offset: 171,
7247 length: 1,
7248 token: Token::Special(Special::Separator(Separator::Space)),
7249 },
7250 PositionalToken {
7251 source: uws,
7252 offset: 172,
7253 length: 12,
7254 token: Token::Word(Word::Word("میلادی".to_string())),
7255 },
7256 PositionalToken {
7257 source: uws,
7258 offset: 184,
7259 length: 1,
7260 token: Token::Special(Special::Separator(Separator::Space)),
7261 },
7262 PositionalToken {
7263 source: uws,
7264 offset: 185,
7265 length: 2,
7266 token: Token::Word(Word::Word("ب".to_string())),
7267 },
7268 ],
7269 Lang::Ell => vec![
7270 PositionalToken {
7271 source: uws,
7272 offset: 0,
7273 length: 4,
7274 token: Token::Word(Word::Word("Το".to_string())),
7275 },
7276 PositionalToken {
7277 source: uws,
7278 offset: 4,
7279 length: 1,
7280 token: Token::Special(Special::Separator(Separator::Space)),
7281 },
7282 PositionalToken {
7283 source: uws,
7284 offset: 5,
7285 length: 18,
7286 token: Token::Word(Word::Word("Πρόγραμμα".to_string())),
7287 },
7288 PositionalToken {
7289 source: uws,
7290 offset: 23,
7291 length: 1,
7292 token: Token::Special(Special::Separator(Separator::Space)),
7293 },
7294 PositionalToken {
7295 source: uws,
7296 offset: 24,
7297 length: 22,
7298 token: Token::Word(Word::Word("υλοποιείται".to_string())),
7299 },
7300 PositionalToken {
7301 source: uws,
7302 offset: 46,
7303 length: 1,
7304 token: Token::Special(Special::Separator(Separator::Space)),
7305 },
7306 PositionalToken {
7307 source: uws,
7308 offset: 47,
7309 length: 4,
7310 token: Token::Word(Word::Word("εξ".to_string())),
7311 },
7312 PositionalToken {
7313 source: uws,
7314 offset: 51,
7315 length: 1,
7316 token: Token::Special(Special::Separator(Separator::Space)),
7317 },
7318 PositionalToken {
7319 source: uws,
7320 offset: 52,
7321 length: 18,
7322 token: Token::Word(Word::Word("ολοκλήρου".to_string())),
7323 },
7324 PositionalToken {
7325 source: uws,
7326 offset: 70,
7327 length: 1,
7328 token: Token::Special(Special::Separator(Separator::Space)),
7329 },
7330 PositionalToken {
7331 source: uws,
7332 offset: 71,
7333 length: 6,
7334 token: Token::Word(Word::Word("από".to_string())),
7335 },
7336 PositionalToken {
7337 source: uws,
7338 offset: 77,
7339 length: 1,
7340 token: Token::Special(Special::Separator(Separator::Space)),
7341 },
7342 PositionalToken {
7343 source: uws,
7344 offset: 78,
7345 length: 16,
7346 token: Token::Word(Word::Word("απόσταση".to_string())),
7347 },
7348 PositionalToken {
7349 source: uws,
7350 offset: 94,
7351 length: 1,
7352 token: Token::Special(Special::Separator(Separator::Space)),
7353 },
7354 PositionalToken {
7355 source: uws,
7356 offset: 95,
7357 length: 6,
7358 token: Token::Word(Word::Word("και".to_string())),
7359 },
7360 PositionalToken {
7361 source: uws,
7362 offset: 101,
7363 length: 1,
7364 token: Token::Special(Special::Separator(Separator::Space)),
7365 },
7366 PositionalToken {
7367 source: uws,
7368 offset: 102,
7369 length: 12,
7370 token: Token::Word(Word::Word("μπορεί".to_string())),
7371 },
7372 PositionalToken {
7373 source: uws,
7374 offset: 114,
7375 length: 1,
7376 token: Token::Special(Special::Separator(Separator::Space)),
7377 },
7378 PositionalToken {
7379 source: uws,
7380 offset: 115,
7381 length: 4,
7382 token: Token::Word(Word::Word("να".to_string())),
7383 },
7384 PositionalToken {
7385 source: uws,
7386 offset: 119,
7387 length: 1,
7388 token: Token::Special(Special::Separator(Separator::Space)),
7389 },
7390 PositionalToken {
7391 source: uws,
7392 offset: 120,
7393 length: 20,
7394 token: Token::Word(Word::Word("συμμετέχει".to_string())),
7395 },
7396 PositionalToken {
7397 source: uws,
7398 offset: 140,
7399 length: 1,
7400 token: Token::Special(Special::Separator(Separator::Space)),
7401 },
7402 PositionalToken {
7403 source: uws,
7404 offset: 141,
7405 length: 8,
7406 token: Token::Word(Word::Word("κάθε".to_string())),
7407 },
7408 PositionalToken {
7409 source: uws,
7410 offset: 149,
7411 length: 1,
7412 token: Token::Special(Special::Separator(Separator::Space)),
7413 },
7414 PositionalToken {
7415 source: uws,
7416 offset: 150,
7417 length: 24,
7418 token: Token::Word(Word::Word("εμπλεκόμενος".to_string())),
7419 },
7420 PositionalToken {
7421 source: uws,
7422 offset: 174,
7423 length: 1,
7424 token: Token::Special(Special::Separator(Separator::Space)),
7425 },
7426 PositionalToken {
7427 source: uws,
7428 offset: 175,
7429 length: 6,
7430 token: Token::Word(Word::Word("στη".to_string())),
7431 },
7432 PositionalToken {
7433 source: uws,
7434 offset: 181,
7435 length: 1,
7436 token: Token::Special(Special::Separator(Separator::Space)),
7437 },
7438 PositionalToken {
7439 source: uws,
7440 offset: 182,
7441 length: 2,
7442 token: Token::Word(Word::Word("ή".to_string())),
7443 },
7444 PositionalToken {
7445 source: uws,
7446 offset: 184,
7447 length: 1,
7448 token: Token::Special(Special::Punctuation('/')),
7449 },
7450 ],
7451 };
7452 (
7453 uws.chars()
7454 .take(100)
7455 .fold(String::new(), |acc, c| acc + &format!("{}", c)),
7456 tokens,
7457 )
7458 }
7459}