use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use paste::paste;

use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::{
    DocPattern, LongestMatchOf, Pattern, RepeatingPattern, SequencePattern, WordSet,
};
use crate::punctuation::Punctuation;
use crate::vec_ext::VecExt;
use crate::word_metadata::AdjectiveData;
use crate::{
    Dictionary, FatStringToken, FatToken, FstDictionary, Lrc, NounData, Token, TokenKind,
    TokenStringExt,
};
use crate::{OrdinalSuffix, Span};

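/// A body of text that has been lexed into [`Token`]s and post-processed for linting.
///
/// The original source characters are kept alongside the tokens so spans can be
/// resolved back to text.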
#[derive(Debug, Clone)]
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
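    /// Returns the indices of all tokens whose spans overlap the given span.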
    pub fn token_indices_intersecting(&self, span: Span) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

    pub fn fat_tokens_intersecting(&self, span: Span) -> Vec<FatToken> {
        let indices = self.token_indices_intersecting(span);

        indices
            .into_iter()
            .map(|i| self.tokens[i].to_fat(&self.source))
            .collect()
    }

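    /// Lexes and parses `text` into a new document using the provided parser and dictionary.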
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

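    /// Builds a document from an already-collected character buffer, running the parser
    /// and then the shared post-processing passes.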
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

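    /// Lexes and parses `text` as plain English using the curated dictionary.
    ///
    /// A minimal usage sketch; the `harper_core` crate path in the `use` line is an
    /// assumption, not something defined in this file:
    ///
    /// ```ignore
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("The quick brown fox.");
    /// assert_eq!(doc.get_full_string(), "The quick brown fox.");
    /// assert!(!doc.get_tokens().is_empty());
    /// ```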
    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

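    /// Runs the shared post-processing passes over freshly lexed tokens: condensing
    /// multi-token constructs (spaces, newlines, contractions, dotted initialisms,
    /// number suffixes, ellipses, and Latin abbreviations), pairing quotes, attaching
    /// dictionary metadata to each word, and applying the part-of-speech
    /// disambiguation passes.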
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.match_quotes();
        self.articles_imply_nouns();

        for token in self.tokens.iter_mut() {
            if let TokenKind::Word(meta) = &mut token.kind {
                let word_source = token.span.get_content(&self.source);
                let found_meta = dictionary.get_word_metadata(word_source);
                *meta = found_meta.cloned();
            }
        }

        self.known_preposition();
        self.articles_imply_not_verb();
    }

    fn uncached_article_pattern() -> Lrc<SequencePattern> {
        Lrc::new(
            SequencePattern::default()
                .then_determiner()
                .then_whitespace()
                .then(|t: &Token, _source: &[char]| t.kind.is_adjective() && t.kind.is_noun())
                .then_whitespace()
                .then_noun(),
        )
    }

    thread_local! {static ARTICLE_PATTERN: Lrc<SequencePattern> = Document::uncached_article_pattern()}

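    /// When a word that could be read as either an adjective or a noun sits between a
    /// determiner and a noun, drop its noun and verb readings so it is treated as an
    /// adjective.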
    fn articles_imply_nouns(&mut self) {
        let pattern = Self::ARTICLE_PATTERN.with(|v| v.clone());

        for m in pattern.find_all_matches_in_doc(self) {
            if let TokenKind::Word(Some(metadata)) = &mut self.tokens[m.start + 2].kind {
                metadata.noun = None;
                metadata.verb = None;
            }
        }
    }

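    /// When one of a small set of known prepositions is followed by a determiner or a
    /// number, drop its noun, pronoun, verb, and adjective readings.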
    fn known_preposition(&mut self) {
        fn create_pattern() -> Lrc<SequencePattern> {
            Lrc::new(
                SequencePattern::default()
                    .then(WordSet::new(&["in", "at", "on", "to", "for", "by", "with"]))
                    .then_whitespace()
                    .then(|t: &Token, _source: &[char]| {
                        t.kind.is_determiner() || t.kind.is_number()
                    }),
            )
        }
        thread_local! {static PATTERN: Lrc<SequencePattern> = create_pattern()}

        let pattern = PATTERN.with(|v| v.clone());

        for m in pattern.find_all_matches_in_doc(self) {
            if let TokenKind::Word(Some(metadata)) = &mut self.tokens[m.start].kind {
                metadata.noun = None;
                metadata.pronoun = None;
                metadata.verb = None;
                metadata.adjective = None;
            }
        }
    }

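    /// A word directly following an article or possessive determiner cannot be a verb:
    /// drop the verb reading, falling back to noun/adjective readings if the word has
    /// no other reading left.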
    fn articles_imply_not_verb(&mut self) {
        fn create_pattern() -> Lrc<SequencePattern> {
            Lrc::new(
                SequencePattern::default()
                    .then(WordSet::new(&[
                        "a", "an", "the",
                        "my", "your", "thy", "thine", "his", "its", "our", "their",
                        "whose", "no",
                    ]))
                    .then_whitespace()
                    .then_verb(),
            )
        }
        thread_local! {static PATTERN: Lrc<SequencePattern> = create_pattern()}
        let pattern = PATTERN.with(|v| v.clone());

        for m in pattern.find_all_matches_in_doc(self) {
            if let TokenKind::Word(Some(metadata)) = &mut self.tokens[m.end - 1].kind {
                if metadata.noun.is_none()
                    && metadata.adjective.is_none()
                    && metadata.adverb.is_none()
                {
                    metadata.noun = Some(NounData::default());
                    metadata.adjective = Some(AdjectiveData::default());
                }
                metadata.verb = None;
            }
        }
    }

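    /// Converts runs of two or more newlines into paragraph breaks.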
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind {
                if n >= 2 {
                    token.kind = TokenKind::ParagraphBreak;
                }
            }
        }
    }

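    /// For each index in `indices`, merges the following `stretch_len` tokens into the
    /// token at that index, stretching its span and dropping the merged tokens.
    /// Assumes `indices` is sorted and the stretches do not overlap.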
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        let old = self.tokens.clone();
        self.tokens.clear();

        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }

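    /// Locates the token (if any) whose span contains the given character index, using
    /// a binary search over the token spans.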
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }

    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }

    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        match base.checked_add_signed(offset) {
            None => None,
            Some(idx) => self.get_token(idx),
        }
    }

    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }

    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

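    /// Steps from `base` by `offset`, expecting whitespace there, and returns the word
    /// token one step further in the same direction, if any.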
    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
            return None;
        }
        let word_token = self.get_token_offset(base, offset + offset.signum());
        let word_token = word_token?;
        word_token.kind.is_word().then_some(word_token)
    }

    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
        self.fat_tokens().map(|t| t.into())
    }

    pub fn get_span_content(&self, span: &Span) -> &[char] {
        span.get_content(&self.source)
    }

    pub fn get_span_content_str(&self, span: &Span) -> String {
        String::from_iter(self.get_span_content(span))
    }

    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(&Span {
            start: 0,
            end: self.source.len(),
        })
    }

    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

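    /// Pairs up quotation-mark tokens, recording each quote's twin location in its
    /// `twin_loc` field.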
    fn match_quotes(&mut self) {
        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();

        for i in 0..quote_indices.len() / 2 {
            let a_i = quote_indices[i * 2];
            let b_i = quote_indices[i * 2 + 1];

            {
                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
                a.twin_loc = Some(b_i);
            }

            {
                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
                b.twin_loc = Some(a_i);
            }
        }
    }

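    /// Merges a number token with an immediately following ordinal suffix (e.g. the
    /// `st` in `1st`) into a single number token carrying that suffix.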
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind) {
                if let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
                {
                    self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                    replace_starts.push(idx);
                }
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

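    /// Merges runs of adjacent space tokens into a single token spanning the whole run.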
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    thread_local! {
        static LATIN_PATTERN: Lrc<LongestMatchOf> = Document::uncached_latin_pattern();
    }

    fn uncached_latin_pattern() -> Lrc<LongestMatchOf> {
        Lrc::new(LongestMatchOf::new(vec![
            Box::new(
                SequencePattern::default()
                    .then(WordSet::new(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequencePattern::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

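    /// Condenses every match of `pattern` into its first token: the first token's span
    /// is stretched over the whole match, `edit` is applied to it, and the remaining
    /// tokens of the match are removed.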
    fn condense_pattern<F>(&mut self, pattern: &impl Pattern, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = pattern.find_all_matches_in_doc(self);

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            remove_indices.extend(m.start + 1..m.end);
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

    fn condense_latin(&mut self) {
        self.condense_pattern(&Self::LATIN_PATTERN.with(|v| v.clone()), |_| {})
    }

    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

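    /// Merges dotted initialisms such as `N.S.A.` (runs of single-letter words each
    /// followed by a period) into a single word token.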
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    fn uncached_ellipsis_pattern() -> Lrc<RepeatingPattern> {
        let period = SequencePattern::default().then_period();
        Lrc::new(RepeatingPattern::new(Box::new(period), 2))
    }

    thread_local! {
        static ELLIPSIS_PATTERN: Lrc<RepeatingPattern> = Document::uncached_ellipsis_pattern();
    }

    fn condense_ellipsis(&mut self) {
        let pattern = Self::ELLIPSIS_PATTERN.with(|v| v.clone());
        self.condense_pattern(&pattern, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }

    fn uncached_contraction_pattern() -> Lrc<SequencePattern> {
        Lrc::new(
            SequencePattern::default()
                .then_any_word()
                .then_apostrophe()
                .then_any_word(),
        )
    }

    thread_local! {
        static CONTRACTION_PATTERN: Lrc<SequencePattern> = Document::uncached_contraction_pattern();
    }

    fn condense_contractions(&mut self) {
        let pattern = Self::CONTRACTION_PATTERN.with(|v| v.clone());

        self.condense_pattern(&pattern, |_| {});
    }
}

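/// Forwards the `first_*`, `last_*`, and `iter_*` helpers of `TokenStringExt` to the
/// document's token buffer.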
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl Iterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}

impl TokenStringExt for Document {
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);

    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }
}

impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(&token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::{Span, parsers::MarkdownOptions};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }
}