harper_core/document.rs

use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use harper_brill::{Chunker, Tagger, brill_chunker, brill_tagger};
use paste::paste;

use crate::expr::{Expr, ExprExt, LongestMatchOf, Repeating, SequenceExpr};
use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::WordSet;
use crate::punctuation::Punctuation;
use crate::spell::{Dictionary, FstDictionary};
use crate::vec_ext::VecExt;
use crate::{FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
use crate::{OrdinalSuffix, Span};

/// A document containing some amount of lexed and parsed English text.
#[derive(Debug, Clone)]
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
    /// Locate all the tokens that intersect a provided span.
    ///
    /// Desperately needs optimization.
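    ///
    /// A minimal usage sketch (marked `no_run` since it loads the curated
    /// dictionary; assumes the crate-root re-exports of `Document` and `Span`):
    ///
    /// ```no_run
    /// use harper_core::{Document, Span};
    ///
    /// let doc = Document::new_plain_english_curated("Hello world.");
    /// // Every token overlapping the first five characters ("Hello").
    /// let indices = doc.token_indices_intersecting(Span::new(0, 5));
    /// assert!(!indices.is_empty());
    /// ```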
    pub fn token_indices_intersecting(&self, span: Span) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

    /// Locate all the tokens that intersect a provided span and convert them to [`FatToken`]s.
    ///
    /// Desperately needs optimization.
    pub fn fat_tokens_intersecting(&self, span: Span) -> Vec<FatToken> {
        let indices = self.token_indices_intersecting(span);

        indices
            .into_iter()
            .map(|i| self.tokens[i].to_fat(&self.source))
            .collect()
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and the included curated dictionary.
    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

    /// Parse text to produce a document using the built-in [`PlainEnglish`]
    /// parser and curated dictionary.
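    ///
    /// A minimal usage sketch (marked `no_run` since it loads the curated
    /// dictionary; assumes the crate-root re-export of `Document`):
    ///
    /// ```no_run
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("It wasn't me.");
    /// // `Display` reconstructs the source text from the token spans.
    /// assert_eq!(doc.to_string(), "It wasn't me.");
    /// ```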
    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

    /// Parse text to produce a document using the built-in [`PlainEnglish`]
    /// parser and a provided dictionary.
    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and curated dictionary.
    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and curated dictionary with the default Markdown configuration.
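    ///
    /// A minimal usage sketch (marked `no_run` since it loads the curated
    /// dictionary; assumes the crate-root re-export of `Document`):
    ///
    /// ```no_run
    /// use harper_core::Document;
    ///
    /// // Markup is handled by the Markdown parser rather than lexed as prose.
    /// let doc = Document::new_markdown_default_curated("**Hello** world.");
    /// ```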
    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

    /// Parse text to produce a document using the built-in [`Markdown`]
    /// parser and a provided dictionary.
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

    /// Parse text to produce a document using the built-in [`Markdown`]
    /// parser and a provided dictionary with the default Markdown configuration.
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

    /// Re-parse important language constructs.
    ///
    /// Should be run after every change to the underlying [`Self::source`].
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.match_quotes();

        let token_strings: Vec<_> = self
            .tokens
            .iter()
            .filter(|t| !t.kind.is_whitespace())
            .map(|t| self.get_span_content_str(&t.span))
            .collect();

        let token_tags = brill_tagger().tag_sentence(&token_strings);
        let np_flags = brill_chunker().chunk_sentence(&token_strings, &token_tags);

        // `token_tags` and `np_flags` cover only non-whitespace tokens, so
        // track their index separately while walking every token.
        let mut i = 0;

        // Annotate word metadata
        for token in self.tokens.iter_mut() {
            if let TokenKind::Word(meta) = &mut token.kind {
                let word_source = token.span.get_content(&self.source);
                let mut found_meta = dictionary.get_word_metadata(word_source).cloned();

                if let Some(inner) = &mut found_meta {
                    inner.pos_tag = token_tags[i];
                    inner.np_member = Some(np_flags[i]);
                }

                *meta = found_meta;
                i += 1;
            } else if !token.kind.is_whitespace() {
                i += 1;
            }
        }
    }

    /// Convert every run of two or more newlines to a paragraph break.
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind {
                if n >= 2 {
                    token.kind = TokenKind::ParagraphBreak;
                }
            }
        }
    }

    /// Given a list of indices, this function removes the subsequent
    /// `stretch_len - 1` elements after each index.
    ///
    /// Will extend token spans to include removed elements.
    /// Assumes condensed tokens are contiguous in source text.
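    ///
    /// For example, with `indices = [1]` and `stretch_len = 2`, the tokens
    /// `[a, b, c, d]` become `[a, bc, d]`, where `bc` is `b` with its span
    /// extended to the end of `c`.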
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // Update spans
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Trim
        let old = self.tokens.clone();
        self.tokens.clear();

        // Keep first chunk.
        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        // Keep last chunk.
        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }

    /// Locate the token, if any, whose span overlaps a provided character
    /// index, using a binary search over the (sorted) token spans.
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }

    /// Defensively attempt to grab a specific token.
    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }

    /// Get a token at a signed offset from a base index, or None if out of bounds.
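    ///
    /// A minimal usage sketch (marked `no_run` since it loads the curated
    /// dictionary; assumes the crate-root re-export of `Document`):
    ///
    /// ```no_run
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Foo bar baz");
    /// // One token back from the space at index 1 lands on the word "Foo".
    /// assert!(doc.get_token_offset(1, -1).is_some());
    /// ```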
    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        match base.checked_add_signed(offset) {
            None => None,
            Some(idx) => self.get_token(idx),
        }
    }

    /// Get an iterator over all the tokens contained in the document.
    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }

    /// Iterate over the noun phrases identified by the chunker, yielding each
    /// as a slice of tokens with the surrounding whitespace trimmed off.
    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
        fn is_np_member(t: &Token) -> bool {
            t.kind
                .as_word()
                .and_then(|x| x.as_ref())
                .and_then(|w| w.np_member)
                .unwrap_or(false)
        }

        fn trim(slice: &[Token]) -> &[Token] {
            let mut start = 0;
            let mut end = slice.len();
            while start < end && slice[start].kind.is_whitespace() {
                start += 1;
            }
            while end > start && slice[end - 1].kind.is_whitespace() {
                end -= 1;
            }
            &slice[start..end]
        }

        self.tokens
            .as_slice()
            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
            .filter_map(|s| {
                let s = trim(s);
                if s.iter().any(is_np_member) {
                    Some(s)
                } else {
                    None
                }
            })
    }

    /// Get an iterator over all the tokens in the document, converted to
    /// [`FatToken`]s.
    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

    /// Get the next or previous word token relative to a base index, if separated by whitespace.
    /// Returns None if the next/previous token is not a word or does not exist.
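    ///
    /// A minimal usage sketch (marked `no_run` since it loads the curated
    /// dictionary; assumes the crate-root re-export of `Document`):
    ///
    /// ```no_run
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Foo bar baz");
    /// // From "Foo" (token 0), skip the space and land on the word "bar".
    /// assert!(doc.get_next_word_from_offset(0, 1).is_some());
    /// ```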
    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        // Look for whitespace at the expected offset
        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
            return None;
        }
        // Now look beyond the whitespace for a word token
        let word_token = self.get_token_offset(base, offset + offset.signum());
        let word_token = word_token?;
        word_token.kind.is_word().then_some(word_token)
    }

    /// Get an iterator over all the tokens in the document, converted to
    /// [`FatStringToken`]s.
    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
        self.fat_tokens().map(|t| t.into())
    }

    pub fn get_span_content(&self, span: &Span) -> &[char] {
        span.get_content(&self.source)
    }

    pub fn get_span_content_str(&self, span: &Span) -> String {
        String::from_iter(self.get_span_content(span))
    }

    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(&Span {
            start: 0,
            end: self.source.len(),
        })
    }

    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

    /// Searches for quotation marks and fills the
    /// [`Punctuation::Quote::twin_loc`] field. This is on a best-effort
    /// basis.
    ///
    /// Current algorithm is basic and could use some work.
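    ///
    /// Quotes are paired greedily: the first with the second, the third with
    /// the fourth, and so on.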
    fn match_quotes(&mut self) {
        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();

        for i in 0..quote_indices.len() / 2 {
            let a_i = quote_indices[i * 2];
            let b_i = quote_indices[i * 2 + 1];

            {
                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
                a.twin_loc = Some(b_i);
            }

            {
                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
                b.twin_loc = Some(a_i);
            }
        }
    }

    /// Searches for number suffixes and condenses them down into single tokens.
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            // TODO: Allow spaces between `a` and `b`

            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind) {
                if let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
                {
                    self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                    replace_starts.push(idx);
                }
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

    /// Searches for multiple sequential space tokens and condenses them down
    /// into one.
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more space tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    // Only condense adjacent spans
                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    thread_local! {
        static LATIN_EXPR: Lrc<LongestMatchOf> = Document::uncached_latin_expr();
    }

    /// Builds an expression matching the Latin abbreviations "etc." and
    /// "vs.", as well as the two-word "et al.".
    fn uncached_latin_expr() -> Lrc<LongestMatchOf> {
        Lrc::new(LongestMatchOf::new(vec![
            Box::new(
                SequenceExpr::default()
                    .then(WordSet::new(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

    /// Assumes that the first matched token is the canonical one to be condensed into.
    /// Takes a callback that can be used to retroactively edit the canonical token afterwards.
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            remove_indices.extend(m.start + 1..m.end);
            // Grow the canonical token's span to cover the entire match.
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

    fn condense_latin(&mut self) {
        self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
    }

    /// Searches for multiple sequential newline tokens and condenses them down
    /// into one.
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more newline tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    /// Condenses words like "i.e.", "e.g." and "N.S.A." down to single words
    /// using a state machine.
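    ///
    /// For example, "e.g." lexes as four tokens (`e`, `.`, `g`, `.`) that are
    /// merged into one word token spanning all four characters.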
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
        let period = SequenceExpr::default().then_period();
        Lrc::new(Repeating::new(Box::new(period), 2))
    }

    thread_local! {
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }

    fn condense_ellipsis(&mut self) {
        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
        self.condense_expr(&expr, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }

    fn uncached_contraction_expr() -> Lrc<SequenceExpr> {
        Lrc::new(
            SequenceExpr::default()
                .then_any_word()
                .then_apostrophe()
                .then_any_word(),
        )
    }

    thread_local! {
        static CONTRACTION_EXPR: Lrc<SequenceExpr> = Document::uncached_contraction_expr();
    }

    /// Searches for contractions and condenses them down into single
    /// tokens.
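    ///
    /// For example, "isn't" lexes as three tokens (`isn`, `'`, `t`) that are
    /// condensed into a single word token.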
    fn condense_contractions(&mut self) {
        let expr = Self::CONTRACTION_EXPR.with(|v| v.clone());

        self.condense_expr(&expr, |_| {});
    }
}

/// Creates functions necessary to implement [`TokenStringExt`] on a document.
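/// For example, `create_fns_on_doc!(word)` expands (via [`paste`]) to the
/// delegating methods `first_word`, `last_word`, `last_word_index`,
/// `iter_word_indices`, and `iter_words`.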
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}

impl TokenStringExt for Document {
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);

    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }
}

impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(&token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::{Span, parsers::MarkdownOptions};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }
}