harper_core/document.rs

use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use harper_brill::{Chunker, Tagger, brill_chunker, brill_tagger};
use paste::paste;

use crate::expr::{Expr, ExprExt, LongestMatchOf, Repeating, SequenceExpr};
use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::WordSet;
use crate::punctuation::Punctuation;
use crate::vec_ext::VecExt;
use crate::{
    Dictionary, FatStringToken, FatToken, FstDictionary, Lrc, Token, TokenKind, TokenStringExt,
};
use crate::{OrdinalSuffix, Span};

/// A document containing some amount of lexed and parsed English text.
#[derive(Debug, Clone)]
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
    /// Locate all the tokens that intersect a provided span.
    ///
    /// Desperately needs optimization.
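    ///
    /// A sketch of typical usage, assuming `Document` and `Span` are
    /// re-exported from the crate root:
    ///
    /// ```
    /// use harper_core::{Document, Span};
    ///
    /// let doc = Document::new_plain_english_curated("Hello there, world.");
    /// // Indices of every token overlapping the first five characters ("Hello").
    /// let indices = doc.token_indices_intersecting(Span::new(0, 5));
    /// assert!(!indices.is_empty());
    /// ```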
    pub fn token_indices_intersecting(&self, span: Span) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

    /// Locate all the tokens that intersect a provided span and convert them to [`FatToken`]s.
    ///
    /// Desperately needs optimization.
    pub fn fat_tokens_intersecting(&self, span: Span) -> Vec<FatToken> {
        let indices = self.token_indices_intersecting(span);

        indices
            .into_iter()
            .map(|i| self.tokens[i].to_fat(&self.source))
            .collect()
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
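    ///
    /// A minimal sketch, assuming `FstDictionary` and the `parsers` module
    /// are publicly exported:
    ///
    /// ```
    /// use harper_core::{Document, FstDictionary};
    /// use harper_core::parsers::PlainEnglish;
    ///
    /// let dict = FstDictionary::curated();
    /// let doc = Document::new("Hello world.", &PlainEnglish, &dict);
    /// assert!(!doc.get_tokens().is_empty());
    /// ```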
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and the included curated dictionary.
    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

    /// Parse text to produce a document using the built-in [`PlainEnglish`]
    /// parser and curated dictionary.
    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

    /// Parse text to produce a document using the built-in [`PlainEnglish`]
    /// parser and a provided dictionary.
    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and curated dictionary.
    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and curated dictionary with the default Markdown configuration.
    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

    /// Parse text to produce a document using the built-in [`Markdown`]
    /// parser and a provided dictionary.
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

    /// Parse text to produce a document using the built-in [`Markdown`]
    /// parser and a provided dictionary, with the default Markdown
    /// configuration.
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

    /// Re-parse important language constructs.
    ///
    /// Should be run after every change to the underlying [`Self::source`].
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.match_quotes();

        let token_strings: Vec<_> = self
            .tokens
            .iter()
            .filter(|t| !t.kind.is_whitespace())
            .map(|t| self.get_span_content_str(&t.span))
            .collect();

        let token_tags = brill_tagger().tag_sentence(&token_strings);
        let np_flags = brill_chunker().chunk_sentence(&token_strings, &token_tags);

        let mut i = 0;

        // Annotate word metadata
        for token in self.tokens.iter_mut() {
            if let TokenKind::Word(meta) = &mut token.kind {
                let word_source = token.span.get_content(&self.source);
                let mut found_meta = dictionary.get_word_metadata(word_source).cloned();

                if let Some(inner) = &mut found_meta {
                    inner.pos_tag = token_tags[i];
                    inner.np_member = Some(np_flags[i]);
                }

                *meta = found_meta;
                i += 1;
            } else if !token.kind.is_whitespace() {
                i += 1;
            }
        }
    }

    /// Convert every run of two or more newlines into a paragraph break.
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind {
                if n >= 2 {
                    token.kind = TokenKind::ParagraphBreak;
                }
            }
        }
    }

    /// Given a list of indices, this function removes the subsequent
    /// `stretch_len - 1` elements after each index.
    ///
    /// Will extend token spans to include removed elements.
    /// Assumes condensed tokens are contiguous in source text.
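    ///
    /// For example, with tokens `[A, B, C, D]`, `indices = [1]`, and
    /// `stretch_len = 2`: `C` is removed and `B`'s span is extended to cover
    /// it, leaving `[A, BC, D]`.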
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // Update spans
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Trim
        let old = self.tokens.clone();
        self.tokens.clear();

        // Keep first chunk.
        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        // Keep last chunk.
        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }

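    /// Binary-search the token list for a token whose span contains the
    /// provided character index.
    ///
    /// A sketch mirroring the unit tests at the bottom of this file:
    ///
    /// ```
    /// use harper_core::{Document, Span};
    ///
    /// let doc = Document::new_plain_english_curated("There were three little pigs.");
    /// let tok = doc.get_token_at_char_index(19).unwrap();
    /// assert!(tok.kind.is_word());
    /// // Index 19 falls inside the word "little".
    /// assert_eq!(tok.span, Span::new(17, 23));
    /// ```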
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }

    /// Defensively attempt to grab a specific token.
    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }

    /// Get a token at a signed offset from a base index, or None if out of bounds.
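    ///
    /// A sketch mirroring the unit tests at the bottom of this file:
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Foo bar baz");
    /// // One token back from the space at index 1 is the word "Foo".
    /// assert!(doc.get_token_offset(1, -1).is_some());
    /// // Stepping before the start of the document yields None.
    /// assert!(doc.get_token_offset(0, -1).is_none());
    /// ```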
    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        match base.checked_add_signed(offset) {
            None => None,
            Some(idx) => self.get_token(idx),
        }
    }

    /// Get an iterator over all the tokens contained in the document.
    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }

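    /// Get an iterator over the nominal (noun) phrases in the document, as
    /// flagged by the chunker during parsing. Each phrase is yielded as a
    /// slice of tokens with surrounding whitespace trimmed away.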
    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
        fn is_np_member(t: &Token) -> bool {
            t.kind
                .as_word()
                .and_then(|x| x.as_ref())
                .and_then(|w| w.np_member)
                .unwrap_or(false)
        }

        fn trim(slice: &[Token]) -> &[Token] {
            let mut start = 0;
            let mut end = slice.len();
            while start < end && slice[start].kind.is_whitespace() {
                start += 1;
            }
            while end > start && slice[end - 1].kind.is_whitespace() {
                end -= 1;
            }
            &slice[start..end]
        }

        self.tokens
            .as_slice()
            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
            .filter_map(|s| {
                let s = trim(s);
                if s.iter().any(is_np_member) {
                    Some(s)
                } else {
                    None
                }
            })
    }

    /// Get an iterator over all the tokens in the document, converted to [`FatToken`]s.
    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

    /// Get the next or previous word token relative to a base index, if separated by whitespace.
    /// Returns None if the next/previous token is not a word or does not exist.
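    ///
    /// A sketch mirroring the unit tests at the bottom of this file:
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Foo bar baz");
    /// // From "Foo", skip the space at offset 1 and land on the word "bar".
    /// let bar = doc.get_next_word_from_offset(0, 1).unwrap();
    /// assert_eq!(doc.get_span_content(&bar.span), ['b', 'a', 'r']);
    ///
    /// // Punctuation instead of whitespace yields None.
    /// let doc = Document::new_plain_english_curated("Foo, bar");
    /// assert!(doc.get_next_word_from_offset(0, 1).is_none());
    /// ```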
    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        // Look for whitespace at the expected offset
        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
            return None;
        }
        // Now look beyond the whitespace for a word token
        let word_token = self.get_token_offset(base, offset + offset.signum());
        let word_token = word_token?;
        word_token.kind.is_word().then_some(word_token)
    }

    /// Get an iterator over all the tokens in the document, converted to [`FatStringToken`]s.
    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
        self.fat_tokens().map(|t| t.into())
    }

    pub fn get_span_content(&self, span: &Span) -> &[char] {
        span.get_content(&self.source)
    }

    pub fn get_span_content_str(&self, span: &Span) -> String {
        String::from_iter(self.get_span_content(span))
    }

    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(&Span {
            start: 0,
            end: self.source.len(),
        })
    }

    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

    /// Searches for quotation marks and fills the
    /// [`Punctuation::Quote::twin_loc`] field. This is on a best-effort
    /// basis.
    ///
    /// Current algorithm is basic and could use some work.
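    ///
    /// Quotes are paired in order of appearance: the first with the second,
    /// the third with the fourth, and so on. An odd trailing quote is left
    /// unmatched.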
    fn match_quotes(&mut self) {
        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();

        for i in 0..quote_indices.len() / 2 {
            let a_i = quote_indices[i * 2];
            let b_i = quote_indices[i * 2 + 1];

            {
                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
                a.twin_loc = Some(b_i);
            }

            {
                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
                b.twin_loc = Some(a_i);
            }
        }
    }

    /// Searches for number suffixes and condenses them down into single tokens.
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            // TODO: Allow spaces between `a` and `b`

            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind) {
                if let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
                {
                    self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                    replace_starts.push(idx);
                }
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

    /// Searches for multiple sequential space tokens and condenses them down
    /// into one.
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more space tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    // Only condense adjacent spans
                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    thread_local! {
        static LATIN_EXPR: Lrc<LongestMatchOf> = Document::uncached_latin_expr();
    }

    fn uncached_latin_expr() -> Lrc<LongestMatchOf> {
        Lrc::new(LongestMatchOf::new(vec![
            Box::new(
                SequenceExpr::default()
                    .then(WordSet::new(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

    /// Assumes that the first matched token is the canonical one to be condensed into.
    /// Takes a callback that can be used to retroactively edit the canonical token afterwards.
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            remove_indices.extend(m.start + 1..m.end);
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

    fn condense_latin(&mut self) {
        self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
    }

    /// Searches for multiple sequential newline tokens and condenses them down
    /// into one.
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more newline tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    /// Condenses words like "i.e.", "e.g." and "N.S.A." down to single words
    /// using a state machine.
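    ///
    /// For example, "N.S.A." lexes as six tokens (three single-letter words,
    /// each followed by a period) and is condensed into one word token
    /// spanning the full initialism.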
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
        let period = SequenceExpr::default().then_period();
        Lrc::new(Repeating::new(Box::new(period), 2))
    }

    thread_local! {
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }

    fn condense_ellipsis(&mut self) {
        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
        self.condense_expr(&expr, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }

    fn uncached_contraction_expr() -> Lrc<SequenceExpr> {
        Lrc::new(
            SequenceExpr::default()
                .then_any_word()
                .then_apostrophe()
                .then_any_word(),
        )
    }

    thread_local! {
        static CONTRACTION_EXPR: Lrc<SequenceExpr> = Document::uncached_contraction_expr();
    }

    /// Searches for contractions and condenses them down into single
    /// tokens.
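    ///
    /// For example, "isn't" lexes as three tokens (`isn`, `'`, `t`) and is
    /// condensed into a single word token, as exercised by the unit tests at
    /// the bottom of this file.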
    fn condense_contractions(&mut self) {
        let expr = Self::CONTRACTION_EXPR.with(|v| v.clone());

        self.condense_expr(&expr, |_| {});
    }
}

/// Creates functions necessary to implement [`TokenStringExt`] on a document.
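/// For example, `create_fns_on_doc!(word)` generates `first_word`,
/// `last_word`, `last_word_index`, `iter_word_indices`, and `iter_words`,
/// each delegating to the method of the same name on `self.tokens`.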
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}

impl TokenStringExt for Document {
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);

    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }
}

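/// Writes out each token's source text in order. For documents produced by
/// the built-in parsers, whose token spans cover the source contiguously,
/// this reconstructs the original input.
///
/// A sketch of the round-trip:
///
/// ```
/// use harper_core::Document;
///
/// let text = "Hello there, world.";
/// let doc = Document::new_plain_english_curated(text);
/// assert_eq!(doc.to_string(), text);
/// ```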
impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(&token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::{Span, parsers::MarkdownOptions};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }
}