harper_core/
document.rs

use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use paste::paste;

use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::{
    DocPattern, EitherPattern, Pattern, RepeatingPattern, SequencePattern, WordSet,
};
use crate::punctuation::Punctuation;
use crate::vec_ext::VecExt;
use crate::{
    Dictionary, FatStringToken, FatToken, FstDictionary, Lrc, Token, TokenKind, TokenStringExt,
};
use crate::{OrdinalSuffix, Span};

/// A document containing some amount of lexed and parsed English text.
#[derive(Debug, Clone)]
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
    /// Locate all the tokens that intersect a provided span.
    ///
    /// Desperately needs optimization.
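    ///
    /// A minimal sketch of its use (the exact token boundaries here are an
    /// assumption about how the plain-English lexer splits this text):
    ///
    /// ```
    /// use harper_core::{Document, Span};
    ///
    /// let doc = Document::new_plain_english_curated("Hello world");
    /// // A span covering "world" intersects at least that word's token.
    /// let indices = doc.token_indices_intersecting(Span::new(6, 11));
    /// assert!(!indices.is_empty());
    /// ```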
    pub fn token_indices_intersecting(&self, span: Span) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

    /// Locate all the tokens that intersect a provided span and convert them to [`FatToken`]s.
    ///
    /// Desperately needs optimization.
    pub fn fat_tokens_intersecting(&self, span: Span) -> Vec<FatToken> {
        let indices = self.token_indices_intersecting(span);

        indices
            .into_iter()
            .map(|i| self.tokens[i].to_fat(&self.source))
            .collect()
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
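    ///
    /// A minimal sketch, mirroring what [`Self::default`] does:
    ///
    /// ```
    /// use harper_core::{Document, FstDictionary};
    /// use harper_core::parsers::PlainEnglish;
    ///
    /// let doc = Document::new("Hello there!", &PlainEnglish, &FstDictionary::curated());
    /// assert!(!doc.get_tokens().is_empty());
    /// ```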
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and the included curated dictionary.
    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

    /// Parse text to produce a document using the built-in [`PlainEnglish`]
    /// parser and curated dictionary.
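    ///
    /// A minimal sketch; the token count is an assumption about how the
    /// plain-English lexer splits this text:
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Foo bar baz");
    /// // "Foo", " ", "bar", " ", "baz"
    /// assert_eq!(doc.get_tokens().len(), 5);
    /// ```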
    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

    /// Parse text to produce a document using the built-in [`PlainEnglish`]
    /// parser and a provided dictionary.
    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and curated dictionary.
    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and curated dictionary with the default Markdown configuration.
    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and a provided dictionary.
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and a provided dictionary with the default Markdown configuration.
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

    /// Re-parse important language constructs.
    ///
    /// Should be run after every change to the underlying [`Self::source`].
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.match_quotes();
        self.articles_imply_nouns();

        for token in self.tokens.iter_mut() {
            if let TokenKind::Word(meta) = &mut token.kind {
                let word_source = token.span.get_content(&self.source);
                let found_meta = dictionary.get_word_metadata(word_source);
                *meta = found_meta.cloned();
            }
        }
    }

    fn uncached_article_pattern() -> Lrc<SequencePattern> {
        Lrc::new(
            SequencePattern::default()
                .then_determiner()
                .then_whitespace()
                .then(|t: &Token, _source: &[char]| t.kind.is_adjective() && t.kind.is_noun())
                .then_whitespace()
                .then_noun(),
        )
    }

    thread_local! {
        static ARTICLE_PATTERN: Lrc<SequencePattern> = Document::uncached_article_pattern();
    }

    /// When a word that can be either an adjective or a noun is sandwiched
    /// between an article and a noun, it is not acting as a noun, so its noun
    /// (and verb) readings are cleared.
    fn articles_imply_nouns(&mut self) {
        let pattern = Self::ARTICLE_PATTERN.with(|v| v.clone());

        for m in pattern.find_all_matches_in_doc(self) {
            if let TokenKind::Word(Some(metadata)) = &mut self.tokens[m.start + 2].kind {
                metadata.noun = None;
                metadata.verb = None;
            }
        }
    }

    /// Convert all runs of two or more newlines to paragraph breaks.
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind {
                if n >= 2 {
                    token.kind = TokenKind::ParagraphBreak;
                }
            }
        }
    }

    /// Given a list of indices, this function removes the subsequent
    /// `stretch_len - 1` elements after each index.
    ///
    /// Will extend token spans to include removed elements.
    /// Assumes condensed tokens are contiguous in source text.
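    ///
    /// For example, with `indices = [1]` and `stretch_len = 2`, the tokens
    /// `[A, B, C, D]` become `[A, BC, D]`, where `BC` is `B` with its span
    /// extended to cover `C`.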
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // Update spans
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Trim
        let old = self.tokens.clone();
        self.tokens.clear();

        // Keep first chunk.
        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        // Keep last chunk.
        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }

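    /// Get the token whose span contains the provided char index, if any.
    ///
    /// A sketch based on the `selects_token_at_char_index` test below:
    ///
    /// ```
    /// use harper_core::{Document, Span};
    ///
    /// let doc = Document::new_plain_english_curated(
    ///     "There were three little pigs. They built three little homes.",
    /// );
    /// // Index 19 falls inside "little".
    /// let tok = doc.get_token_at_char_index(19).unwrap();
    /// assert_eq!(tok.span, Span::new(17, 23));
    /// ```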
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }

    /// Defensively attempt to grab a specific token.
    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }

    /// Get a token at a signed offset from a base index, or `None` if out of bounds.
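    ///
    /// A sketch based on the `selects_token_at_offset` test below:
    ///
    /// ```
    /// use harper_core::{Document, Span};
    ///
    /// let doc = Document::new_plain_english_curated("Foo bar baz");
    /// // One token to the left of the space is "Foo".
    /// assert_eq!(doc.get_token_offset(1, -1).unwrap().span, Span::new(0, 3));
    /// assert!(doc.get_token_offset(0, -1).is_none());
    /// ```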
    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        match base.checked_add_signed(offset) {
            None => None,
            Some(idx) => self.get_token(idx),
        }
    }

    /// Get an iterator over all the tokens contained in the document.
    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }

    /// Get an iterator over all the tokens contained in the document, as
    /// [`FatToken`]s.
    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

    /// Get the next or previous word token relative to a base index, if
    /// separated by whitespace. Returns `None` if the next/previous token is
    /// not a word or does not exist.
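    ///
    /// A sketch based on the `select_next_word_*` tests below:
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Foo bar baz");
    /// // From "Foo", skipping the space at offset 1, the next word is "bar".
    /// let bar = doc.get_next_word_from_offset(0, 1).unwrap();
    /// assert_eq!(doc.get_span_content(&bar.span), ['b', 'a', 'r']);
    /// ```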
    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        // Look for whitespace at the expected offset.
        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
            return None;
        }
        // Now look beyond the whitespace for a word token.
        let word_token = self.get_token_offset(base, offset + offset.signum())?;
        word_token.kind.is_word().then_some(word_token)
    }

    /// Get an iterator over all the tokens contained in the document, as
    /// [`FatStringToken`]s.
    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
        self.fat_tokens().map(|t| t.into())
    }

    pub fn get_span_content(&self, span: &Span) -> &[char] {
        span.get_content(&self.source)
    }

    pub fn get_span_content_str(&self, span: &Span) -> String {
        String::from_iter(self.get_span_content(span))
    }

    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(&Span {
            start: 0,
            end: self.source.len(),
        })
    }

    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

    /// Searches for quotation marks and fills the
    /// [`Punctuation::Quote::twin_loc`] field. This is on a best-effort
    /// basis.
    ///
    /// Current algorithm is basic and could use some work.
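    ///
    /// For example, in `"a" "b"` the four quote tokens are paired in order:
    /// the first with the second, and the third with the fourth.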
    fn match_quotes(&mut self) {
        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();

        for i in 0..quote_indices.len() / 2 {
            let a_i = quote_indices[i * 2];
            let b_i = quote_indices[i * 2 + 1];

            {
                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
                a.twin_loc = Some(b_i);
            }

            {
                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
                b.twin_loc = Some(a_i);
            }
        }
    }

    /// Searches for number suffixes and condenses them down into single tokens.
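    ///
    /// For example, `1st` lexes as a number followed by a word; afterwards it
    /// is a single number token carrying an ordinal suffix (see the
    /// `condenses_number_suffixes` test below).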
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            // TODO: Allow spaces between `a` and `b`.

            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind) {
                if let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
                {
                    self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                    replace_starts.push(idx);
                }
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

    /// Searches for multiple sequential space tokens and condenses them down
    /// into one.
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more space tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    // Only condense adjacent spans.
                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                    } else {
                        break;
                    }
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    thread_local! {
        static LATIN_PATTERN: Lrc<EitherPattern> = Document::uncached_latin_pattern();
    }

    fn uncached_latin_pattern() -> Lrc<EitherPattern> {
        Lrc::new(EitherPattern::new(vec![
            Box::new(
                SequencePattern::default()
                    .then(WordSet::new(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequencePattern::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

    /// Assumes that the first matched token is the canonical one to be condensed into.
    /// Takes a callback that can be used to retroactively edit the canonical token afterwards.
    fn condense_pattern<F>(&mut self, pattern: &impl Pattern, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = pattern.find_all_matches_in_doc(self);

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            remove_indices.extend(m.start + 1..m.end);
            self.tokens[m.start].span = self.tokens[m.start..m.end].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

    fn condense_latin(&mut self) {
        self.condense_pattern(&Self::LATIN_PATTERN.with(|v| v.clone()), |_| {})
    }

    /// Searches for multiple sequential newline tokens and condenses them down
    /// into one.
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more newline tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                    } else {
                        break;
                    }
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    /// Condenses words like "i.e.", "e.g." and "N.S.A." down to single words
    /// using a state machine.
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    fn uncached_ellipsis_pattern() -> Lrc<RepeatingPattern> {
        let period = SequencePattern::default().then_period();
        Lrc::new(RepeatingPattern::new(Box::new(period), 2))
    }

    thread_local! {
        static ELLIPSIS_PATTERN: Lrc<RepeatingPattern> = Document::uncached_ellipsis_pattern();
    }

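    /// Condenses runs of two or more consecutive period tokens into a single
    /// [`Punctuation::Ellipsis`] token, so `..`, `...`, and `.....` each
    /// become one token (see the `parses_ellipsis` tests below).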
    fn condense_ellipsis(&mut self) {
        let pattern = Self::ELLIPSIS_PATTERN.with(|v| v.clone());
        self.condense_pattern(&pattern, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }

    fn uncached_contraction_pattern() -> Lrc<SequencePattern> {
        Lrc::new(
            SequencePattern::default()
                .then_any_word()
                .then_apostrophe()
                .then_any_word(),
        )
    }

    thread_local! {
        static CONTRACTION_PATTERN: Lrc<SequencePattern> = Document::uncached_contraction_pattern();
    }

    /// Searches for contractions and condenses them down into single tokens.
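    ///
    /// For example, `isn't` lexes as three tokens (word, apostrophe, word)
    /// and is condensed into a single word token (see the
    /// `simple_contraction` tests below).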
    fn condense_contractions(&mut self) {
        let pattern = Self::CONTRACTION_PATTERN.with(|v| v.clone());

        self.condense_pattern(&pattern, |_| {});
    }
}

/// Creates the functions necessary to implement [`TokenStringExt`] on a document.
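///
/// For example, `create_fns_on_doc!(word)` generates `first_word`,
/// `last_word`, `last_word_index`, `iter_word_indices`, and `iter_words`,
/// each delegating to the method of the same name on `self.tokens`.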
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl Iterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}

impl TokenStringExt for Document {
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);

    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }
}

impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(&token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::{Span, parsers::MarkdownOptions};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }
}