harper_core/document.rs

use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use paste::paste;

use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::{
    DocPattern, EitherPattern, Pattern, RepeatingPattern, SequencePattern, WordSet,
};
use crate::punctuation::Punctuation;
use crate::vec_ext::VecExt;
use crate::word_metadata::AdjectiveData;
use crate::{
    Dictionary, FatStringToken, FatToken, FstDictionary, Lrc, NounData, Token, TokenKind,
    TokenStringExt,
};
use crate::{OrdinalSuffix, Span};

/// A document containing some amount of lexed and parsed English text.
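///
/// A minimal usage sketch (assuming `Document` is re-exported at the crate
/// root; the round-trip assertion relies on tokens tiling the entire source):
///
/// ```
/// use harper_core::Document;
///
/// let doc = Document::new_plain_english_curated("Hello world!");
/// assert_eq!(doc.to_string(), "Hello world!");
/// ```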
#[derive(Debug, Clone)]
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
    /// Locate all the tokens that intersect a provided span.
    ///
    /// Desperately needs optimization.
    pub fn token_indices_intersecting(&self, span: Span) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

    /// Locate all the tokens that intersect a provided span and convert them to [`FatToken`]s.
    ///
    /// Desperately needs optimization.
    pub fn fat_tokens_intersecting(&self, span: Span) -> Vec<FatToken> {
        let indices = self.token_indices_intersecting(span);

        indices
            .into_iter()
            .map(|i| self.tokens[i].to_fat(&self.source))
            .collect()
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and the included curated dictionary.
    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

    /// Parse text to produce a document using the built-in [`PlainEnglish`]
    /// parser and curated dictionary.
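    ///
    /// A minimal sketch (assuming the crate-root re-exports used below):
    ///
    /// ```
    /// use harper_core::{Document, TokenStringExt};
    ///
    /// let doc = Document::new_plain_english_curated("Hello world!");
    /// assert!(doc.first_word().is_some());
    /// ```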
    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

    /// Parse text to produce a document using the built-in [`PlainEnglish`]
    /// parser and a provided dictionary.
    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and curated dictionary.
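    ///
    /// A minimal sketch (assuming `MarkdownOptions` is exposed under
    /// `parsers`, as in this module's imports):
    ///
    /// ```
    /// use harper_core::Document;
    /// use harper_core::parsers::MarkdownOptions;
    ///
    /// let doc = Document::new_markdown_curated("*Hello* world!", MarkdownOptions::default());
    /// assert!(!doc.get_tokens().is_empty());
    /// ```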
    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and curated dictionary with the default Markdown configuration.
    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

    /// Parse text to produce a document using the built-in [`Markdown`]
    /// parser and a provided dictionary.
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

    /// Parse text to produce a document using the built-in [`Markdown`]
    /// parser and a provided dictionary, with the default Markdown configuration.
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

    /// Re-parse important language constructs.
    ///
    /// Should be run after every change to the underlying [`Self::source`].
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.match_quotes();
        self.articles_imply_nouns();

        // Annotate word metadata.
        for token in self.tokens.iter_mut() {
            if let TokenKind::Word(meta) = &mut token.kind {
                let word_source = token.span.get_content(&self.source);
                let found_meta = dictionary.get_word_metadata(word_source);
                *meta = found_meta.cloned();
            }
        }

        // Refine and disambiguate word metadata.
        self.known_preposition();
        self.articles_imply_not_verb();
    }

    fn uncached_article_pattern() -> Lrc<SequencePattern> {
        Lrc::new(
            SequencePattern::default()
                .then_determiner()
                .then_whitespace()
                .then(|t: &Token, _source: &[char]| t.kind.is_adjective() && t.kind.is_noun())
                .then_whitespace()
                .then_noun(),
        )
    }

    thread_local! {
        static ARTICLE_PATTERN: Lrc<SequencePattern> = Document::uncached_article_pattern();
    }

    /// When a word that can act as either an adjective or a noun is sandwiched
    /// between a determiner and a noun, it is functioning as an adjective, so
    /// it is definitely not a noun (nor a verb) there.
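    ///
    /// For example, in "the light switch", "light" is acting as an adjective.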
    fn articles_imply_nouns(&mut self) {
        let pattern = Self::ARTICLE_PATTERN.with(|v| v.clone());

        for m in pattern.find_all_matches_in_doc(self) {
            if let TokenKind::Word(Some(metadata)) = &mut self.tokens[m.start + 2].kind {
                metadata.noun = None;
                metadata.verb = None;
            }
        }
    }

    /// A preposition-like word followed by a determiner or number is typically
    /// really a preposition.
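    ///
    /// For example, in "by the river", "by" is functioning as a preposition.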
    fn known_preposition(&mut self) {
        fn create_pattern() -> Lrc<SequencePattern> {
            Lrc::new(
                SequencePattern::default()
                    .then(WordSet::new(&["in", "at", "on", "to", "for", "by", "with"]))
                    .then_whitespace()
                    .then(|t: &Token, _source: &[char]| {
                        t.kind.is_determiner() || t.kind.is_number()
                    }),
            )
        }
        thread_local! {static PATTERN: Lrc<SequencePattern> = create_pattern()}

        let pattern = PATTERN.with(|v| v.clone());

        for m in pattern.find_all_matches_in_doc(self) {
            if let TokenKind::Word(Some(metadata)) = &mut self.tokens[m.start].kind {
                metadata.noun = None;
                metadata.pronoun = None;
                metadata.verb = None;
                metadata.adjective = None;
            }
        }
    }

    /// The first word after an article (or article-like determiner) cannot be a verb.
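    ///
    /// For example, in "the record", "record" is read as a noun rather than a verb.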
    fn articles_imply_not_verb(&mut self) {
        fn create_pattern() -> Lrc<SequencePattern> {
            Lrc::new(
                SequencePattern::default()
                    .then(WordSet::new(&[
                        // articles
                        "a", "an", "the",
                        // Dependent genitive pronouns serve a similar role to articles.
                        // Unfortunately, some overlap with other pronoun forms. E.g.
                        // "I like her", "Something about her struck me as odd."
                        "my", "your", "thy", "thine", "his", /*"her",*/ "its", "our", "their",
                        "whose",
                        // "no" is also a determiner
                        "no",
                    ]))
                    .then_whitespace()
                    .then_verb(),
            )
        }
        thread_local! {static PATTERN: Lrc<SequencePattern> = create_pattern()}
        let pattern = PATTERN.with(|v| v.clone());

        for m in pattern.find_all_matches_in_doc(self) {
            if let TokenKind::Word(Some(metadata)) = &mut self.tokens[m.end - 1].kind {
                if metadata.noun.is_none()
                    && metadata.adjective.is_none()
                    && metadata.adverb.is_none()
                {
                    metadata.noun = Some(NounData::default());
                    metadata.adjective = Some(AdjectiveData::default());
                }
                metadata.verb = None;
            }
        }
    }

    /// Convert all runs of two or more newlines into paragraph breaks.
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind {
                if n >= 2 {
                    token.kind = TokenKind::ParagraphBreak;
                }
            }
        }
    }

    /// Given a list of indices, this function removes the subsequent
    /// `stretch_len - 1` elements after each index.
    ///
    /// Will extend token spans to include removed elements.
    /// Assumes condensed tokens are contiguous in source text.
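    ///
    /// For example, with `indices = [1, 4]` and `stretch_len = 2`,
    /// `[t0, t1, t2, t3, t4, t5, t6]` becomes `[t0, t1, t3, t4, t6]`,
    /// where `t1` and `t4` have their spans extended over `t2` and `t5`.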
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // Update spans
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Trim
        let old = self.tokens.clone();
        self.tokens.clear();

        // Keep first chunk.
        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        // Keep last chunk.
        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }

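    /// Find a token whose span contains the provided character index, if any.
    ///
    /// A minimal sketch of the expected behavior:
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Hello world!");
    /// let tok = doc.get_token_at_char_index(1).unwrap();
    /// assert!(tok.kind.is_word());
    /// ```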
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }

    /// Defensively attempt to grab a specific token.
    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }

    /// Get a token at a signed offset from a base index, or `None` if out of bounds.
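    ///
    /// A minimal sketch (mirroring the tests at the bottom of this file):
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Foo bar baz");
    /// assert!(doc.get_token_offset(1, -1).is_some());
    /// assert!(doc.get_token_offset(0, -1).is_none());
    /// ```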
    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        match base.checked_add_signed(offset) {
            None => None,
            Some(idx) => self.get_token(idx),
        }
    }

    /// Get an iterator over all the tokens contained in the document.
    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }

    /// Get an iterator over all the tokens contained in the document,
    /// converted to [`FatToken`]s.
    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

    /// Get the next or previous word token relative to a base index, if
    /// separated by whitespace.
    ///
    /// Returns `None` if the next/previous token is not a word or does not exist.
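    ///
    /// A minimal sketch (mirroring the tests at the bottom of this file):
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Foo bar baz");
    /// let span = doc.get_next_word_from_offset(0, 1).unwrap().span;
    /// assert_eq!(doc.get_span_content_str(&span), "bar");
    /// ```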
    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        // Look for whitespace at the expected offset.
        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
            return None;
        }
        // Now look beyond the whitespace for a word token.
        let word_token = self.get_token_offset(base, offset + offset.signum())?;
        word_token.kind.is_word().then_some(word_token)
    }

    /// Get an iterator over all the tokens contained in the document, as
    /// [`FatStringToken`]s.
    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
        self.fat_tokens().map(|t| t.into())
    }

    pub fn get_span_content(&self, span: &Span) -> &[char] {
        span.get_content(&self.source)
    }

    pub fn get_span_content_str(&self, span: &Span) -> String {
        String::from_iter(self.get_span_content(span))
    }

    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(&Span {
            start: 0,
            end: self.source.len(),
        })
    }

    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

    /// Searches for quotation marks and fills the
    /// [`Punctuation::Quote::twin_loc`] field. This is on a best-effort
    /// basis.
    ///
    /// Current algorithm is basic and could use some work.
    fn match_quotes(&mut self) {
        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();

        for i in 0..quote_indices.len() / 2 {
            let a_i = quote_indices[i * 2];
            let b_i = quote_indices[i * 2 + 1];

            {
                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
                a.twin_loc = Some(b_i);
            }

            {
                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
                b.twin_loc = Some(a_i);
            }
        }
    }

    /// Searches for number suffixes and condenses them down into single tokens.
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            // TODO: Allow spaces between `a` and `b`.

            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind) {
                if let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
                {
                    self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                    replace_starts.push(idx);
                }
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

    /// Searches for multiple sequential space tokens and condenses them down
    /// into one.
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more space tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    // The loop advances the cursor; incrementing again after a
                    // merge would skip every other token in the stretch.
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    // Only condense adjacent spans.
                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                    } else {
                        break;
                    }
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    thread_local! {
        static LATIN_PATTERN: Lrc<EitherPattern> = Document::uncached_latin_pattern();
    }

    fn uncached_latin_pattern() -> Lrc<EitherPattern> {
        Lrc::new(EitherPattern::new(vec![
            Box::new(
                SequencePattern::default()
                    .then(WordSet::new(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequencePattern::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

    /// Assumes that the first matched token is the canonical one to be condensed into.
    /// Takes a callback that can be used to retroactively edit the canonical token afterwards.
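    ///
    /// Every matched token other than the first is removed; the first token's
    /// span is widened to cover the entire match before `edit` is applied to it.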
    fn condense_pattern<F>(&mut self, pattern: &impl Pattern, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = pattern.find_all_matches_in_doc(self);

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            remove_indices.extend(m.start + 1..m.end);
            self.tokens[m.start].span = self.tokens[m.start..m.end].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

    fn condense_latin(&mut self) {
        self.condense_pattern(&Self::LATIN_PATTERN.with(|v| v.clone()), |_| {})
    }

    /// Searches for multiple sequential newline tokens and condenses them down
    /// into one.
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more newline tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    // As in `condense_spaces`, the loop advances the cursor;
                    // incrementing again after a merge would skip tokens.
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                    } else {
                        break;
                    }
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    /// Condenses words like "i.e.", "e.g." and "N.S.A." down to single words
    /// using a state machine.
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    fn uncached_ellipsis_pattern() -> Lrc<RepeatingPattern> {
        let period = SequencePattern::default().then_period();
        Lrc::new(RepeatingPattern::new(Box::new(period), 2))
    }

    thread_local! {
        static ELLIPSIS_PATTERN: Lrc<RepeatingPattern> = Document::uncached_ellipsis_pattern();
    }

    fn condense_ellipsis(&mut self) {
        let pattern = Self::ELLIPSIS_PATTERN.with(|v| v.clone());
        self.condense_pattern(&pattern, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }

    fn uncached_contraction_pattern() -> Lrc<SequencePattern> {
        Lrc::new(
            SequencePattern::default()
                .then_any_word()
                .then_apostrophe()
                .then_any_word(),
        )
    }

    thread_local! {
        static CONTRACTION_PATTERN: Lrc<SequencePattern> = Document::uncached_contraction_pattern();
    }

    /// Searches for contractions and condenses them down into single
    /// tokens.
    fn condense_contractions(&mut self) {
        let pattern = Self::CONTRACTION_PATTERN.with(|v| v.clone());

        self.condense_pattern(&pattern, |_| {});
    }
}

/// Creates functions necessary to implement [`TokenStringExt`] on a document.
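///
/// For example, `create_fns_on_doc!(word)` expands to `first_word`,
/// `last_word`, `last_word_index`, `iter_word_indices`, and `iter_words`,
/// each delegating to the method of the same name on `self.tokens`.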
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index >](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [< iter_ $thing _indices >](&self) -> impl Iterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [< iter_ $thing s >](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}

impl TokenStringExt for Document {
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);

    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }
}

impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(&token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::{Span, parsers::MarkdownOptions};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }
}