harper_core/document.rs

use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use harper_brill::{Chunker, Tagger, brill_chunker, brill_tagger};
use paste::paste;

use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::WordSet;
use crate::punctuation::Punctuation;
use crate::spell::{Dictionary, FstDictionary};
use crate::vec_ext::VecExt;
use crate::{FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
use crate::{OrdinalSuffix, Span};

/// A document containing some amount of lexed and parsed English text.
#[derive(Debug, Clone)]
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
    /// Locate all the tokens that intersect a provided span.
    ///
    /// Desperately needs optimization.
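    ///
    /// A short sketch of its use (assuming the usual crate-root re-exports):
    ///
    /// ```
    /// use harper_core::{Document, Span};
    ///
    /// let doc = Document::new_plain_english_curated("Hello there!");
    /// // A span covering the first word intersects at least that word's token.
    /// let indices = doc.token_indices_intersecting(Span::new(0, 5));
    /// assert!(!indices.is_empty());
    /// ```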
    pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

    /// Locate all the tokens that intersect a provided span and convert them to [`FatToken`]s.
    ///
    /// Desperately needs optimization.
    pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
        let indices = self.token_indices_intersecting(span);

        indices
            .into_iter()
            .map(|i| self.tokens[i].to_fat(&self.source))
            .collect()
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and the included curated dictionary.
    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

    /// Parse text to produce a document using the built-in [`PlainEnglish`]
    /// parser and curated dictionary.
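    ///
    /// For example:
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Hello world!");
    /// assert_eq!(doc.get_full_string(), "Hello world!");
    /// ```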
    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

    /// Parse text to produce a document using the built-in [`PlainEnglish`]
    /// parser and a provided dictionary.
    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and curated dictionary.
    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and curated dictionary with the default Markdown configuration.
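    ///
    /// A short sketch of its use; the Markdown parser lexes the text content
    /// of the markup:
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_markdown_default_curated("# A *heading*");
    /// assert!(!doc.get_tokens().is_empty());
    /// ```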
    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

    /// Parse text to produce a document using the built-in [`Markdown`]
    /// parser and a provided dictionary.
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

    /// Parse text to produce a document using the built-in [`Markdown`]
    /// parser and a provided dictionary with the default Markdown configuration.
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

    /// Re-parse important language constructs.
    ///
    /// Should be run after every change to the underlying [`Self::source`].
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.condense_filename_extensions();
        self.match_quotes();

        let token_strings: Vec<_> = self
            .tokens
            .iter()
            .filter(|t| !t.kind.is_whitespace())
            .map(|t| self.get_span_content_str(&t.span))
            .collect();

        let token_tags = brill_tagger().tag_sentence(&token_strings);
        let np_flags = brill_chunker().chunk_sentence(&token_strings, &token_tags);

        let mut i = 0;

        // Annotate word metadata
        for token in self.tokens.iter_mut() {
            if let TokenKind::Word(meta) = &mut token.kind {
                let word_source = token.span.get_content(&self.source);
                let mut found_meta = dictionary.get_word_metadata(word_source).cloned();

                if let Some(inner) = &mut found_meta {
                    inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
                    inner.np_member = Some(np_flags[i]);
                }

                *meta = found_meta;
                i += 1;
            } else if !token.kind.is_whitespace() {
                i += 1;
            }
        }
    }

    /// Convert every run of two or more newlines into a paragraph break.
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind {
                if n >= 2 {
                    token.kind = TokenKind::ParagraphBreak;
                }
            }
        }
    }

    /// Given a list of indices, this function removes the subsequent
    /// `stretch_len - 1` elements after each index.
    ///
    /// Will extend token spans to include removed elements.
    /// Assumes condensed tokens are contiguous in source text.
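    ///
    /// For example, with `indices = [1, 4]` and `stretch_len = 2`, tokens `2`
    /// and `5` are removed, and the spans of tokens `1` and `4` are extended
    /// to cover them.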
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // Update spans
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Trim
        let old = self.tokens.clone();
        self.tokens.clear();

        // Keep first chunk.
        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        // Keep last chunk.
        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }

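    /// Get the token whose span includes the provided char index, if any.
    ///
    /// For example:
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Hello world!");
    /// let tok = doc.get_token_at_char_index(7).unwrap();
    /// assert!(tok.kind.is_word());
    /// ```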
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }

    /// Defensively attempt to grab a specific token.
    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }

    /// Get a token at a signed offset from a base index, or None if out of bounds.
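    ///
    /// For example:
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Foo bar baz");
    /// assert!(doc.get_token_offset(1, -1).is_some());
    /// assert!(doc.get_token_offset(0, -1).is_none());
    /// ```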
    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        match base.checked_add_signed(offset) {
            None => None,
            Some(idx) => self.get_token(idx),
        }
    }

    /// Get an iterator over all the tokens contained in the document.
    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }

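    /// Get an iterator over the noun-phrase chunks identified during parsing.
    /// Leading and trailing whitespace tokens are trimmed from each phrase.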
    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
        fn is_np_member(t: &Token) -> bool {
            t.kind
                .as_word()
                .and_then(|x| x.as_ref())
                .and_then(|w| w.np_member)
                .unwrap_or(false)
        }

        fn trim(slice: &[Token]) -> &[Token] {
            let mut start = 0;
            let mut end = slice.len();
            while start < end && slice[start].kind.is_whitespace() {
                start += 1;
            }
            while end > start && slice[end - 1].kind.is_whitespace() {
                end -= 1;
            }
            &slice[start..end]
        }

        self.tokens
            .as_slice()
            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
            .filter_map(|s| {
                let s = trim(s);
                if s.iter().any(is_np_member) {
                    Some(s)
                } else {
                    None
                }
            })
    }

    /// Get an iterator over all the tokens in the document, converted to [`FatToken`]s.
    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

    /// Get the next or previous word token relative to a base index, if separated by whitespace.
    /// Returns None if the next/previous token is not a word or does not exist.
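    ///
    /// A short sketch of both directions:
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Foo bar baz");
    /// // Step forward over the space following "Foo".
    /// assert!(doc.get_next_word_from_offset(0, 1).is_some());
    /// // Nothing exists before the first token.
    /// assert!(doc.get_next_word_from_offset(0, -1).is_none());
    /// ```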
    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        // Look for whitespace at the expected offset
        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
            return None;
        }
        // Now look beyond the whitespace for a word token
        let word_token = self.get_token_offset(base, offset + offset.signum())?;
        word_token.kind.is_word().then_some(word_token)
    }

    /// Get an iterator over all the tokens in the document, converted to [`FatStringToken`]s.
    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
        self.fat_tokens().map(|t| t.into())
    }

    pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
        span.get_content(&self.source)
    }

    pub fn get_span_content_str(&self, span: &Span<char>) -> String {
        String::from_iter(self.get_span_content(span))
    }

    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(&Span::new(0, self.source.len()))
    }

    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

    /// Searches for quotation marks and fills the
    /// [`Punctuation::Quote::twin_loc`] field. This is on a best-effort
    /// basis.
    ///
    /// Current algorithm is basic and could use some work.
    fn match_quotes(&mut self) {
        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();

        for i in 0..quote_indices.len() / 2 {
            let a_i = quote_indices[i * 2];
            let b_i = quote_indices[i * 2 + 1];

            {
                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
                a.twin_loc = Some(b_i);
            }

            {
                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
                b.twin_loc = Some(a_i);
            }
        }
    }

    /// Searches for number suffixes and condenses them down into single tokens
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            // TODO: Allow spaces between `a` and `b`

            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind) {
                if let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
                {
                    self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                    replace_starts.push(idx);
                }
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

    /// Searches for multiple sequential space tokens and condenses them down
    /// into one.
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more space tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    // Only condense adjacent spans
                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    thread_local! {
        static LATIN_EXPR: Lrc<FirstMatchOf> = Document::uncached_latin_expr();
    }

    fn uncached_latin_expr() -> Lrc<FirstMatchOf> {
        Lrc::new(FirstMatchOf::new(vec![
            Box::new(
                SequenceExpr::default()
                    .then(WordSet::new(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

    /// Assumes that the first matched token is the canonical one to be condensed into.
    /// Takes a callback that can be used to retroactively edit the canonical token afterwards.
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            remove_indices.extend(m.start + 1..m.end);
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

    fn condense_latin(&mut self) {
        self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
    }

    /// Searches for multiple sequential newline tokens and condenses them down
    /// into one.
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more newline tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    /// Condenses words like "i.e.", "e.g." and "N.S.A." down to single words
    /// using a state machine.
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    /// Condenses likely filename extensions down to single tokens.
    fn condense_filename_extensions(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut ext_start = None;

        loop {
            let a = self.get_token_offset(cursor, -2);
            let b = &self.tokens[cursor - 1];
            let c = &self.tokens[cursor];
            let d = self.get_token_offset(cursor, 1);

            let is_ext_chunk = a.is_none_or(|t| t.kind.is_whitespace())
                && b.kind.is_period()
                && c.kind.is_word()
                && c.span.len() <= 3
                && d.is_none_or(|t| t.kind.is_whitespace())
                && {
                    let ext_chars = c.span.get_content(&self.source);
                    ext_chars.iter().all(|c| c.is_ascii_lowercase())
                        || ext_chars.iter().all(|c| c.is_ascii_uppercase())
                };
595
596            if is_ext_chunk {
597                if ext_start.is_none() {
598                    ext_start = Some(cursor - 1);
599                    self.tokens[cursor - 1].kind = TokenKind::Unlintable;
600                } else {
601                    to_remove.push_back(cursor - 1);
602                }
603
604                to_remove.push_back(cursor);
605                cursor += 1;
606            } else {
607                if let Some(start) = ext_start {
608                    let end = self.tokens[cursor - 2].span.end;
609                    let start_tok: &mut Token = &mut self.tokens[start];
610                    start_tok.span.end = end;
611                }
612
613                ext_start = None;
614            }
615
616            cursor += 1;
617
618            if cursor >= self.tokens.len() {
619                break;
620            }
621        }
622
623        self.tokens.remove_indices(to_remove);
624    }
625
626    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
627        let period = SequenceExpr::default().then_period();
628        Lrc::new(Repeating::new(Box::new(period), 2))
629    }
630
631    thread_local! {
632        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
633    }
634
635    fn condense_ellipsis(&mut self) {
636        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
637        self.condense_expr(&expr, |tok| {
638            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
639        });
640    }
641
642    fn uncached_contraction_expr() -> Lrc<SequenceExpr> {
643        Lrc::new(
644            SequenceExpr::default()
645                .then_any_word()
646                .then_apostrophe()
647                .then_any_word(),
648        )
649    }
650
651    thread_local! {
652        static CONTRACTION_EXPR: Lrc<SequenceExpr> = Document::uncached_contraction_expr();
653    }
654
655    /// Searches for contractions and condenses them down into single
656    /// tokens.
657    fn condense_contractions(&mut self) {
658        let expr = Self::CONTRACTION_EXPR.with(|v| v.clone());
659
660        self.condense_expr(&expr, |_| {});
661    }
662}
663
/// Creates functions necessary to implement [`TokenStringExt`] on a document.
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}

impl TokenStringExt for Document {
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);

    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span<char>> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }
}

impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(&token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::{Span, parsers::MarkdownOptions};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn condenses_filename_extensions() {
        let doc = Document::new_plain_english_curated(".c and .exe and .js");
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_filename_extension_ok_at_start_and_end() {
        let doc = Document::new_plain_english_curated(".c and .EXE");
        assert!(doc.tokens.len() == 5);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_mixed_case() {
        let doc = Document::new_plain_english_curated(".c and .Exe");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_non_letters() {
        let doc = Document::new_plain_english_curated(".COM and .C0M");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_longer_than_three() {
        let doc = Document::new_plain_english_curated(".dll and .dlls");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }
}
977}