// harper_core/document.rs

use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use paste::paste;

use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::{
    DocPattern, EitherPattern, Pattern, RepeatingPattern, SequencePattern, WordSet,
};
use crate::punctuation::Punctuation;
use crate::vec_ext::VecExt;
use crate::{Dictionary, FatToken, FstDictionary, Lrc, Token, TokenKind, TokenStringExt};
use crate::{NumberSuffix, Span};

/// A document containing some amount of lexed and parsed English text.
#[derive(Debug, Clone)]
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
    /// Locate all the tokens that intersect a provided span.
    ///
    /// Desperately needs optimization.
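    ///
    /// A minimal sketch of the intended use, assuming `Document` and `Span`
    /// are re-exported at the crate root (the assertion is deliberately loose,
    /// since exact indices depend on tokenization):
    ///
    /// ```
    /// use harper_core::{Document, Span};
    ///
    /// let doc = Document::new_plain_english_curated("There were three little pigs.");
    /// // Chars 0..5 cover the word "There".
    /// let hits = doc.token_indices_intersecting(Span::new(0, 5));
    /// assert!(!hits.is_empty());
    /// ```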
    pub fn token_indices_intersecting(&self, span: Span) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and the included curated dictionary.
    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

    /// Parse text to produce a document using the built-in [`PlainEnglish`]
    /// parser and curated dictionary.
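    ///
    /// A minimal sketch of typical use, assuming `Document` is re-exported at
    /// the crate root:
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("There were three little pigs.");
    /// assert!(!doc.get_tokens().is_empty());
    /// ```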
    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

    /// Parse text to produce a document using the built-in [`PlainEnglish`]
    /// parser and a provided dictionary.
    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and curated dictionary.
    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and curated dictionary with the default markdown configuration.
    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and a provided dictionary.
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and a provided dictionary with the default markdown configuration.
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

    /// Re-parse important language constructs.
    ///
    /// Should be run after every change to the underlying [`Self::source`].
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.match_quotes();

        for token in self.tokens.iter_mut() {
            if let TokenKind::Word(meta) = &mut token.kind {
                let word_source = token.span.get_content(&self.source);
                let found_meta = dictionary
                    .get_correct_capitalization_of(word_source)
                    .map(|canonical_caps| dictionary.get_word_metadata(canonical_caps))
                    .unwrap_or_default();
                *meta = meta.or(&found_meta);
            }
        }
    }

    /// Convert every run of two or more newlines into a paragraph break.
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind {
                if n >= 2 {
                    token.kind = TokenKind::ParagraphBreak;
                }
            }
        }
    }

    /// Given a list of indices, this function removes the subsequent
    /// `stretch_len - 1` elements after each index.
    ///
    /// Will extend token spans to include removed elements.
    /// Assumes condensed tokens are contiguous in source text.
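    ///
    /// For example, with `indices = [1]` and `stretch_len = 2`, the token
    /// stream `[a, b, c, d]` becomes `[a, b', d]`, where `b'` keeps `b`'s kind
    /// and a span stretched to the end of `c`.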
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // Update spans
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1];
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Trim
        let old = self.tokens.clone();
        self.tokens.clear();

        // Keep first chunk.
        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx]);

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        // Keep last chunk.
        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }

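    /// Get the token whose span contains the provided char index, if any.
    ///
    /// A minimal sketch of typical use, assuming `Document` is re-exported at
    /// the crate root:
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("There were three little pigs.");
    /// // Index 2 falls inside the word "There".
    /// let tok = doc.get_token_at_char_index(2).unwrap();
    /// assert!(tok.kind.is_word());
    /// ```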
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(self.tokens[index])
    }

    /// Defensively attempt to grab a specific token.
    pub fn get_token(&self, index: usize) -> Option<Token> {
        self.tokens.get(index).copied()
    }

    /// Get an iterator over all the tokens contained in the document.
    pub fn tokens(&self) -> impl Iterator<Item = Token> + '_ {
        self.tokens.iter().copied()
    }

    /// Get an iterator over all the [`FatToken`]s contained in the document.
    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

    pub fn get_span_content(&self, span: Span) -> &[char] {
        span.get_content(&self.source)
    }

    pub fn get_span_content_str(&self, span: Span) -> String {
        String::from_iter(self.get_span_content(span))
    }

    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(Span {
            start: 0,
            end: self.source.len(),
        })
    }

    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

    /// Searches for quotation marks and fills the
    /// [`Punctuation::Quote::twin_loc`] field. This is on a best-effort
    /// basis.
    ///
    /// Current algorithm is basic and could use some work.
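    ///
    /// For example, in `"pigs" everywhere`, the opening and closing quote
    /// tokens are left pointing at each other's token indices.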
    fn match_quotes(&mut self) {
        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();

        for i in 0..quote_indices.len() / 2 {
            let a_i = quote_indices[i * 2];
            let b_i = quote_indices[i * 2 + 1];

            {
                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
                a.twin_loc = Some(b_i);
            }

            {
                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
                b.twin_loc = Some(a_i);
            }
        }
    }

    /// Searches for number suffixes and condenses them down into single
    /// tokens.
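    ///
    /// For example, `1st` lexes as the number `1` followed by the word `st`;
    /// this pass folds the pair into a single number token carrying the
    /// parsed suffix.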
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = self.tokens[idx + 1];
            let a = self.tokens[idx];

            // TODO: Allow spaces between `a` and `b`

            if let (TokenKind::Number(..), TokenKind::Word(..)) = (a.kind, b.kind) {
                if let Some(found_suffix) = NumberSuffix::from_chars(self.get_span_content(b.span))
                {
                    self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                    replace_starts.push(idx);
                }
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

    /// Searches for multiple sequential space tokens and condenses them down
    /// into one.
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more space tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    // Only condense adjacent spans
                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                    } else {
                        break;
                    }
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    thread_local! {
        static LATIN_PATTERN: Lrc<EitherPattern> = Document::uncached_latin_pattern();
    }

    fn uncached_latin_pattern() -> Lrc<EitherPattern> {
        Lrc::new(EitherPattern::new(vec![
            Box::new(
                SequencePattern::default()
                    .then_word_set(WordSet::all(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequencePattern::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

    /// Assumes that the first matched token is the canonical one to be condensed into.
    /// Takes a callback that can be used to retroactively edit the canonical token afterwards.
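    ///
    /// [`Self::condense_ellipsis`] is a typical caller: the repeated-period
    /// pattern matches a run of tokens, the run is collapsed into its first
    /// token, and the callback rewrites that token's kind to an ellipsis.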
    fn condense_pattern<F>(&mut self, pattern: &impl Pattern, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = pattern.find_all_matches_in_doc(self);

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            remove_indices.extend(m.start + 1..m.end);
            self.tokens[m.start].span = self.tokens[m.start..m.end].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

    fn condense_latin(&mut self) {
        self.condense_pattern(&Self::LATIN_PATTERN.with(|v| v.clone()), |_| {})
    }

    /// Searches for multiple sequential newline tokens and condenses them down
    /// into one.
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more newline tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                    } else {
                        break;
                    }
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    /// Condenses words like "i.e.", "e.g." and "N.S.A." down to single words
    /// using a state machine.
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut initialism_start = None;

        loop {
            let a = self.tokens[cursor - 1];
            let b = self.tokens[cursor];

            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    fn uncached_ellipsis_pattern() -> Lrc<RepeatingPattern> {
        let period = SequencePattern::default().then_period();
        Lrc::new(RepeatingPattern::new(Box::new(period), 2))
    }

    thread_local! {
        static ELLIPSIS_PATTERN: Lrc<RepeatingPattern> = Document::uncached_ellipsis_pattern();
    }

    fn condense_ellipsis(&mut self) {
        let pattern = Self::ELLIPSIS_PATTERN.with(|v| v.clone());
        self.condense_pattern(&pattern, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }

    fn uncached_contraction_pattern() -> Lrc<SequencePattern> {
        Lrc::new(
            SequencePattern::default()
                .then_any_word()
                .then_apostrophe()
                .then_any_word(),
        )
    }

    thread_local! {
        static CONTRACTION_PATTERN: Lrc<SequencePattern> = Document::uncached_contraction_pattern();
    }

    /// Searches for contractions and condenses them down into single
    /// tokens.
    fn condense_contractions(&mut self) {
        let pattern = Self::CONTRACTION_PATTERN.with(|v| v.clone());

        self.condense_pattern(&pattern, |_| {});
    }
}

/// Creates functions necessary to implement [`TokenStringExt`] on a document.
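/// For example, `create_fns_on_doc!(word)` emits `first_word`, `last_word`,
/// `last_word_index`, `iter_word_indices` and `iter_words`, each delegating to
/// the corresponding method on the underlying token buffer.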
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl Iterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}

impl TokenStringExt for Document {
    create_fns_on_doc!(word);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(word_like);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(space);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(number);
    create_fns_on_doc!(at);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(comma);

    fn first_sentence_word(&self) -> Option<Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }
}

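// The `Display` implementation below reassembles the text covered by each
// token's span, so with a lossless parser such as `PlainEnglish` the output
// should match the original source.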
impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::{parsers::MarkdownOptions, Span};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }
}