harper_core/document.rs

use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use paste::paste;

use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::{
    DocPattern, EitherPattern, Pattern, RepeatingPattern, SequencePattern, WordSet,
};
use crate::punctuation::Punctuation;
use crate::vec_ext::VecExt;
use crate::{Dictionary, FatToken, FstDictionary, Lrc, Token, TokenKind, TokenStringExt};
use crate::{NumberSuffix, Span};

/// A document containing some amount of lexed and parsed English text.
#[derive(Debug, Clone)]
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
    /// Locate all the tokens that intersect a provided span.
    ///
    /// Desperately needs optimization.
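    ///
    /// A minimal usage sketch (this doc-test assumes `Document` and `Span`
    /// are re-exported at the crate root):
    ///
    /// ```
    /// use harper_core::{Document, Span};
    ///
    /// let doc = Document::new_plain_english_curated("Hello world");
    /// // Every token overlapping the first five characters ("Hello").
    /// let indices = doc.token_indices_intersecting(Span::new(0, 5));
    /// assert!(!indices.is_empty());
    /// ```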
    pub fn token_indices_intersecting(&self, span: Span) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and the included curated dictionary.
    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

    /// Parse text to produce a document using the built-in [`PlainEnglish`]
    /// parser and curated dictionary.
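    ///
    /// A minimal usage sketch (assumes crate-root re-exports of `Document`
    /// and `TokenStringExt`):
    ///
    /// ```
    /// use harper_core::{Document, TokenStringExt};
    ///
    /// let doc = Document::new_plain_english_curated("There were three little pigs.");
    /// assert!(doc.first_word().is_some());
    /// ```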
    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

    /// Parse text to produce a document using the built-in [`PlainEnglish`]
    /// parser and a provided dictionary.
    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and curated dictionary.
    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and curated dictionary with the default Markdown configuration.
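    ///
    /// A minimal sketch (assumes a crate-root re-export of `Document`):
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_markdown_default_curated("*Hello* world");
    /// // The full source text, including markup, is retained.
    /// assert!(doc.get_full_string().contains("Hello"));
    /// ```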
    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

    /// Parse text to produce a document using the built-in [`Markdown`]
    /// parser and a provided dictionary.
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

    /// Parse text to produce a document using the built-in [`Markdown`]
    /// parser and a provided dictionary, with the default Markdown
    /// configuration.
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

    /// Re-parse important language constructs.
    ///
    /// Should be run after every change to the underlying [`Self::source`].
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.match_quotes();
        self.articles_imply_nouns();

        // Annotate every word token with metadata from the provided dictionary.
        for token in self.tokens.iter_mut() {
            if let TokenKind::Word(meta) = &mut token.kind {
                let word_source = token.span.get_content(&self.source);
                let found_meta = dictionary.get_word_metadata(word_source);
                *meta = found_meta.cloned();
            }
        }
    }

    fn uncached_article_pattern() -> Lrc<SequencePattern> {
        Lrc::new(
            SequencePattern::default()
                .then_determiner()
                .then_whitespace()
                .then(|t: &Token, _source: &[char]| t.kind.is_adjective() && t.kind.is_noun())
                .then_whitespace()
                .then_noun(),
        )
    }

    thread_local! {
        static ARTICLE_PATTERN: Lrc<SequencePattern> = Document::uncached_article_pattern();
    }

    /// When a word that can act as either an adjective or a noun is sandwiched
    /// between an article and a noun, it is behaving as an adjective, so we
    /// clear its noun and verb metadata.
    fn articles_imply_nouns(&mut self) {
        let pattern = Self::ARTICLE_PATTERN.with(|v| v.clone());

        for m in pattern.find_all_matches_in_doc(self) {
            // `m.start + 2` is the sandwiched word: determiner, whitespace, word.
            if let TokenKind::Word(Some(metadata)) = &mut self.tokens[m.start + 2].kind {
                metadata.noun = None;
                metadata.verb = None;
            }
        }
    }

    /// Convert every run of two or more newlines into a paragraph break.
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind {
                if n >= 2 {
                    token.kind = TokenKind::ParagraphBreak;
                }
            }
        }
    }

    /// Given a list of indices, this function removes the subsequent
    /// `stretch_len - 1` elements after each index.
    ///
    /// Will extend token spans to include removed elements.
    /// Assumes condensed tokens are contiguous in source text.
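    ///
    /// For example (hypothetical values): with `indices = [1, 4]` and
    /// `stretch_len = 2`, tokens `2` and `5` are removed, while tokens `1`
    /// and `4` have their spans stretched over the removed neighbors.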
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // Update spans: each starting token absorbs the span of the last
        // token in its stretch.
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Trim
        let old = self.tokens.clone();
        self.tokens.clear();

        // Keep first chunk. When `indices` is empty, `indices.len()` is zero,
        // so this and the final extend below copy the stream unchanged.
        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        // Keep each condensed token, plus everything between the end of its
        // stretch and the next condensed token.
        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        // Keep last chunk.
        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }
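
    /// Get the token, if any, whose span contains the given char index.
    ///
    /// A minimal usage sketch (assumes a crate-root re-export of `Document`):
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Hello world");
    /// // Char 7 sits inside the word "world".
    /// let tok = doc.get_token_at_char_index(7).unwrap();
    /// assert!(tok.kind.is_word());
    /// ```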
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        // Tokens are sorted by their spans, so binary search for one that
        // overlaps a one-character probe span at `char_index`.
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }

    /// Defensively attempt to grab a specific token.
    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }

    /// Get an iterator over all the tokens contained in the document.
    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }

    /// Get an iterator over all the tokens contained in the document as
    /// [`FatToken`]s, which carry their source text with them.
    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

    pub fn get_span_content(&self, span: &Span) -> &[char] {
        span.get_content(&self.source)
    }

    pub fn get_span_content_str(&self, span: &Span) -> String {
        String::from_iter(self.get_span_content(span))
    }

    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(&Span {
            start: 0,
            end: self.source.len(),
        })
    }

    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

    /// Searches for quotation marks and fills the
    /// [`Punctuation::Quote::twin_loc`] field. This is on a best-effort
    /// basis: quotes are simply paired off in the order they appear (first
    /// with second, third with fourth, and so on).
    ///
    /// Current algorithm is basic and could use some work.
    fn match_quotes(&mut self) {
        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();

        for i in 0..quote_indices.len() / 2 {
            let a_i = quote_indices[i * 2];
            let b_i = quote_indices[i * 2 + 1];

            {
                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
                a.twin_loc = Some(b_i);
            }

            {
                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
                b.twin_loc = Some(a_i);
            }
        }
    }

    /// Searches for number suffixes and condenses them down into single
    /// tokens.
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            // TODO: Allow spaces between `a` and `b`

            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind) {
                if let Some(found_suffix) = NumberSuffix::from_chars(self.get_span_content(&b.span))
                {
                    self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                    replace_starts.push(idx);
                }
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

    /// Searches for multiple sequential space tokens and condenses them down
    /// into one.
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more space tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    // Only condense adjacent spans
                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                    } else {
                        break;
                    }
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    thread_local! {
        static LATIN_PATTERN: Lrc<EitherPattern> = Document::uncached_latin_pattern();
    }

    fn uncached_latin_pattern() -> Lrc<EitherPattern> {
        Lrc::new(EitherPattern::new(vec![
            Box::new(
                SequencePattern::default()
                    .then(WordSet::new(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequencePattern::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

    /// Assumes that the first matched token is the canonical one to be condensed into.
    /// Takes a callback that can be used to retroactively edit the canonical token afterwards.
    fn condense_pattern<F>(&mut self, pattern: &impl Pattern, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = pattern.find_all_matches_in_doc(self);

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            // Drop every matched token but the first, which absorbs the span
            // of the whole match before the callback edits it.
            remove_indices.extend(m.start + 1..m.end);
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

    fn condense_latin(&mut self) {
        self.condense_pattern(&Self::LATIN_PATTERN.with(|v| v.clone()), |_| {})
    }

    /// Searches for multiple sequential newline tokens and condenses them down
    /// into one.
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more newline tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                    } else {
                        break;
                    }
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    /// Condenses words like "i.e.", "e.g." and "N.S.A." down to single words
    /// using a state machine.
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            // A chunk is a single-letter word immediately followed by a period.
            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    // The first chunk's word token becomes the canonical token.
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                // The initialism (if any) has ended: stretch the canonical
                // token's span over every chunk that followed it.
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    fn uncached_ellipsis_pattern() -> Lrc<RepeatingPattern> {
        // Two or more consecutive periods form an ellipsis.
        let period = SequencePattern::default().then_period();
        Lrc::new(RepeatingPattern::new(Box::new(period), 2))
    }

    thread_local! {
        static ELLIPSIS_PATTERN: Lrc<RepeatingPattern> = Document::uncached_ellipsis_pattern();
    }

    fn condense_ellipsis(&mut self) {
        let pattern = Self::ELLIPSIS_PATTERN.with(|v| v.clone());
        self.condense_pattern(&pattern, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }

    fn uncached_contraction_pattern() -> Lrc<SequencePattern> {
        Lrc::new(
            SequencePattern::default()
                .then_any_word()
                .then_apostrophe()
                .then_any_word(),
        )
    }

    thread_local! {
        static CONTRACTION_PATTERN: Lrc<SequencePattern> = Document::uncached_contraction_pattern();
    }

    /// Searches for contractions and condenses them down into single
    /// tokens.
    fn condense_contractions(&mut self) {
        let pattern = Self::CONTRACTION_PATTERN.with(|v| v.clone());

        self.condense_pattern(&pattern, |_| {});
    }
}

/// Creates the functions necessary to implement [`TokenStringExt`] on a document.
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index >](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [< iter_ $thing _indices >](&self) -> impl Iterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [< iter_ $thing s >](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}
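
// For example, `create_fns_on_doc!(word)` expands to `first_word`,
// `last_word`, `last_word_index`, `iter_word_indices`, and `iter_words`,
// each delegating to the method of the same name on the token slice.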

impl TokenStringExt for Document {
    create_fns_on_doc!(word);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(word_like);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(space);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(number);
    create_fns_on_doc!(at);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(preposition);

    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }
}

// `Display` writes out each token's source span in order, reconstructing
// the text the tokens cover.
impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(&token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::{Span, parsers::MarkdownOptions};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        // Char index 19 falls inside the word "little", which spans chars 17..23.
        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }
}