harper_core/document.rs

use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use paste::paste;

use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::{
    DocPattern, EitherPattern, Pattern, RepeatingPattern, SequencePattern, WordSet,
};
use crate::punctuation::Punctuation;
use crate::vec_ext::VecExt;
use crate::{Dictionary, FatToken, FstDictionary, Lrc, Token, TokenKind, TokenStringExt};
use crate::{NumberSuffix, Span};

/// A document containing some amount of lexed and parsed English text.
#[derive(Debug, Clone)]
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
    /// Locate all the tokens that intersect a provided span.
    ///
    /// Desperately needs optimization.
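    ///
    /// A small sketch of the intended behavior (the exact indices depend on
    /// how the parser tokenizes the text):
    ///
    /// ```
    /// use harper_core::{Document, Span};
    ///
    /// let doc = Document::new_plain_english_curated("Hello there!");
    /// // A span covering the first word intersects at least one token.
    /// assert!(!doc.token_indices_intersecting(Span::new(0, 5)).is_empty());
    /// ```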
    pub fn token_indices_intersecting(&self, span: Span) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
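    ///
    /// A small usage sketch:
    ///
    /// ```
    /// use harper_core::parsers::PlainEnglish;
    /// use harper_core::{Document, FstDictionary};
    ///
    /// let doc = Document::new("Hello world.", &PlainEnglish, &FstDictionary::curated());
    /// assert_eq!(doc.get_full_string(), "Hello world.");
    /// ```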
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and the included curated dictionary.
    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

    /// Parse text to produce a document using the built-in [`PlainEnglish`]
    /// parser and curated dictionary.
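    ///
    /// The most convenient constructor for plain-text input:
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Hello world.");
    /// assert_eq!(doc.get_full_string(), "Hello world.");
    /// ```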
    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

    /// Parse text to produce a document using the built-in [`PlainEnglish`]
    /// parser and a provided dictionary.
    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and curated dictionary.
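    ///
    /// A sketch using the default markdown configuration:
    ///
    /// ```
    /// use harper_core::parsers::MarkdownOptions;
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_markdown_curated("# A Heading", MarkdownOptions::default());
    /// assert!(!doc.get_tokens().is_empty());
    /// ```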
    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and curated dictionary with the default markdown configuration.
    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and a provided dictionary.
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and a provided dictionary, with the default markdown configuration.
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

    /// Re-parse important language constructs.
    ///
    /// Should be run after every change to the underlying [`Self::source`].
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.match_quotes();

        for token in self.tokens.iter_mut() {
            if let TokenKind::Word(meta) = &mut token.kind {
                let word_source = token.span.get_content(&self.source);
                let found_meta = dictionary.get_word_metadata(word_source);
                *meta = meta.or(&found_meta);
            }
        }
    }

    /// Convert runs of two or more newlines into paragraph breaks.
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind {
                if n >= 2 {
                    token.kind = TokenKind::ParagraphBreak;
                }
            }
        }
    }

    /// Given a list of indices, this function removes the subsequent
    /// `stretch_len - 1` elements after each index.
    ///
    /// Will extend token spans to include removed elements.
    /// Assumes condensed tokens are contiguous in source text.
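    ///
    /// For example, with `indices = [1, 4]` and `stretch_len = 2`, the token
    /// stream `[a, b, c, d, e, f]` becomes `[a, bc, d, ef]`, where `bc` and
    /// `ef` keep the first token's kind but cover the merged spans.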
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // Update spans
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1];
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Trim
        let old = self.tokens.clone();
        self.tokens.clear();

        // Keep first chunk.
        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx]);

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        // Keep last chunk.
        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }

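    /// Get the token whose span contains the provided char index, if any.
    ///
    /// A small sketch of the intended behavior:
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Hello world.");
    /// let tok = doc.get_token_at_char_index(1).unwrap();
    /// assert!(tok.kind.is_word());
    /// ```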
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(self.tokens[index])
    }

    /// Defensively attempt to grab a specific token.
    pub fn get_token(&self, index: usize) -> Option<Token> {
        self.tokens.get(index).copied()
    }

    /// Get an iterator over all the tokens contained in the document.
    pub fn tokens(&self) -> impl Iterator<Item = Token> + '_ {
        self.tokens.iter().copied()
    }

    /// Get an iterator over all the fat tokens contained in the document.
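    ///
    /// Each fat token corresponds to exactly one token in the document:
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Hi.");
    /// assert_eq!(doc.fat_tokens().count(), doc.get_tokens().len());
    /// ```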
    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

    pub fn get_span_content(&self, span: Span) -> &[char] {
        span.get_content(&self.source)
    }

    pub fn get_span_content_str(&self, span: Span) -> String {
        String::from_iter(self.get_span_content(span))
    }

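    /// Reconstruct the entire source text as a `String`.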
    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(Span {
            start: 0,
            end: self.source.len(),
        })
    }

    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

    /// Searches for quotation marks and fills the
    /// [`Punctuation::Quote::twin_loc`] field. This is on a best-effort
    /// basis.
    ///
    /// Current algorithm is basic and could use some work.
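    ///
    /// Quotes are paired in the order they appear, so in `"a" "b"` the first
    /// and second quote tokens become twins, as do the third and fourth.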
    fn match_quotes(&mut self) {
        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();

        for i in 0..quote_indices.len() / 2 {
            let a_i = quote_indices[i * 2];
            let b_i = quote_indices[i * 2 + 1];

            {
                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
                a.twin_loc = Some(b_i);
            }

            {
                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
                b.twin_loc = Some(a_i);
            }
        }
    }

    /// Searches for number suffixes and condenses them down into single tokens
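    /// (e.g. `1st` lexes as a number token followed by a word token, and is
    /// condensed into a single number token that records the suffix).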
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = self.tokens[idx + 1];
            let a = self.tokens[idx];

            // TODO: Allow spaces between `a` and `b`

            if let (TokenKind::Number(..), TokenKind::Word(..)) = (a.kind, b.kind) {
                if let Some(found_suffix) = NumberSuffix::from_chars(self.get_span_content(b.span))
                {
                    self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                    replace_starts.push(idx);
                }
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

    /// Searches for multiple sequential space tokens and condenses them down
    /// into one.
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more space tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    // Only condense adjacent spans
                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    thread_local! {
        static LATIN_PATTERN: Lrc<EitherPattern> = Document::uncached_latin_pattern();
    }

    fn uncached_latin_pattern() -> Lrc<EitherPattern> {
        Lrc::new(EitherPattern::new(vec![
            Box::new(
                SequencePattern::default()
                    .then_word_set(WordSet::all(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequencePattern::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

    /// Assumes that the first matched token is the canonical one to be condensed into.
    /// Takes a callback that can be used to retroactively edit the canonical token afterwards.
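    ///
    /// [`Self::condense_ellipsis`], for example, uses this to merge a run of
    /// period tokens into a single ellipsis token.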
    fn condense_pattern<F>(&mut self, pattern: &impl Pattern, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = pattern.find_all_matches_in_doc(self);

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            remove_indices.extend(m.start + 1..m.end);
            // Grow the canonical token's span to cover the whole match.
            self.tokens[m.start].span = self.tokens[m.start..m.end].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

    fn condense_latin(&mut self) {
        self.condense_pattern(&Self::LATIN_PATTERN.with(|v| v.clone()), |_| {})
    }

    /// Searches for multiple sequential newline tokens and condenses them down
    /// into one.
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more newline tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    /// Condenses words like "i.e.", "e.g." and "N.S.A." down to single words
    /// using a state machine.
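    ///
    /// A single-letter word followed by a period extends the current
    /// initialism; any other token ends it and the merged span is flushed.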
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut initialism_start = None;

        loop {
            let a = self.tokens[cursor - 1];
            let b = self.tokens[cursor];

            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    fn uncached_ellipsis_pattern() -> Lrc<RepeatingPattern> {
        let period = SequencePattern::default().then_period();
        Lrc::new(RepeatingPattern::new(Box::new(period), 2))
    }

    thread_local! {
        static ELLIPSIS_PATTERN: Lrc<RepeatingPattern> = Document::uncached_ellipsis_pattern();
    }

    fn condense_ellipsis(&mut self) {
        let pattern = Self::ELLIPSIS_PATTERN.with(|v| v.clone());
        self.condense_pattern(&pattern, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }

    fn uncached_contraction_pattern() -> Lrc<SequencePattern> {
        Lrc::new(
            SequencePattern::default()
                .then_any_word()
                .then_apostrophe()
                .then_any_word(),
        )
    }

    thread_local! {
        static CONTRACTION_PATTERN: Lrc<SequencePattern> = Document::uncached_contraction_pattern();
    }

    /// Searches for contractions and condenses them down into single
    /// tokens.
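    ///
    /// For example, `isn't` lexes as a word, an apostrophe, and another word,
    /// and is condensed into a single word token.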
    fn condense_contractions(&mut self) {
        let pattern = Self::CONTRACTION_PATTERN.with(|v| v.clone());

        self.condense_pattern(&pattern, |_| {});
    }
}

/// Creates functions necessary to implement [`TokenStringExt`] on a document.
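///
/// For example, `create_fns_on_doc!(word)` generates `first_word`,
/// `last_word`, `last_word_index`, `iter_word_indices` and `iter_words`,
/// each delegating to the document's token buffer.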
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl Iterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}

impl TokenStringExt for Document {
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(space);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(number);
    create_fns_on_doc!(at);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(comma);

    fn first_sentence_word(&self) -> Option<Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }
}

impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::{parsers::MarkdownOptions, Span};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }
}