harper_core/
document.rs

use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
use paste::paste;

use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::WordSet;
use crate::punctuation::Punctuation;
use crate::spell::{Dictionary, FstDictionary};
use crate::vec_ext::VecExt;
use crate::{CharStringExt, FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
use crate::{OrdinalSuffix, Span};

/// A document containing some amount of lexed and parsed English text.
#[derive(Debug, Clone)]
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
    /// Locate all the tokens that intersect a provided span.
    ///
    /// Desperately needs optimization.
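    ///
    /// A minimal sketch of typical usage (assuming `Document` and `Span` are
    /// re-exported from the crate root, as they are elsewhere in this crate):
    ///
    /// ```
    /// use harper_core::{Document, Span};
    ///
    /// let doc = Document::new_plain_english_curated("There were three little pigs.");
    /// let indices = doc.token_indices_intersecting(Span::new(0, 5));
    /// assert!(!indices.is_empty());
    /// ```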
    pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

    /// Locate all the tokens that intersect a provided span and convert them to [`FatToken`]s.
    ///
    /// Desperately needs optimization.
    pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
        let indices = self.token_indices_intersecting(span);

        indices
            .into_iter()
            .map(|i| self.tokens[i].to_fat(&self.source))
            .collect()
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
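    ///
    /// A minimal construction sketch (assuming the `parsers` and `spell`
    /// modules are public, as the imports above suggest):
    ///
    /// ```
    /// use harper_core::Document;
    /// use harper_core::parsers::PlainEnglish;
    /// use harper_core::spell::FstDictionary;
    ///
    /// let doc = Document::new("Hello there!", &PlainEnglish, &FstDictionary::curated());
    /// assert!(!doc.get_tokens().is_empty());
    /// ```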
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and the included curated dictionary.
    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

    /// Parse text to produce a document using the built-in [`PlainEnglish`]
    /// parser and curated dictionary.
    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

    /// Parse text to produce a document using the built-in [`PlainEnglish`]
    /// parser and a provided dictionary.
    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and curated dictionary.
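    ///
    /// A small Markdown sketch (module paths assumed public, as above):
    ///
    /// ```
    /// use harper_core::Document;
    /// use harper_core::parsers::MarkdownOptions;
    ///
    /// let doc = Document::new_markdown_curated("A *formatted* sentence.", MarkdownOptions::default());
    /// assert!(!doc.get_tokens().is_empty());
    /// ```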
    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and curated dictionary with the default Markdown configuration.
    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

    /// Parse text to produce a document using the built-in [`Markdown`]
    /// parser and a provided dictionary.
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and a provided dictionary, with the default Markdown configuration.
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

    /// Re-parse important language constructs.
    ///
    /// Should be run after every change to the underlying [`Self::source`].
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.condense_filename_extensions();
        self.condense_tldr();
        self.condense_ampersand_pairs();
        self.match_quotes();

        let chunker = burn_chunker();
        let tagger = brill_tagger();

        for sent in self.tokens.iter_sentences_mut() {
            let token_strings: Vec<_> = sent
                .iter()
                .filter(|t| !t.kind.is_whitespace())
                .map(|t| t.span.get_content_string(&self.source))
                .collect();

            let token_tags = tagger.tag_sentence(&token_strings);
            let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);

            let mut i = 0;

            // Annotate word metadata
            for token in sent.iter_mut() {
                if let TokenKind::Word(meta) = &mut token.kind {
                    let word_source = token.span.get_content(&self.source);
                    let mut found_meta = dictionary.get_word_metadata(word_source).cloned();

                    if let Some(inner) = &mut found_meta {
                        inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
                        inner.np_member = Some(np_flags[i]);
                    }

                    *meta = found_meta;
                    i += 1;
                } else if !token.kind.is_whitespace() {
                    i += 1;
                }
            }
        }
    }

    /// Convert every run of two or more newlines into a paragraph break.
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind
                && n >= 2
            {
                token.kind = TokenKind::ParagraphBreak;
            }
        }
    }

    /// Given a list of indices, this function removes the subsequent
    /// `stretch_len - 1` elements after each index.
    ///
    /// Will extend token spans to include removed elements.
    /// Assumes condensed tokens are contiguous in source text.
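    ///
    /// For example, with `indices = [1, 4]` and `stretch_len = 2`, the tokens
    /// at indices 2 and 5 are removed, and the tokens at indices 1 and 4 grow
    /// to cover the removed spans.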
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // Update spans
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Trim
        let old = self.tokens.clone();
        self.tokens.clear();

        // Keep first chunk.
        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        // Keep last chunk.
        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }

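    /// Find the token, if any, whose span contains the given character index.
    ///
    /// A small sketch mirroring the unit tests below:
    ///
    /// ```
    /// use harper_core::{Document, Span};
    ///
    /// let text = "There were three little pigs. They built three little homes.";
    /// let doc = Document::new_plain_english_curated(text);
    /// let tok = doc.get_token_at_char_index(19).unwrap();
    /// assert_eq!(tok.span, Span::new(17, 23));
    /// ```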
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }

    /// Defensively attempt to grab a specific token.
    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }

    /// Get a token at a signed offset from a base index, or None if out of bounds.
    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        match base.checked_add_signed(offset) {
            None => None,
            Some(idx) => self.get_token(idx),
        }
    }

    /// Get an iterator over all the tokens contained in the document.
    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }

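    /// Iterate over the chunker-identified nominal phrases in the document,
    /// yielding each one as a slice of tokens with surrounding whitespace
    /// trimmed off.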
    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
        fn is_np_member(t: &Token) -> bool {
            t.kind
                .as_word()
                .and_then(|x| x.as_ref())
                .and_then(|w| w.np_member)
                .unwrap_or(false)
        }

        fn trim(slice: &[Token]) -> &[Token] {
            let mut start = 0;
            let mut end = slice.len();
            while start < end && slice[start].kind.is_whitespace() {
                start += 1;
            }
            while end > start && slice[end - 1].kind.is_whitespace() {
                end -= 1;
            }
            &slice[start..end]
        }

        self.tokens
            .as_slice()
            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
            .filter_map(|s| {
                let s = trim(s);
                if s.iter().any(is_np_member) {
                    Some(s)
                } else {
                    None
                }
            })
    }

    /// Get an iterator over all the tokens in the document, converted to [`FatToken`]s.
    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

    /// Get the next or previous word token relative to a base index, if separated by whitespace.
    /// Returns None if the next/previous token is not a word or does not exist.
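    ///
    /// A small sketch mirroring the unit tests below:
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Foo bar baz");
    /// let bar = doc.get_next_word_from_offset(0, 1).unwrap();
    /// assert_eq!(doc.get_span_content(&bar.span), ['b', 'a', 'r']);
    /// ```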
    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        // Look for whitespace at the expected offset
        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
            return None;
        }
        // Now look beyond the whitespace for a word token
        let word_token = self.get_token_offset(base, offset + offset.signum());
        let word_token = word_token?;
        word_token.kind.is_word().then_some(word_token)
    }

    /// Get an iterator over all the tokens in the document, converted to [`FatStringToken`]s.
    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
        self.fat_tokens().map(|t| t.into())
    }

    pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
        span.get_content(&self.source)
    }

    pub fn get_span_content_str(&self, span: &Span<char>) -> String {
        String::from_iter(self.get_span_content(span))
    }

    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(&Span::new(0, self.source.len()))
    }

    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

    /// Searches for quotation marks and fills the
    /// [`Punctuation::Quote::twin_loc`] field. This is on a best-effort
    /// basis.
    ///
    /// Current algorithm is basic and could use some work.
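    ///
    /// Quotes are paired greedily in document order: the first quote found is
    /// matched with the second, the third with the fourth, and so on.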
    fn match_quotes(&mut self) {
        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();

        for i in 0..quote_indices.len() / 2 {
            let a_i = quote_indices[i * 2];
            let b_i = quote_indices[i * 2 + 1];

            {
                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
                a.twin_loc = Some(b_i);
            }

            {
                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
                b.twin_loc = Some(a_i);
            }
        }
    }

    /// Searches for number suffixes and condenses them down into single tokens
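    ///
    /// For example, `1st` ends up as a single number token carrying an
    /// [`OrdinalSuffix`] rather than a number token followed by a word token.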
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            // TODO: Allow spaces between `a` and `b`

            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind)
                && let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
            {
                self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                replace_starts.push(idx);
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

    /// Searches for multiple sequential space tokens and condenses them down
    /// into one.
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more space tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    // Only condense adjacent spans
                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    thread_local! {
        static LATIN_EXPR: Lrc<FirstMatchOf> = Document::uncached_latin_expr();
    }

    fn uncached_latin_expr() -> Lrc<FirstMatchOf> {
        Lrc::new(FirstMatchOf::new(vec![
            Box::new(
                SequenceExpr::default()
                    .then(WordSet::new(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

    /// Assumes that the first matched token is the canonical one to be condensed into.
    /// Takes a callback that can be used to retroactively edit the canonical token afterwards.
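    ///
    /// For example, [`Self::condense_contractions`] matches a word, an
    /// apostrophe, and another word, then collapses the three tokens into a
    /// single word token spanning the whole contraction.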
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            remove_indices.extend(m.start + 1..m.end);
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

    fn condense_latin(&mut self) {
        self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
    }

    /// Searches for multiple sequential newline tokens and condenses them down
    /// into one.
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more newline tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    /// Condenses words like "i.e.", "e.g." and "N.S.A." down to single words
    /// using a state machine.
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    /// Condenses likely filename extensions down to single tokens.
    fn condense_filename_extensions(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut ext_start = None;

        loop {
            // left context, dot, extension, right context
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let x = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            let is_ext_chunk = d.kind.is_period()
                && x.kind.is_word()
                && x.span.len() <= 3
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())))
                && {
                    let ext_chars = x.span.get_content(&self.source);
                    ext_chars.iter().all(|c| c.is_ascii_lowercase())
                        || ext_chars.iter().all(|c| c.is_ascii_uppercase())
                };

            if is_ext_chunk {
                if ext_start.is_none() {
                    ext_start = Some(cursor - 1);
                    self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = ext_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                ext_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    /// Condenses "tl;dr" down to a single word token.
    fn condense_tldr(&mut self) {
        if self.tokens.len() < 3 {
            return;
        }

        let mut to_remove = VecDeque::new();
        let mut cursor = 2;

        loop {
            let tl = &self.tokens[cursor - 2];
            let semicolon = &self.tokens[cursor - 1];
            let dr = &self.tokens[cursor];

            let is_tldr_chunk = tl.kind.is_word()
                && tl.span.len() == 2
                && tl
                    .span
                    .get_content(&self.source)
                    .eq_ignore_ascii_case_chars(&['t', 'l'])
                && semicolon.kind.is_semicolon()
                && dr.kind.is_word()
                && dr.span.len() >= 2
                && dr.span.len() <= 3
                && dr
                    .span
                    .get_content(&self.source)
                    .eq_any_ignore_ascii_case_chars(&[&['d', 'r'], &['d', 'r', 's']]);

            if is_tldr_chunk {
                // Update the first token to be the full "tl;dr" as a word
                self.tokens[cursor - 2].span = Span::new(
                    self.tokens[cursor - 2].span.start,
                    self.tokens[cursor].span.end,
                );

                // Mark the semicolon and "dr" tokens for removal
                to_remove.push_back(cursor - 1);
                to_remove.push_back(cursor);
            }

            // Skip ahead since we've processed these tokens
            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        // Remove the marked tokens in reverse order to maintain correct indices
        self.tokens.remove_indices(to_remove);
    }

    /// Condenses "R&D" or "Q&A" down to a single word token.
    fn condense_ampersand_pairs(&mut self) {
        if self.tokens.len() < 3 {
            return;
        }

        let mut to_remove = VecDeque::new();
        // Start at the third token so we can look back at the previous two.
        let mut cursor = 2;

        loop {
            let l1 = &self.tokens[cursor - 2];
            let and = &self.tokens[cursor - 1];
            let l2 = &self.tokens[cursor];

            let is_letter_amp_letter_chunk = l1.kind.is_word()
                && l1.span.len() == 1
                && and.kind.is_ampersand()
                && l2.kind.is_word()
                && l2.span.len() == 1;

            if is_letter_amp_letter_chunk {
                let (l1, l2) = (
                    l1.span.get_content(&self.source).first(),
                    l2.span.get_content(&self.source).first(),
                );

                let is_valid_pair = match (l1, l2) {
                    (Some(l1), Some(l2)) => {
                        matches!(
                            (l1.to_ascii_lowercase(), l2.to_ascii_lowercase()),
                            ('r', 'd') | ('q', 'a')
                        )
                    }
                    _ => false,
                };

                if is_valid_pair {
                    self.tokens[cursor - 2].span = Span::new(
                        self.tokens[cursor - 2].span.start,
                        self.tokens[cursor].span.end,
                    );
                    to_remove.push_back(cursor - 1);
                    to_remove.push_back(cursor);
                }
            }

            // Skip ahead since we've processed these tokens
            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        // Remove the marked tokens in reverse order to maintain correct indices
        self.tokens.remove_indices(to_remove);
    }

    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
        let period = SequenceExpr::default().then_period();
        Lrc::new(Repeating::new(Box::new(period), 2))
    }

    thread_local! {
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }

    fn condense_ellipsis(&mut self) {
        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
        self.condense_expr(&expr, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }

    fn uncached_contraction_expr() -> Lrc<SequenceExpr> {
        Lrc::new(
            SequenceExpr::default()
                .then_any_word()
                .then_apostrophe()
                .then_any_word(),
        )
    }

    thread_local! {
        static CONTRACTION_EXPR: Lrc<SequenceExpr> = Document::uncached_contraction_expr();
    }

    /// Searches for contractions and condenses them down into single
    /// tokens.
    fn condense_contractions(&mut self) {
        let expr = Self::CONTRACTION_EXPR.with(|v| v.clone());

        self.condense_expr(&expr, |_| {})
    }
}

/// Creates functions necessary to implement [`TokenStringExt`] on a document.
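///
/// For example, `create_fns_on_doc!(word)` generates `first_word`, `last_word`,
/// `last_word_index`, `iter_word_indices`, and `iter_words`, each of which simply
/// forwards to the corresponding method on `self.tokens`.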
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}

impl TokenStringExt for Document {
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);

    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span<char>> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }

    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
        self.tokens.iter_sentences_mut()
    }
}

impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(&token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::{Span, parsers::MarkdownOptions};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn condenses_filename_extensions() {
        let doc = Document::new_plain_english_curated(".c and .exe and .js");
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_filename_extension_ok_at_start_and_end() {
        let doc = Document::new_plain_english_curated(".c and .EXE");
        assert!(doc.tokens.len() == 5);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_mixed_case() {
        let doc = Document::new_plain_english_curated(".c and .Exe");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_non_letters() {
        let doc = Document::new_plain_english_curated(".COM and .C0M");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_longer_than_three() {
        let doc = Document::new_plain_english_curated(".dll and .dlls");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn condense_filename_extension_in_parens() {
        let doc = Document::new_plain_english_curated(
            "true for the manual installation when trying to run the executable(.exe) after a manual download",
        );
        assert!(doc.tokens.len() > 23);
        assert!(doc.tokens[21].kind.is_open_round());
        assert!(doc.tokens[22].kind.is_unlintable());
        assert!(doc.tokens[23].kind.is_close_round());
    }

    #[test]
    fn condense_tldr_uppercase() {
        let doc = Document::new_plain_english_curated("TL;DR");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[0].span.len() == 5);
    }

    #[test]
    fn condense_tldr_lowercase() {
        let doc = Document::new_plain_english_curated("tl;dr");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_1() {
        let doc = Document::new_plain_english_curated("tl;DR");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_2() {
        let doc = Document::new_plain_english_curated("TL;Dr");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_plural() {
        let doc = Document::new_plain_english_curated(
            "managing the flow between components to produce relevant TL;DRs of current news articles",
        );
        // no token is a punctuation token - only words with whitespace between
        assert!(
            doc.tokens
                .iter()
                .all(|t| t.kind.is_word() || t.kind.is_whitespace())
        );
        // one of the word tokens contains a ';' character
        let tldrs = doc
            .tokens
            .iter()
            .filter(|t| t.span.get_content(&doc.source).contains(&';'))
            .collect_vec();
        assert!(tldrs.len() == 1);
        assert!(tldrs[0].span.get_content_string(&doc.source) == "TL;DRs");
    }

    #[test]
    fn condense_r_and_d_caps() {
        let doc = Document::new_plain_english_curated("R&D");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_mixed_case() {
        let doc = Document::new_plain_english_curated("R&d");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_lowercase() {
        let doc = Document::new_plain_english_curated("r&d");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn dont_condense_r_and_d_with_spaces() {
        let doc = Document::new_plain_english_curated("R & D");
        assert!(doc.tokens.len() == 5);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[1].kind.is_whitespace());
        assert!(doc.tokens[2].kind.is_ampersand());
        assert!(doc.tokens[3].kind.is_whitespace());
        assert!(doc.tokens[4].kind.is_word());
    }

    #[test]
    fn condense_q_and_a() {
        let doc =
            Document::new_plain_english_curated("A Q&A platform software for teams at any scales.");
        assert!(doc.tokens.len() >= 3);
        assert!(doc.tokens[2].kind.is_word());
        assert!(doc.tokens[2].span.get_content_string(&doc.source) == "Q&A");
    }

    #[test]
    fn dont_allow_mixed_r_and_d_with_q_and_a() {
        let doc = Document::new_plain_english_curated("R&A or Q&D");
        assert!(doc.tokens.len() == 9);
        assert!(doc.tokens[1].kind.is_ampersand() || doc.tokens[7].kind.is_ampersand());
    }
}