harper_core/
document.rs

1use std::cmp::Ordering;
2use std::collections::VecDeque;
3use std::fmt::Display;
4
5use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
6use itertools::Itertools;
7use paste::paste;
8
9use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
10use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
11use crate::patterns::WordSet;
12use crate::punctuation::Punctuation;
13use crate::spell::{Dictionary, FstDictionary};
14use crate::vec_ext::VecExt;
15use crate::{CharStringExt, FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
16use crate::{OrdinalSuffix, Span};
17
/// A document containing some amount of lexed and parsed English text.
#[derive(Debug, Clone)]
pub struct Document {
    /// Shared handle to the decoded source text; token spans index into this.
    source: Lrc<Vec<char>>,
    /// Tokens produced by parsing `source`, kept in source order.
    tokens: Vec<Token>,
}
24
25impl Default for Document {
26    fn default() -> Self {
27        Self::new("", &PlainEnglish, &FstDictionary::curated())
28    }
29}
30
31impl Document {
32    /// Locate all the tokens that intersect a provided span.
33    ///
34    /// Desperately needs optimization.
35    pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
36        self.tokens()
37            .enumerate()
38            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
39            .collect()
40    }
41
42    /// Locate all the tokens that intersect a provided span and convert them to [`FatToken`]s.
43    ///
44    /// Desperately needs optimization.
45    pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
46        let indices = self.token_indices_intersecting(span);
47
48        indices
49            .into_iter()
50            .map(|i| self.tokens[i].to_fat(&self.source))
51            .collect()
52    }
53
54    /// Lexes and parses text to produce a document using a provided language
55    /// parser and dictionary.
56    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
57        let source: Vec<_> = text.chars().collect();
58
59        Self::new_from_vec(Lrc::new(source), parser, dictionary)
60    }
61
62    /// Lexes and parses text to produce a document using a provided language
63    /// parser and the included curated dictionary.
64    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
65        let source: Vec<_> = text.chars().collect();
66
67        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
68    }
69
70    /// Lexes and parses text to produce a document using a provided language
71    /// parser and dictionary.
72    pub fn new_from_vec(
73        source: Lrc<Vec<char>>,
74        parser: &impl Parser,
75        dictionary: &impl Dictionary,
76    ) -> Self {
77        let tokens = parser.parse(&source);
78
79        let mut document = Self { source, tokens };
80        document.parse(dictionary);
81
82        document
83    }
84
85    /// Parse text to produce a document using the built-in [`PlainEnglish`]
86    /// parser and curated dictionary.
87    pub fn new_plain_english_curated(text: &str) -> Self {
88        Self::new(text, &PlainEnglish, &FstDictionary::curated())
89    }
90
91    /// Create a new document simply by tokenizing the provided input and applying fix-ups. The
92    /// contained words will not contain any metadata.
93    ///
94    /// This avoids running potentially expensive metadata generation code, so this is more
95    /// efficient if you don't need that information.
96    pub(crate) fn new_basic_tokenize(text: &str, parser: &impl Parser) -> Self {
97        let source = Lrc::new(text.chars().collect_vec());
98        let tokens = parser.parse(&source);
99        let mut document = Self { source, tokens };
100        document.apply_fixups();
101        document
102    }
103
104    /// Parse text to produce a document using the built-in [`PlainEnglish`]
105    /// parser and a provided dictionary.
106    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
107        Self::new(text, &PlainEnglish, dictionary)
108    }
109
110    /// Parse text to produce a document using the built-in [`Markdown`] parser
111    /// and curated dictionary.
112    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
113        Self::new(
114            text,
115            &Markdown::new(markdown_options),
116            &FstDictionary::curated(),
117        )
118    }
119
120    /// Parse text to produce a document using the built-in [`Markdown`] parser
121    /// and curated dictionary with the default Markdown configuration.
122    pub fn new_markdown_default_curated(text: &str) -> Self {
123        Self::new_markdown_curated(text, MarkdownOptions::default())
124    }
125
126    /// Parse text to produce a document using the built-in [`PlainEnglish`]
127    /// parser and the curated dictionary.
128    pub fn new_markdown(
129        text: &str,
130        markdown_options: MarkdownOptions,
131        dictionary: &impl Dictionary,
132    ) -> Self {
133        Self::new(text, &Markdown::new(markdown_options), dictionary)
134    }
135
136    /// Parse text to produce a document using the built-in [`PlainEnglish`]
137    /// parser and the curated dictionary with the default Markdown configuration.
138    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
139        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
140    }
141
    /// Run every token-stream fix-up pass: whitespace normalization first,
    /// then the multi-token merging passes, and finally quote matching
    /// (which relies on the paragraph breaks created by `newlines_to_breaks`).
    fn apply_fixups(&mut self) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.condense_filename_extensions();
        self.condense_tldr();
        self.condense_ampersand_pairs();
        self.condense_slash_pairs();
        self.match_quotes();
    }
156
    /// Re-parse important language constructs.
    ///
    /// Should be run after every change to the underlying [`Self::source`].
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.apply_fixups();

        let chunker = burn_chunker();
        let tagger = brill_tagger();

        for sent in self.tokens.iter_sentences_mut() {
            // Tag/chunk only the non-whitespace tokens of the sentence.
            let token_strings: Vec<_> = sent
                .iter()
                .filter(|t| !t.kind.is_whitespace())
                .map(|t| t.span.get_content_string(&self.source))
                .collect();

            let token_tags = tagger.tag_sentence(&token_strings);
            let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);

            // `i` indexes into `token_tags`/`np_flags`; it must advance for every
            // non-whitespace token to stay aligned with the filter above.
            let mut i = 0;

            // Annotate DictWord metadata
            for token in sent.iter_mut() {
                if let TokenKind::Word(meta) = &mut token.kind {
                    let word_source = token.span.get_content(&self.source);
                    let mut found_meta = dictionary
                        .get_word_metadata(word_source)
                        .map(|c| c.into_owned());

                    if let Some(inner) = &mut found_meta {
                        // Prefer the tagger's POS tag; fall back to the dictionary's inference.
                        inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
                        inner.np_member = Some(np_flags[i]);
                    }

                    *meta = found_meta;
                    i += 1;
                } else if !token.kind.is_whitespace() {
                    // Non-word, non-whitespace tokens were tagged too — keep `i` in sync.
                    i += 1;
                }
            }
        }
    }
199
200    /// Convert all sets of newlines greater than 2 to paragraph breaks.
201    fn newlines_to_breaks(&mut self) {
202        for token in &mut self.tokens {
203            if let TokenKind::Newline(n) = token.kind
204                && n >= 2
205            {
206                token.kind = TokenKind::ParagraphBreak;
207            }
208        }
209    }
210
    /// Given a list of indices, this function removes the subsequent
    /// `stretch_len - 1` elements after each index.
    ///
    /// Will extend token spans to include removed elements.
    /// Assumes condensed tokens are contiguous in source text.
    /// `indices` must be sorted ascending and runs must not overlap.
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // Update spans: stretch each kept token over its whole run.
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Trim
        let old = self.tokens.clone();
        self.tokens.clear();

        // Keep first chunk.
        // When `indices` is empty, `unwrap_or(indices.len())` is 0, so nothing is
        // copied here and the final extend below restores the entire list.
        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        // For each kept index, push its (already stretched) token, then copy the
        // untouched gap between the end of its run and the next kept index.
        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        // Keep last chunk.
        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }
252
    /// Find the token whose span contains the given character index, if any.
    ///
    /// Relies on `tokens` being sorted by span start with non-overlapping
    /// spans; that is what makes the comparator below valid for binary search.
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                // Any token overlapping a 1-char probe span at `char_index` is a hit;
                // otherwise order by span start to steer the search.
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }
267
268    /// Defensively attempt to grab a specific token.
269    pub fn get_token(&self, index: usize) -> Option<&Token> {
270        self.tokens.get(index)
271    }
272
273    /// Get a token at a signed offset from a base index, or None if out of bounds.
274    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
275        match base.checked_add_signed(offset) {
276            None => None,
277            Some(idx) => self.get_token(idx),
278        }
279    }
280
281    /// Get an iterator over all the tokens contained in the document.
282    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
283        self.tokens.iter()
284    }
285
    /// Iterate over contiguous runs of tokens the chunker marked as members of
    /// a nominal phrase, with leading/trailing whitespace trimmed from each run.
    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
        // True when the token is a word whose metadata says `np_member == Some(true)`.
        fn is_np_member(t: &Token) -> bool {
            t.kind
                .as_word()
                .and_then(|x| x.as_ref())
                .and_then(|w| w.np_member)
                .unwrap_or(false)
        }

        // Strip whitespace tokens from both ends of a slice.
        fn trim(slice: &[Token]) -> &[Token] {
            let mut start = 0;
            let mut end = slice.len();
            while start < end && slice[start].kind.is_whitespace() {
                start += 1;
            }
            while end > start && slice[end - 1].kind.is_whitespace() {
                end -= 1;
            }
            &slice[start..end]
        }

        // Split on tokens that are neither NP members nor whitespace, then keep
        // only trimmed segments that still contain at least one NP member.
        self.tokens
            .as_slice()
            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
            .filter_map(|s| {
                let s = trim(s);
                if s.iter().any(is_np_member) {
                    Some(s)
                } else {
                    None
                }
            })
    }
319
    /// Get an iterator over all the tokens in the document, each converted to
    /// an owned [`FatToken`] via [`Token::to_fat`].
    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }
324
325    /// Get the next or previous word token relative to a base index, if separated by whitespace.
326    /// Returns None if the next/previous token is not a word or does not exist.
327    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
328        // Look for whitespace at the expected offset
329        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
330            return None;
331        }
332        // Now look beyond the whitespace for a word token
333        let word_token = self.get_token_offset(base, offset + offset.signum());
334        let word_token = word_token?;
335        word_token.kind.is_word().then_some(word_token)
336    }
337
    /// Get an iterator over all the tokens in the document, each converted to
    /// a [`FatStringToken`] (a [`FatToken`] with string content).
    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
        self.fat_tokens().map(|t| t.into())
    }
342
343    pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
344        span.get_content(&self.source)
345    }
346
347    pub fn get_span_content_str(&self, span: &Span<char>) -> String {
348        String::from_iter(self.get_span_content(span))
349    }
350
351    pub fn get_full_string(&self) -> String {
352        self.get_span_content_str(&Span::new(0, self.source.len()))
353    }
354
355    pub fn get_full_content(&self) -> &[char] {
356        &self.source
357    }
358
359    pub fn get_source(&self) -> &[char] {
360        &self.source
361    }
362
363    pub fn get_tokens(&self) -> &[Token] {
364        &self.tokens
365    }
366
    /// Searches for quotation marks and fills the
    /// [`Punctuation::Quote::twin_loc`] field. This is on a best-effort
    /// basis.
    ///
    /// Current algorithm is based on https://leancrew.com/all-this/2025/03/a-mac-smart-quote-curiosity
    fn match_quotes(&mut self) {
        // Quotes are only matched within a single paragraph; collect the
        // boundaries (start of doc, each paragraph break, end of doc).
        let mut pg_indices: Vec<_> = vec![0];
        pg_indices.extend(self.iter_paragraph_break_indices());
        pg_indices.push(self.tokens.len());

        // Avoid allocation in loop
        let mut quote_indices = Vec::new();
        let mut open_quote_indices = Vec::new();

        for (start, end) in pg_indices.into_iter().tuple_windows() {
            let pg = &mut self.tokens[start..end];

            quote_indices.clear();
            quote_indices.extend(pg.iter_quote_indices());
            open_quote_indices.clear();

            // Find open quotes first.
            for quote in &quote_indices {
                // A quote is "opening" when it starts the paragraph, no word-like
                // token precedes it, or it directly follows whitespace or an
                // opening bracket/apostrophe.
                let is_open = *quote == 0
                    || pg[0..*quote].iter_word_likes().next().is_none()
                    || pg[quote - 1].kind.is_whitespace()
                    || matches!(
                        pg[quote - 1].kind.as_punctuation(),
                        Some(Punctuation::LessThan)
                            | Some(Punctuation::OpenRound)
                            | Some(Punctuation::OpenSquare)
                            | Some(Punctuation::OpenCurly)
                            | Some(Punctuation::Apostrophe)
                    );

                if is_open {
                    open_quote_indices.push(*quote);
                }
            }

            // Pair open quotes innermost-first (pop = last found) with the first
            // quote that follows each.
            while let Some(open_idx) = open_quote_indices.pop() {
                let Some(close_idx) = pg[open_idx + 1..].iter_quote_indices().next() else {
                    continue;
                };

                // `close_idx` is relative to the sub-slice, hence `+ open_idx + 1`
                // below. Skip closing candidates that are already paired.
                if pg[close_idx + open_idx + 1]
                    .kind
                    .as_quote()
                    .unwrap()
                    .twin_loc
                    .is_some()
                {
                    continue;
                }

                // Twin locations are stored as document-absolute indices (`+ start`).
                pg[open_idx].kind.as_mut_quote().unwrap().twin_loc =
                    Some(close_idx + open_idx + start + 1);
                pg[close_idx + open_idx + 1]
                    .kind
                    .as_mut_quote()
                    .unwrap()
                    .twin_loc = Some(open_idx + start);
            }
        }
    }
432
    /// Searches for number suffixes and condenses them down into single tokens
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            // TODO: Allow spaces between `a` and `b`

            // A number token directly followed by a word that parses as an
            // ordinal suffix (e.g. "1" + "st").
            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind)
                && let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
            {
                // Record the suffix on the number token; the suffix word is
                // merged away by `condense_indices` below.
                self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                replace_starts.push(idx);
            }
        }

        self.condense_indices(&replace_starts, 2);
    }
458
    /// Searches for multiple sequential space tokens and condenses them down
    /// into one.
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        // Immutable snapshot so we can read ahead while mutating in place.
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more space tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    // Only condense adjacent spans
                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        // Fold the child's count and extent into the run's first token.
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        // NOTE(review): this increment plus the one at the top of the
                        // loop advances by 2 after a merge, so the token immediately
                        // after a merged pair is never examined — runs of 3+ space
                        // tokens look only partially condensed. Confirm intended.
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }
502
    thread_local! {
        // Cached per thread so the expression is only built once per thread.
        static LATIN_EXPR: Lrc<FirstMatchOf> = Document::uncached_latin_expr();
    }
506
    /// Build the expression matching Latin abbreviations: "etc."/"vs." and "et al.".
    fn uncached_latin_expr() -> Lrc<FirstMatchOf> {
        Lrc::new(FirstMatchOf::new(vec![
            // "etc." or "vs."
            Box::new(
                SequenceExpr::default()
                    .then(WordSet::new(&["etc", "vs"]))
                    .then_period(),
            ),
            // "et al."
            Box::new(
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }
522
    /// Assumes that the first matched token is the canonical one to be condensed into.
    /// Takes a callback that can be used to retroactively edit the canonical token afterwards.
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        // Collect matches up front so `self.tokens` can be mutated below.
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            // Everything after the match's first token gets removed...
            remove_indices.extend(m.start + 1..m.end);
            // ...and the first token's span is stretched over the whole match.
            // (Presumably `m.into_iter()` yields the match's index range, used
            // here to slice `tokens` — confirm against `Span`'s IntoIterator.)
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }
541
542    fn condense_latin(&mut self) {
543        self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
544    }
545
    /// Searches for multiple sequential newline tokens and condenses them down
    /// into one.
    ///
    /// Unlike `condense_spaces`, no span-adjacency check is performed here.
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        // Immutable snapshot so we can read ahead while mutating in place.
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more newline tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        // Fold the child's count and extent into the run's first token.
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        // NOTE(review): this increment plus the one at the top of the
                        // loop advances by 2 after a merge, skipping the next token —
                        // same pattern as `condense_spaces`. Confirm intended.
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }
583
    /// Condenses words like "i.e.", "e.g." and "N.S.A." down to single words
    /// using a state machine.
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        // `cursor` inspects the (letter, period) pair at [cursor - 1, cursor].
        let mut cursor = 1;

        // Index of the first letter token of the initialism currently being built.
        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            // A chunk is a single-letter word immediately followed by a period.
            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    // First chunk: this letter becomes the surviving token.
                    initialism_start = Some(cursor - 1);
                } else {
                    // Later chunks: the letter is merged away too.
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                // Extra advance so the next pair starts on a fresh letter.
                cursor += 1;
            } else {
                // Close out the current initialism: stretch the surviving token's
                // span over everything consumed so far.
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            // NOTE(review): if the stream ends while an initialism is still open,
            // this break skips the span-stretching branch above — confirm that
            // trailing initialisms are handled correctly.
            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
631
    /// Condenses likely filename extensions down to single tokens.
    fn condense_filename_extensions(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        // `cursor` inspects the (period, word) pair at [cursor - 1, cursor].
        let mut cursor = 1;

        // Index of the period that starts the extension currently being built.
        let mut ext_start = None;

        loop {
            // left context, dot, extension, right context
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let x = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            // A ".ext" chunk: period + word of at most 3 chars, isolated by
            // whitespace (or stream edges) or wrapped in parentheses, with the
            // extension uniformly lower- or upper-case ASCII.
            let is_ext_chunk = d.kind.is_period()
                && x.kind.is_word()
                && x.span.len() <= 3
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())))
                && {
                    let ext_chars = x.span.get_content(&self.source);
                    ext_chars.iter().all(|c| c.is_ascii_lowercase())
                        || ext_chars.iter().all(|c| c.is_ascii_uppercase())
                };

            if is_ext_chunk {
                if ext_start.is_none() {
                    ext_start = Some(cursor - 1);
                    // The merged token is not prose; exempt it from linting.
                    self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                // Extra advance so the next pair starts after this chunk.
                cursor += 1;
            } else {
                // Close out the pending extension by stretching the surviving
                // token's span over everything consumed.
                if let Some(start) = ext_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                ext_start = None;
            }

            cursor += 1;

            // NOTE(review): an extension ending exactly at the final token may
            // break out before the span-stretching branch runs — confirm intended.
            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
693
694    /// Condenses "tl;dr" down to a single word token.
695    fn condense_tldr(&mut self) {
696        if self.tokens.len() < 3 {
697            return;
698        }
699
700        let mut to_remove = VecDeque::new();
701        let mut cursor = 2;
702
703        loop {
704            let tl = &self.tokens[cursor - 2];
705            let simicolon = &self.tokens[cursor - 1];
706            let dr = &self.tokens[cursor];
707
708            let is_tldr_chunk = tl.kind.is_word()
709                && tl.span.len() == 2
710                && tl
711                    .span
712                    .get_content(&self.source)
713                    .eq_ignore_ascii_case_chars(&['t', 'l'])
714                && simicolon.kind.is_semicolon()
715                && dr.kind.is_word()
716                && dr.span.len() >= 2
717                && dr.span.len() <= 3
718                && dr
719                    .span
720                    .get_content(&self.source)
721                    .eq_any_ignore_ascii_case_chars(&[&['d', 'r'], &['d', 'r', 's']]);
722
723            if is_tldr_chunk {
724                // Update the first token to be the full "tl;dr" as a word
725                self.tokens[cursor - 2].span = Span::new(
726                    self.tokens[cursor - 2].span.start,
727                    self.tokens[cursor].span.end,
728                );
729
730                // Mark the semicolon and "dr" tokens for removal
731                to_remove.push_back(cursor - 1);
732                to_remove.push_back(cursor);
733            }
734
735            // Skip ahead since we've processed these tokens
736            cursor += 1;
737
738            if cursor >= self.tokens.len() {
739                break;
740            }
741        }
742
743        // Remove the marked tokens in reverse order to maintain correct indices
744        self.tokens.remove_indices(to_remove);
745    }
746
    /// Allows condensing of delimited pairs of tokens into a single token.
    ///
    /// # Arguments
    ///
    /// * `is_delimiter` - A function that returns `true` if the token is a delimiter.
    /// * `valid_pairs` - A slice of tuples representing the valid pairs of tokens to condense.
    ///
    fn condense_delimited_pairs<F>(&mut self, is_delimiter: F, valid_pairs: &[(char, char)])
    where
        F: Fn(&TokenKind) -> bool,
    {
        // Need three tokens for the (letter, delimiter, letter) window.
        if self.tokens.len() < 3 {
            return;
        }

        let mut to_remove = VecDeque::new();
        let mut cursor = 2;

        loop {
            let l1 = &self.tokens[cursor - 2];
            let delim = &self.tokens[cursor - 1];
            let l2 = &self.tokens[cursor];

            // Shape check: single-letter word, delimiter, single-letter word.
            let is_delimited_chunk = l1.kind.is_word()
                && l1.span.len() == 1
                && is_delimiter(&delim.kind)
                && l2.kind.is_word()
                && l2.span.len() == 1;

            if is_delimited_chunk {
                let (l1, l2) = (
                    l1.span.get_content(&self.source).first(),
                    l2.span.get_content(&self.source).first(),
                );

                // Case-insensitive membership check against the allow-list.
                let is_valid_pair = match (l1, l2) {
                    (Some(l1), Some(l2)) => {
                        let pair = (l1.to_ascii_lowercase(), l2.to_ascii_lowercase());
                        valid_pairs.contains(&pair)
                    }
                    _ => false,
                };

                if is_valid_pair {
                    // Stretch the first letter's span over the whole triple and
                    // drop the delimiter and second letter.
                    self.tokens[cursor - 2].span = Span::new(
                        self.tokens[cursor - 2].span.start,
                        self.tokens[cursor].span.end,
                    );
                    to_remove.push_back(cursor - 1);
                    to_remove.push_back(cursor);
                }
            }

            cursor += 1;
            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
808
    /// Condenses "ampersand pairs" such as "R&D" or "Q&A" into single tokens.
    fn condense_ampersand_pairs(&mut self) {
        self.condense_delimited_pairs(
            |kind| kind.is_ampersand(),
            &[
                ('b', 'b'), // bed & breakfast
                ('b', 'w'), // black & white
                ('g', 't'), // gin & tonic
                ('k', 'r'), // Kernighan & Ritchie
                ('q', 'a'), // question & answer
                ('r', 'b'), // rhythm & blues
                ('r', 'd'), // research & development
                ('r', 'r'), // rest & relaxation
                ('s', 'p'), // Standard & Poor's
            ],
        );
    }

    /// Condenses "slash pairs" such as "I/O" into single tokens.
    fn condense_slash_pairs(&mut self) {
        self.condense_delimited_pairs(
            |kind| kind.is_slash(),
            &[
                ('a', 'c'), // aircon; alternating current
                ('b', 'w'), // black and white
                ('c', 'o'), // care of
                ('d', 'c'), // direct current
                ('d', 'l'), // download
                ('i', 'o'), // input/output
                ('j', 'k'), // just kidding
                ('n', 'a'), // not applicable
                ('r', 'c'), // radio control
                ('s', 'n'), // serial number
                ('y', 'n'), // yes/no
                ('y', 'o'), // years old
            ],
        );
    }
847
    /// Build the expression matching two or more consecutive period tokens.
    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
        let period = SequenceExpr::default().then_period();
        Lrc::new(Repeating::new(Box::new(period), 2))
    }

    thread_local! {
        // Cached per thread so the expression is only built once per thread.
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }
856
857    fn condense_ellipsis(&mut self) {
858        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
859        self.condense_expr(&expr, |tok| {
860            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
861        });
862    }
863}
864
/// Creates functions necessary to implement [`TokenStringExt`] on a document.
///
/// Each generated method simply forwards to the same-named method on `self.tokens`.
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            // e.g. `first_word()` — first matching token, if any.
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            // e.g. `last_word()` — last matching token, if any.
            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            // e.g. `last_word_index()` — index of the last matching token.
            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            // e.g. `iter_word_indices()` — indices of all matching tokens.
            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            // e.g. `iter_words()` — all matching tokens.
            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}
891
// Implemented almost entirely by delegation: every method forwards to the
// equivalent method on the inner `tokens` vector, either via the
// `create_fns_on_doc!` macro (for the per-kind accessor families) or by a
// hand-written one-line forwarder (for the methods whose names don't fit
// the macro's pattern).
impl TokenStringExt for Document {
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);
    create_fns_on_doc!(heading_start);

    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    // Span covering the whole token stream, if non-empty.
    fn span(&self) -> Option<Span<char>> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_headings(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_headings()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }

    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
        self.tokens.iter_sentences_mut()
    }
}
958
959impl Display for Document {
960    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
961        for token in &self.tokens {
962            write!(f, "{}", self.get_span_content_str(&token.span))?;
963        }
964
965        Ok(())
966    }
967}
968
#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::TokenStringExt;
    use crate::{Span, parsers::MarkdownOptions};

    /// Asserts that `text` lexes to exactly `final_tok_count` tokens under
    /// both the plain-English and Markdown parsers.
    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn simple_contraction4() {
        assert_condensed_contractions("doesn't", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    /// Asserts that `source` lexes to exactly `count` tokens, dumping the
    /// token kinds on failure to ease debugging.
    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn condenses_filename_extensions() {
        let doc = Document::new_plain_english_curated(".c and .exe and .js");
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_filename_extension_ok_at_start_and_end() {
        let doc = Document::new_plain_english_curated(".c and .EXE");
        assert_eq!(doc.tokens.len(), 5);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_mixed_case() {
        let doc = Document::new_plain_english_curated(".c and .Exe");
        assert_eq!(doc.tokens.len(), 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_non_letters() {
        let doc = Document::new_plain_english_curated(".COM and .C0M");
        assert_eq!(doc.tokens.len(), 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_longer_than_three() {
        let doc = Document::new_plain_english_curated(".dll and .dlls");
        assert_eq!(doc.tokens.len(), 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn condense_filename_extension_in_parens() {
        let doc = Document::new_plain_english_curated(
            "true for the manual installation when trying to run the executable(.exe) after a manual download",
        );
        assert!(doc.tokens.len() > 23);
        assert!(doc.tokens[21].kind.is_open_round());
        assert!(doc.tokens[22].kind.is_unlintable());
        assert!(doc.tokens[23].kind.is_close_round());
    }

    #[test]
    fn condense_tldr_uppercase() {
        let doc = Document::new_plain_english_curated("TL;DR");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
        assert_eq!(doc.tokens[0].span.len(), 5);
    }

    #[test]
    fn condense_tldr_lowercase() {
        let doc = Document::new_plain_english_curated("tl;dr");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_1() {
        let doc = Document::new_plain_english_curated("tl;DR");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_2() {
        let doc = Document::new_plain_english_curated("TL;Dr");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_plural() {
        let doc = Document::new_plain_english_curated(
            "managing the flow between components to produce relevant TL;DRs of current news articles",
        );
        // no token is a punctuation token - only words with whitespace between
        assert!(
            doc.tokens
                .iter()
                .all(|t| t.kind.is_word() || t.kind.is_whitespace())
        );
        // one of the word tokens contains a ';' character
        let tldrs = doc
            .tokens
            .iter()
            .filter(|t| t.span.get_content(&doc.source).contains(&';'))
            .collect_vec();
        assert_eq!(tldrs.len(), 1);
        assert_eq!(tldrs[0].span.get_content_string(&doc.source), "TL;DRs");
    }

    #[test]
    fn condense_r_and_d_caps() {
        let doc = Document::new_plain_english_curated("R&D");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_mixed_case() {
        let doc = Document::new_plain_english_curated("R&d");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_lowercase() {
        let doc = Document::new_plain_english_curated("r&d");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn dont_condense_r_and_d_with_spaces() {
        let doc = Document::new_plain_english_curated("R & D");
        assert_eq!(doc.tokens.len(), 5);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[1].kind.is_whitespace());
        assert!(doc.tokens[2].kind.is_ampersand());
        assert!(doc.tokens[3].kind.is_whitespace());
        assert!(doc.tokens[4].kind.is_word());
    }

    #[test]
    fn condense_q_and_a() {
        let doc =
            Document::new_plain_english_curated("A Q&A platform software for teams at any scales.");
        assert!(doc.tokens.len() >= 3);
        assert!(doc.tokens[2].kind.is_word());
        assert_eq!(doc.tokens[2].span.get_content_string(&doc.source), "Q&A");
    }

    #[test]
    fn dont_allow_mixed_r_and_d_with_q_and_a() {
        let doc = Document::new_plain_english_curated("R&A or Q&D");
        assert_eq!(doc.tokens.len(), 9);
        assert!(doc.tokens[1].kind.is_ampersand() || doc.tokens[7].kind.is_ampersand());
    }

    #[test]
    fn condense_io() {
        let doc = Document::new_plain_english_curated("I/O");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn finds_unmatched_quotes_in_document() {
        let raw = r#"
This is a paragraph with a single word "quoted."

This is a second paragraph with no quotes.

This is a third paragraph with a single erroneous "quote.

This is a final paragraph with a weird "quote and a not-weird "quote".
            "#;

        let doc = Document::new_markdown_default_curated(raw);

        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        assert_eq!(
            quote_twins,
            vec![Some(19), Some(16), None, None, Some(89), Some(87)]
        )
    }

    #[test]
    fn issue_1901() {
        let raw = r#"
"A quoted line"
"A quote without a closing mark
"Another quoted lined"
"The last quoted line"
            "#;

        let doc = Document::new_markdown_default_curated(raw);

        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        assert_eq!(
            quote_twins,
            vec![
                Some(6),
                Some(0),
                None,
                Some(27),
                Some(21),
                Some(37),
                Some(29)
            ]
        )
    }
}