harper_core/
document.rs

1use std::cmp::Ordering;
2use std::collections::VecDeque;
3use std::fmt::Display;
4
5use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
6use itertools::Itertools;
7use paste::paste;
8
9use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
10use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
11use crate::patterns::WordSet;
12use crate::punctuation::Punctuation;
13use crate::spell::{Dictionary, FstDictionary};
14use crate::vec_ext::VecExt;
15use crate::{CharStringExt, FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
16use crate::{OrdinalSuffix, Span};
17
/// A document containing some amount of lexed and parsed English text.
#[derive(Debug, Clone)]
pub struct Document {
    /// The raw character buffer that all token spans index into.
    source: Lrc<Vec<char>>,
    /// The parsed tokens, in the order they appear in `source`.
    tokens: Vec<Token>,
}
24
25impl Default for Document {
26    fn default() -> Self {
27        Self::new("", &PlainEnglish, &FstDictionary::curated())
28    }
29}
30
31impl Document {
32    /// Locate all the tokens that intersect a provided span.
33    ///
34    /// Desperately needs optimization.
35    pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
36        self.tokens()
37            .enumerate()
38            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
39            .collect()
40    }
41
42    /// Locate all the tokens that intersect a provided span and convert them to [`FatToken`]s.
43    ///
44    /// Desperately needs optimization.
45    pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
46        let indices = self.token_indices_intersecting(span);
47
48        indices
49            .into_iter()
50            .map(|i| self.tokens[i].to_fat(&self.source))
51            .collect()
52    }
53
54    /// Lexes and parses text to produce a document using a provided language
55    /// parser and dictionary.
56    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
57        let source: Vec<_> = text.chars().collect();
58
59        Self::new_from_vec(Lrc::new(source), parser, dictionary)
60    }
61
62    /// Lexes and parses text to produce a document using a provided language
63    /// parser and the included curated dictionary.
64    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
65        let source: Vec<_> = text.chars().collect();
66
67        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
68    }
69
70    /// Lexes and parses text to produce a document using a provided language
71    /// parser and dictionary.
72    pub fn new_from_vec(
73        source: Lrc<Vec<char>>,
74        parser: &impl Parser,
75        dictionary: &impl Dictionary,
76    ) -> Self {
77        let tokens = parser.parse(&source);
78
79        let mut document = Self { source, tokens };
80        document.parse(dictionary);
81
82        document
83    }
84
85    /// Parse text to produce a document using the built-in [`PlainEnglish`]
86    /// parser and curated dictionary.
87    pub fn new_plain_english_curated(text: &str) -> Self {
88        Self::new(text, &PlainEnglish, &FstDictionary::curated())
89    }
90
91    /// Parse text to produce a document using the built-in [`PlainEnglish`]
92    /// parser and a provided dictionary.
93    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
94        Self::new(text, &PlainEnglish, dictionary)
95    }
96
97    /// Parse text to produce a document using the built-in [`Markdown`] parser
98    /// and curated dictionary.
99    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
100        Self::new(
101            text,
102            &Markdown::new(markdown_options),
103            &FstDictionary::curated(),
104        )
105    }
106
107    /// Parse text to produce a document using the built-in [`Markdown`] parser
108    /// and curated dictionary with the default Markdown configuration.
109    pub fn new_markdown_default_curated(text: &str) -> Self {
110        Self::new_markdown_curated(text, MarkdownOptions::default())
111    }
112
    /// Parse text to produce a document using the built-in [`Markdown`]
    /// parser and a provided dictionary.
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }
122
    /// Parse text to produce a document using the built-in [`Markdown`]
    /// parser and a provided dictionary with the default Markdown configuration.
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }
128
    /// Re-parse important language constructs.
    ///
    /// Should be run after every change to the underlying [`Self::source`].
    fn parse(&mut self, dictionary: &impl Dictionary) {
        // Normalization passes. Each one may merge several raw tokens into a
        // single token; the order here is load-bearing (e.g. newlines must be
        // condensed before being promoted to paragraph breaks).
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.condense_filename_extensions();
        self.condense_tldr();
        self.condense_ampersand_pairs();
        self.condense_slash_pairs();
        self.match_quotes();

        let chunker = burn_chunker();
        let tagger = brill_tagger();

        for sent in self.tokens.iter_sentences_mut() {
            // The tagger/chunker only see non-whitespace tokens.
            let token_strings: Vec<_> = sent
                .iter()
                .filter(|t| !t.kind.is_whitespace())
                .map(|t| t.span.get_content_string(&self.source))
                .collect();

            let token_tags = tagger.tag_sentence(&token_strings);
            let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);

            // Index into `token_tags`/`np_flags`; incremented for every
            // non-whitespace token so it stays aligned with `token_strings`.
            let mut i = 0;

            // Annotate DictWord metadata
            for token in sent.iter_mut() {
                if let TokenKind::Word(meta) = &mut token.kind {
                    let word_source = token.span.get_content(&self.source);
                    let mut found_meta = dictionary
                        .get_lexeme_metadata(word_source)
                        .map(|c| c.into_owned());

                    if let Some(inner) = &mut found_meta {
                        // Prefer the tagger's POS tag; fall back to the
                        // dictionary's own inference when the tagger abstains.
                        inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
                        inner.np_member = Some(np_flags[i]);
                    }

                    *meta = found_meta;
                    i += 1;
                } else if !token.kind.is_whitespace() {
                    i += 1;
                }
            }
        }
    }
183
184    /// Convert all sets of newlines greater than 2 to paragraph breaks.
185    fn newlines_to_breaks(&mut self) {
186        for token in &mut self.tokens {
187            if let TokenKind::Newline(n) = token.kind
188                && n >= 2
189            {
190                token.kind = TokenKind::ParagraphBreak;
191            }
192        }
193    }
194
    /// Given a list of indices, this function removes the subsequent
    /// `stretch_len - 1` elements after each index.
    ///
    /// Will extend token spans to include removed elements.
    /// Assumes condensed tokens are contiguous in source text.
    ///
    /// NOTE(review): `indices` appears to be assumed sorted ascending with
    /// non-overlapping stretches — confirm with callers.
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // Update spans: grow each start token to cover its whole stretch.
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Trim
        let old = self.tokens.clone();
        self.tokens.clear();

        // Keep first chunk.
        // When `indices` is empty, `unwrap_or(indices.len())` yields 0, so
        // nothing is copied here and the final extend copies everything.
        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        // For each stretch: keep its (already widened) start token, then copy
        // the untouched tokens between this stretch and the next one.
        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        // Keep last chunk.
        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }
236
    /// Find a token whose span covers `char_index`, if any.
    ///
    /// Relies on `self.tokens` being ordered by span position: the comparator
    /// reports `Equal` for any token overlapping a one-char probe span at
    /// `char_index`, so the binary search lands directly on a covering token.
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }
251
252    /// Defensively attempt to grab a specific token.
253    pub fn get_token(&self, index: usize) -> Option<&Token> {
254        self.tokens.get(index)
255    }
256
257    /// Get a token at a signed offset from a base index, or None if out of bounds.
258    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
259        match base.checked_add_signed(offset) {
260            None => None,
261            Some(idx) => self.get_token(idx),
262        }
263    }
264
265    /// Get an iterator over all the tokens contained in the document.
266    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
267        self.tokens.iter()
268    }
269
270    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
271        fn is_np_member(t: &Token) -> bool {
272            t.kind
273                .as_word()
274                .and_then(|x| x.as_ref())
275                .and_then(|w| w.np_member)
276                .unwrap_or(false)
277        }
278
279        fn trim(slice: &[Token]) -> &[Token] {
280            let mut start = 0;
281            let mut end = slice.len();
282            while start < end && slice[start].kind.is_whitespace() {
283                start += 1;
284            }
285            while end > start && slice[end - 1].kind.is_whitespace() {
286                end -= 1;
287            }
288            &slice[start..end]
289        }
290
291        self.tokens
292            .as_slice()
293            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
294            .filter_map(|s| {
295                let s = trim(s);
296                if s.iter().any(is_np_member) {
297                    Some(s)
298                } else {
299                    None
300                }
301            })
302    }
303
304    /// Get an iterator over all the tokens contained in the document.
305    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
306        self.tokens().map(|token| token.to_fat(&self.source))
307    }
308
309    /// Get the next or previous word token relative to a base index, if separated by whitespace.
310    /// Returns None if the next/previous token is not a word or does not exist.
311    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
312        // Look for whitespace at the expected offset
313        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
314            return None;
315        }
316        // Now look beyond the whitespace for a word token
317        let word_token = self.get_token_offset(base, offset + offset.signum());
318        let word_token = word_token?;
319        word_token.kind.is_word().then_some(word_token)
320    }
321
322    /// Get an iterator over all the tokens contained in the document.
323    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
324        self.fat_tokens().map(|t| t.into())
325    }
326
    /// Borrow the characters covered by `span`.
    pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
        span.get_content(&self.source)
    }

    /// Copy the characters covered by `span` into a `String`.
    pub fn get_span_content_str(&self, span: &Span<char>) -> String {
        String::from_iter(self.get_span_content(span))
    }

    /// Copy the entire source text into a `String`.
    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(&Span::new(0, self.source.len()))
    }

    /// Borrow the entire source text.
    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    /// Borrow the entire source text.
    ///
    /// NOTE(review): identical to [`Self::get_full_content`]; both are public,
    /// so neither can be removed without a deprecation pass.
    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    /// Borrow the document's token list.
    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }
350
    /// Searches for quotation marks and fills the
    /// [`Punctuation::Quote::twin_loc`] field. This is on a best-effort
    /// basis.
    ///
    /// Current algorithm is based on https://leancrew.com/all-this/2025/03/a-mac-smart-quote-curiosity
    fn match_quotes(&mut self) {
        // Paragraph boundaries; quotes are only matched within one paragraph.
        let mut pg_indices: Vec<_> = vec![0];
        pg_indices.extend(self.iter_paragraph_break_indices());
        pg_indices.push(self.tokens.len());

        // Avoid allocation in loop
        let mut quote_indices = Vec::new();
        let mut open_quote_indices = Vec::new();

        for (start, end) in pg_indices.into_iter().tuple_windows() {
            let pg = &mut self.tokens[start..end];

            quote_indices.clear();
            quote_indices.extend(pg.iter_quote_indices());
            open_quote_indices.clear();

            // Find open quotes first.
            for quote in &quote_indices {
                // A quote is "open" when it starts the paragraph, nothing
                // word-like precedes it, or it directly follows whitespace or
                // an opening bracket-like character.
                let is_open = *quote == 0
                    || pg[0..*quote].iter_word_likes().next().is_none()
                    || pg[quote - 1].kind.is_whitespace()
                    || matches!(
                        pg[quote - 1].kind.as_punctuation(),
                        Some(Punctuation::LessThan)
                            | Some(Punctuation::OpenRound)
                            | Some(Punctuation::OpenSquare)
                            | Some(Punctuation::OpenCurly)
                            | Some(Punctuation::Apostrophe)
                    );

                if is_open {
                    open_quote_indices.push(*quote);
                }
            }

            // Pair each open quote (innermost first, via `pop`) with the next
            // unclaimed quote after it.
            while let Some(open_idx) = open_quote_indices.pop() {
                let Some(close_idx) = pg[open_idx + 1..].iter_quote_indices().next() else {
                    continue;
                };

                // `close_idx` is relative to the sub-slice starting at
                // `open_idx + 1`; skip quotes that are already paired.
                if pg[close_idx + open_idx + 1]
                    .kind
                    .as_quote()
                    .unwrap()
                    .twin_loc
                    .is_some()
                {
                    continue;
                }

                // Record twin locations as document-wide token indices
                // (hence the `+ start` offset back out of the paragraph slice).
                pg[open_idx].kind.as_mut_quote().unwrap().twin_loc =
                    Some(close_idx + open_idx + start + 1);
                pg[close_idx + open_idx + 1]
                    .kind
                    .as_mut_quote()
                    .unwrap()
                    .twin_loc = Some(open_idx + start);
            }
        }
    }
416
417    /// Searches for number suffixes and condenses them down into single tokens
418    fn condense_number_suffixes(&mut self) {
419        if self.tokens.len() < 2 {
420            return;
421        }
422
423        let mut replace_starts = Vec::new();
424
425        for idx in 0..self.tokens.len() - 1 {
426            let b = &self.tokens[idx + 1];
427            let a = &self.tokens[idx];
428
429            // TODO: Allow spaces between `a` and `b`
430
431            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind)
432                && let Some(found_suffix) =
433                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
434            {
435                self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
436                replace_starts.push(idx);
437            }
438        }
439
440        self.condense_indices(&replace_starts, 2);
441    }
442
    /// Searches for multiple sequential space tokens and condenses them down
    /// into one.
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        // Snapshot of the token list, read while `self.tokens` is mutated.
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more space tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    // Only condense adjacent spans
                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        // Fold the child's count and span into the stretch head.
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        // NOTE(review): `cursor` is advanced here *and* at the
                        // top of the loop, so the token immediately after a
                        // merged one is never examined — confirm intended.
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }
486
    thread_local! {
        // Per-thread cache of the Latin-abbreviation expression
        // ("etc."/"vs."/"et al."); built once per thread.
        static LATIN_EXPR: Lrc<FirstMatchOf> = Document::uncached_latin_expr();
    }
490
491    fn uncached_latin_expr() -> Lrc<FirstMatchOf> {
492        Lrc::new(FirstMatchOf::new(vec![
493            Box::new(
494                SequenceExpr::default()
495                    .then(WordSet::new(&["etc", "vs"]))
496                    .then_period(),
497            ),
498            Box::new(
499                SequenceExpr::aco("et")
500                    .then_whitespace()
501                    .t_aco("al")
502                    .then_period(),
503            ),
504        ]))
505    }
506
    /// Assumes that the first matched token is the canonical one to be condensed into.
    /// Takes a callback that can be used to retroactively edit the canonical token afterwards.
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        // Collect all matches up front so the token list can be edited freely.
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            // Drop every matched token except the first…
            remove_indices.extend(m.start + 1..m.end);
            // …and stretch the first token's span over the whole match.
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }
525
526    fn condense_latin(&mut self) {
527        self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
528    }
529
    /// Searches for multiple sequential newline tokens and condenses them down
    /// into one.
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        // Snapshot of the token list, read while `self.tokens` is mutated.
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more newline tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        // Fold the child's count and span into the stretch head.
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        // NOTE(review): `cursor` is advanced here *and* at the
                        // top of the loop, so the token immediately after a
                        // merged one is never examined — confirm intended.
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }
567
    /// Condenses words like "i.e.", "e.g." and "N.S.A." down to single words
    /// using a state machine.
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        // Index of the first token of the initialism run currently being scanned.
        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            // A chunk is a single-character word immediately followed by a period.
            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    // Later chunks are folded into the run's start token.
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                // End of a run: stretch the start token's span over the
                // chunks that are about to be removed.
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            // NOTE(review): a run that extends to the very end of the token
            // stream exits the loop before the span-extension branch runs —
            // confirm trailing initialisms cannot reach here un-finalized.
            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
615
    /// Condenses likely filename extensions down to single tokens.
    fn condense_filename_extensions(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        // Index of the dot token beginning the current extension run.
        let mut ext_start = None;

        loop {
            // left context, dot, extension, right context
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let x = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            // A ".ext" chunk: a period followed by a word of up to three
            // characters, either bounded by whitespace/document edges or
            // wrapped in round brackets, and written in a single letter case.
            let is_ext_chunk = d.kind.is_period()
                && x.kind.is_word()
                && x.span.len() <= 3
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())))
                && {
                    let ext_chars = x.span.get_content(&self.source);
                    ext_chars.iter().all(|c| c.is_ascii_lowercase())
                        || ext_chars.iter().all(|c| c.is_ascii_uppercase())
                };

            if is_ext_chunk {
                if ext_start.is_none() {
                    ext_start = Some(cursor - 1);
                    // Extensions should not be linted as ordinary words.
                    self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                // End of a run: stretch the start token's span over the
                // chunks that are about to be removed.
                if let Some(start) = ext_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                ext_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
677
678    /// Condenses "tl;dr" down to a single word token.
679    fn condense_tldr(&mut self) {
680        if self.tokens.len() < 3 {
681            return;
682        }
683
684        let mut to_remove = VecDeque::new();
685        let mut cursor = 2;
686
687        loop {
688            let tl = &self.tokens[cursor - 2];
689            let simicolon = &self.tokens[cursor - 1];
690            let dr = &self.tokens[cursor];
691
692            let is_tldr_chunk = tl.kind.is_word()
693                && tl.span.len() == 2
694                && tl
695                    .span
696                    .get_content(&self.source)
697                    .eq_ignore_ascii_case_chars(&['t', 'l'])
698                && simicolon.kind.is_semicolon()
699                && dr.kind.is_word()
700                && dr.span.len() >= 2
701                && dr.span.len() <= 3
702                && dr
703                    .span
704                    .get_content(&self.source)
705                    .eq_any_ignore_ascii_case_chars(&[&['d', 'r'], &['d', 'r', 's']]);
706
707            if is_tldr_chunk {
708                // Update the first token to be the full "tl;dr" as a word
709                self.tokens[cursor - 2].span = Span::new(
710                    self.tokens[cursor - 2].span.start,
711                    self.tokens[cursor].span.end,
712                );
713
714                // Mark the semicolon and "dr" tokens for removal
715                to_remove.push_back(cursor - 1);
716                to_remove.push_back(cursor);
717            }
718
719            // Skip ahead since we've processed these tokens
720            cursor += 1;
721
722            if cursor >= self.tokens.len() {
723                break;
724            }
725        }
726
727        // Remove the marked tokens in reverse order to maintain correct indices
728        self.tokens.remove_indices(to_remove);
729    }
730
    /// Allows condensing of delimited pairs of tokens into a single token.
    ///
    /// # Arguments
    ///
    /// * `is_delimiter` - A function that returns `true` if the token is a delimiter.
    /// * `valid_pairs` - A slice of tuples representing the valid pairs of tokens to condense.
    ///
    fn condense_delimited_pairs<F>(&mut self, is_delimiter: F, valid_pairs: &[(char, char)])
    where
        F: Fn(&TokenKind) -> bool,
    {
        if self.tokens.len() < 3 {
            return;
        }

        let mut to_remove = VecDeque::new();
        let mut cursor = 2;

        loop {
            // Three-token window: letter, delimiter, letter.
            let l1 = &self.tokens[cursor - 2];
            let delim = &self.tokens[cursor - 1];
            let l2 = &self.tokens[cursor];

            let is_delimited_chunk = l1.kind.is_word()
                && l1.span.len() == 1
                && is_delimiter(&delim.kind)
                && l2.kind.is_word()
                && l2.span.len() == 1;

            if is_delimited_chunk {
                let (l1, l2) = (
                    l1.span.get_content(&self.source).first(),
                    l2.span.get_content(&self.source).first(),
                );

                // Only condense letter pairs on the caller's allow-list,
                // compared case-insensitively.
                let is_valid_pair = match (l1, l2) {
                    (Some(l1), Some(l2)) => {
                        let pair = (l1.to_ascii_lowercase(), l2.to_ascii_lowercase());
                        valid_pairs.contains(&pair)
                    }
                    _ => false,
                };

                if is_valid_pair {
                    // Widen the first letter's span over the whole pair…
                    self.tokens[cursor - 2].span = Span::new(
                        self.tokens[cursor - 2].span.start,
                        self.tokens[cursor].span.end,
                    );
                    // …and drop the delimiter and second letter.
                    to_remove.push_back(cursor - 1);
                    to_remove.push_back(cursor);
                }
            }

            cursor += 1;
            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
792
793    // Condenses "ampersand pairs" such as "R&D" or "Q&A" into single tokens.
794    fn condense_ampersand_pairs(&mut self) {
795        self.condense_delimited_pairs(
796            |kind| kind.is_ampersand(),
797            &[
798                ('b', 'b'), // bed & breakfast
799                ('b', 'w'), // black & white
800                ('g', 't'), // gin & tonic
801                ('k', 'r'), // Kernighan & Ritchie
802                ('q', 'a'), // question & answer
803                ('r', 'b'), // rhythm & blues
804                ('r', 'd'), // research & development
805                ('r', 'r'), // rest & relaxation
806                ('s', 'p'), // Standard & Poor's
807            ],
808        );
809    }
810
811    // Condenses "slash pairs" such as "I/O" into single tokens.
812    fn condense_slash_pairs(&mut self) {
813        self.condense_delimited_pairs(
814            |kind| kind.is_slash(),
815            &[
816                ('a', 'c'), // aircon; alternating current
817                ('b', 'w'), // black and white
818                ('c', 'o'), // care of
819                ('d', 'c'), // direct current
820                ('d', 'l'), // download
821                ('i', 'o'), // input/output
822                ('j', 'k'), // just kidding
823                ('n', 'a'), // not applicable
824                ('r', 'c'), // radio control
825                ('s', 'n'), // serial number
826                ('y', 'n'), // yes/no
827                ('y', 'o'), // years old
828            ],
829        );
830    }
831
832    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
833        let period = SequenceExpr::default().then_period();
834        Lrc::new(Repeating::new(Box::new(period), 2))
835    }
836
    thread_local! {
        // Per-thread cache of the ellipsis expression; built once per thread.
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }
840
841    fn condense_ellipsis(&mut self) {
842        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
843        self.condense_expr(&expr, |tok| {
844            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
845        });
846    }
847
848    fn uncached_contraction_expr() -> Lrc<SequenceExpr> {
849        Lrc::new(
850            SequenceExpr::default()
851                .then_any_word()
852                .then_apostrophe()
853                .then_any_word(),
854        )
855    }
856
    thread_local! {
        // Per-thread cache of the contraction expression; built once per thread.
        static CONTRACTION_EXPR: Lrc<SequenceExpr> = Document::uncached_contraction_expr();
    }
860
861    /// Searches for contractions and condenses them down into single
862    /// tokens.
863    fn condense_contractions(&mut self) {
864        let expr = Self::CONTRACTION_EXPR.with(|v| v.clone());
865
866        self.condense_expr(&expr, |_| {})
867    }
868}
869
/// Creates functions necessary to implement [`TokenStringExt`] on a document.
///
/// For a given `$thing`, generates the `first_…`, `last_…`, `last_…_index`,
/// `iter_…_indices`, and plural `iter_…s` accessors, each delegating to the
/// identically-named method on `self.tokens`.
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}
896
impl TokenStringExt for Document {
    // Each invocation generates the `first_*`, `last_*`, `last_*_index`,
    // `iter_*_indices`, and `iter_*s` delegates for one token category.
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);

    // The remaining methods have no uniform naming scheme, so they are
    // delegated by hand rather than through the macro.
    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span<char>> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }

    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
        self.tokens.iter_sentences_mut()
    }
}
958
959impl Display for Document {
960    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
961        for token in &self.tokens {
962            write!(f, "{}", self.get_span_content_str(&token.span))?;
963        }
964
965        Ok(())
966    }
967}
968
#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::TokenStringExt;
    use crate::{Span, parsers::MarkdownOptions};

    /// Asserts that `text` lexes to exactly `final_tok_count` tokens under
    /// both the plain-English and Markdown parsers, since contraction
    /// condensing should behave identically in both.
    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    /// Asserts the plain-English token count of `source`, dumping the token
    /// kinds to aid debugging when the count is wrong.
    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn condenses_filename_extensions() {
        let doc = Document::new_plain_english_curated(".c and .exe and .js");
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_filename_extension_ok_at_start_and_end() {
        let doc = Document::new_plain_english_curated(".c and .EXE");
        assert!(doc.tokens.len() == 5);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_mixed_case() {
        let doc = Document::new_plain_english_curated(".c and .Exe");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_non_letters() {
        let doc = Document::new_plain_english_curated(".COM and .C0M");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_longer_than_three() {
        let doc = Document::new_plain_english_curated(".dll and .dlls");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn condense_filename_extension_in_parens() {
        let doc = Document::new_plain_english_curated(
            "true for the manual installation when trying to run the executable(.exe) after a manual download",
        );
        assert!(doc.tokens.len() > 23);
        assert!(doc.tokens[21].kind.is_open_round());
        assert!(doc.tokens[22].kind.is_unlintable());
        assert!(doc.tokens[23].kind.is_close_round());
    }

    #[test]
    fn condense_tldr_uppercase() {
        let doc = Document::new_plain_english_curated("TL;DR");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[0].span.len() == 5);
    }

    #[test]
    fn condense_tldr_lowercase() {
        let doc = Document::new_plain_english_curated("tl;dr");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_1() {
        let doc = Document::new_plain_english_curated("tl;DR");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_2() {
        let doc = Document::new_plain_english_curated("TL;Dr");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    // Renamed from `condense_tldr_pural` to fix the typo.
    #[test]
    fn condense_tldr_plural() {
        let doc = Document::new_plain_english_curated(
            "managing the flow between components to produce relevant TL;DRs of current news articles",
        );
        // no token is a punctuation token - only words with whitespace between
        assert!(
            doc.tokens
                .iter()
                .all(|t| t.kind.is_word() || t.kind.is_whitespace())
        );
        // one of the word tokens contains a ';' character
        let tldrs = doc
            .tokens
            .iter()
            .filter(|t| t.span.get_content(&doc.source).contains(&';'))
            .collect_vec();
        assert!(tldrs.len() == 1);
        assert!(tldrs[0].span.get_content_string(&doc.source) == "TL;DRs");
    }

    #[test]
    fn condense_r_and_d_caps() {
        let doc = Document::new_plain_english_curated("R&D");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_mixed_case() {
        let doc = Document::new_plain_english_curated("R&d");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_lowercase() {
        let doc = Document::new_plain_english_curated("r&d");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn dont_condense_r_and_d_with_spaces() {
        let doc = Document::new_plain_english_curated("R & D");
        assert!(doc.tokens.len() == 5);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[1].kind.is_whitespace());
        assert!(doc.tokens[2].kind.is_ampersand());
        assert!(doc.tokens[3].kind.is_whitespace());
        assert!(doc.tokens[4].kind.is_word());
    }

    #[test]
    fn condense_q_and_a() {
        let doc =
            Document::new_plain_english_curated("A Q&A platform software for teams at any scales.");
        assert!(doc.tokens.len() >= 3);
        assert!(doc.tokens[2].kind.is_word());
        assert!(doc.tokens[2].span.get_content_string(&doc.source) == "Q&A");
    }

    #[test]
    fn dont_allow_mixed_r_and_d_with_q_and_a() {
        let doc = Document::new_plain_english_curated("R&A or Q&D");
        assert!(doc.tokens.len() == 9);
        assert!(doc.tokens[1].kind.is_ampersand() || doc.tokens[7].kind.is_ampersand());
    }

    #[test]
    fn condense_io() {
        let doc = Document::new_plain_english_curated("I/O");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn finds_unmatched_quotes_in_document() {
        let raw = r#"
This is a paragraph with a single word "quoted."

This is a second paragraph with no quotes.

This is a third paragraph with a single erroneous "quote.

This is a final paragraph with a weird "quote and a not-weird "quote".
            "#;

        let doc = Document::new_markdown_default_curated(raw);

        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        assert_eq!(
            quote_twins,
            vec![Some(19), Some(16), None, None, Some(89), Some(87)]
        )
    }

    #[test]
    fn issue_1901() {
        let raw = r#"
"A quoted line"
"A quote without a closing mark
"Another quoted lined"
"The last quoted line"
            "#;

        let doc = Document::new_markdown_default_curated(raw);

        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        assert_eq!(
            quote_twins,
            vec![
                Some(6),
                Some(0),
                None,
                Some(27),
                Some(21),
                Some(37),
                Some(29)
            ]
        )
    }
}