Skip to main content

harper_core/
document.rs

1use std::cmp::Ordering;
2use std::collections::VecDeque;
3use std::fmt::Display;
4
5use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
6use itertools::Itertools;
7
8use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
9use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
10use crate::punctuation::Punctuation;
11use crate::spell::{Dictionary, FstDictionary};
12use crate::vec_ext::VecExt;
13use crate::{CharStringExt, FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
14use crate::{OrdinalSuffix, Span};
15
/// A document containing some amount of lexed and parsed English text.
///
/// Token spans are resolved against `source` (see [`Self::get_span_content`]).
#[derive(Debug, Clone)]
pub struct Document {
    /// The characters of the original text.
    source: Lrc<[char]>,
    /// The tokens produced by the parser, after the fix-up passes have been applied.
    tokens: Vec<Token>,
}
22
23impl Default for Document {
24    fn default() -> Self {
25        Self::new("", &PlainEnglish, &FstDictionary::curated())
26    }
27}
28
29impl Document {
30    /// Locate all the tokens that intersect a provided span.
31    ///
32    /// Desperately needs optimization.
33    pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
34        self.tokens()
35            .enumerate()
36            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
37            .collect()
38    }
39
40    /// Locate all the tokens that intersect a provided span and convert them to [`FatToken`]s.
41    ///
42    /// Desperately needs optimization.
43    pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
44        let indices = self.token_indices_intersecting(span);
45
46        indices
47            .into_iter()
48            .map(|i| self.tokens[i].to_fat(&self.source))
49            .collect()
50    }
51
52    /// Lexes and parses text to produce a document using a provided language
53    /// parser and dictionary.
54    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
55        let source: Lrc<_> = text.chars().collect();
56
57        Self::new_from_chars(source, parser, dictionary)
58    }
59
60    /// Lexes and parses text to produce a document using a provided language
61    /// parser and the included curated dictionary.
62    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
63        let source: Lrc<_> = text.chars().collect();
64
65        Self::new_from_chars(source, parser, &FstDictionary::curated())
66    }
67
68    /// Lexes and parses text to produce a document using a provided language
69    /// parser and dictionary.
70    pub fn new_from_chars(
71        source: Lrc<[char]>,
72        parser: &impl Parser,
73        dictionary: &impl Dictionary,
74    ) -> Self {
75        let tokens = parser.parse(&source);
76
77        let mut document = Self { source, tokens };
78        document.parse(dictionary);
79
80        document
81    }
82
83    /// Create a new document from character data using the built-in [`PlainEnglish`]
84    /// parser and curated dictionary. This avoids string-to-char conversions.
85    pub fn new_plain_english_curated_chars(source: &[char]) -> Self {
86        Self::new_from_chars(Lrc::from(source), &PlainEnglish, &FstDictionary::curated())
87    }
88
89    /// Parse text to produce a document using the built-in [`PlainEnglish`]
90    /// parser and curated dictionary.
91    pub fn new_plain_english_curated(text: &str) -> Self {
92        Self::new(text, &PlainEnglish, &FstDictionary::curated())
93    }
94
95    /// Create a new document simply by tokenizing the provided input and applying fix-ups. The
96    /// contained words will not contain any metadata.
97    ///
98    /// This avoids running potentially expensive metadata generation code, so this is more
99    /// efficient if you don't need that information.
100    pub(crate) fn new_basic_tokenize(text: &str, parser: &impl Parser) -> Self {
101        let source: Lrc<_> = text.chars().collect();
102        let tokens = parser.parse(&source);
103        let mut document = Self { source, tokens };
104        document.apply_fixups();
105        document
106    }
107
108    /// Parse text to produce a document using the built-in [`PlainEnglish`]
109    /// parser and a provided dictionary.
110    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
111        Self::new(text, &PlainEnglish, dictionary)
112    }
113
114    /// Parse text to produce a document using the built-in [`Markdown`] parser
115    /// and curated dictionary.
116    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
117        Self::new(
118            text,
119            &Markdown::new(markdown_options),
120            &FstDictionary::curated(),
121        )
122    }
123
124    /// Create a new document from character data using the built-in [`Markdown`] parser
125    /// and curated dictionary. This avoids string-to-char conversions.
126    pub fn new_markdown_default_curated_chars(chars: &[char]) -> Self {
127        Self::new_from_chars(
128            chars.to_vec().into(),
129            &Markdown::default(),
130            &FstDictionary::curated(),
131        )
132    }
133
134    /// Parse text to produce a document using the built-in [`Markdown`] parser
135    /// and curated dictionary with the default Markdown configuration.
136    pub fn new_markdown_default_curated(text: &str) -> Self {
137        Self::new_markdown_curated(text, MarkdownOptions::default())
138    }
139
140    /// Parse text to produce a document using the built-in [`PlainEnglish`]
141    /// parser and the curated dictionary.
142    pub fn new_markdown(
143        text: &str,
144        markdown_options: MarkdownOptions,
145        dictionary: &impl Dictionary,
146    ) -> Self {
147        Self::new(text, &Markdown::new(markdown_options), dictionary)
148    }
149
150    /// Parse text to produce a document using the built-in [`PlainEnglish`]
151    /// parser and the curated dictionary with the default Markdown configuration.
152    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
153        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
154    }
155
    /// Runs every token fix-up pass over the freshly parsed token stream, in a fixed order.
    ///
    /// The order matters: later passes see the tokens as rewritten by earlier ones.
    fn apply_fixups(&mut self) {
        self.condense_spaces();
        // Newline counts must be summed before they are compared against the
        // paragraph-break threshold.
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_dotted_truncations();
        self.condense_common_top_level_domains();
        self.condense_filename_extensions();
        self.condense_tldr();
        self.condense_ampersand_pairs();
        self.condense_slash_pairs();
        // Quote twins are recorded as token indices, so this has to run after every pass
        // that removes tokens.
        self.match_quotes();
    }
171
172    /// Re-parse important language constructs.
173    ///
174    /// Should be run after every change to the underlying [`Self::source`].
175    fn parse(&mut self, dictionary: &impl Dictionary) {
176        self.apply_fixups();
177
178        let chunker = burn_chunker();
179        let tagger = brill_tagger();
180
181        for sent in self.tokens.iter_sentences_mut() {
182            let token_strings: Vec<_> = sent
183                .iter()
184                .filter(|t| !t.kind.is_whitespace())
185                .map(|t| t.get_str(&self.source))
186                .collect();
187
188            let token_tags = tagger.tag_sentence(&token_strings);
189            let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);
190
191            // Annotate DictWord metadata
192            let word_sources: Vec<_> = sent
193                .iter()
194                .filter(|t| matches!(t.kind, TokenKind::Word(_)))
195                .map(|t| t.get_ch(&self.source))
196                .collect();
197
198            let mut ti = 0; // Index for token_tags/np_flags (all non-whitespace tokens)
199            let mut wi = 0; // Index for word_sources (only word tokens)
200            for token in sent.iter_mut() {
201                if let TokenKind::Word(meta) = &mut token.kind {
202                    let word_source = word_sources[wi];
203                    let mut found_meta = dictionary
204                        .get_word_metadata(word_source)
205                        .map(|c| c.into_owned());
206
207                    if let Some(inner) = &mut found_meta {
208                        inner.pos_tag = token_tags[ti].or_else(|| inner.infer_pos_tag());
209                        inner.np_member = Some(np_flags[ti]);
210                    }
211
212                    *meta = found_meta;
213                    ti += 1;
214                    wi += 1;
215                } else if !token.kind.is_whitespace() {
216                    ti += 1;
217                }
218            }
219        }
220    }
221
222    /// Convert all sets of newlines greater than 2 to paragraph breaks.
223    fn newlines_to_breaks(&mut self) {
224        for token in &mut self.tokens {
225            if let TokenKind::Newline(n) = token.kind
226                && n >= 2
227            {
228                token.kind = TokenKind::ParagraphBreak;
229            }
230        }
231    }
232
233    /// Given a list of indices, this function removes the subsequent
234    /// `stretch_len - 1` elements after each index.
235    ///
236    /// Will extend token spans to include removed elements.
237    /// Assumes condensed tokens are contiguous in source text.
238    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
239        // Update spans
240        for idx in indices {
241            let end_tok = self.tokens[idx + stretch_len - 1].clone();
242            let start_tok = &mut self.tokens[*idx];
243
244            start_tok.span.end = end_tok.span.end;
245        }
246
247        // Trim
248        let old = self.tokens.clone();
249        self.tokens.clear();
250
251        // Keep first chunk.
252        self.tokens
253            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);
254
255        let mut iter = indices.iter().peekable();
256
257        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
258            self.tokens.push(old[*a_idx].clone());
259
260            if let Some(b_idx) = b {
261                self.tokens
262                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
263            }
264        }
265
266        // Keep last chunk.
267        self.tokens.extend_from_slice(
268            &old[indices
269                .last()
270                .map(|v| v + stretch_len)
271                .unwrap_or(indices.len())..],
272        );
273    }
274
275    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
276        let index = self
277            .tokens
278            .binary_search_by(|t| {
279                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
280                    Ordering::Equal
281                } else {
282                    t.span.start.cmp(&char_index)
283                }
284            })
285            .ok()?;
286
287        Some(&self.tokens[index])
288    }
289
290    /// Defensively attempt to grab a specific token.
291    pub fn get_token(&self, index: usize) -> Option<&Token> {
292        self.tokens.get(index)
293    }
294
295    /// Get a token at a signed offset from a base index, or None if out of bounds.
296    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
297        match base.checked_add_signed(offset) {
298            None => None,
299            Some(idx) => self.get_token(idx),
300        }
301    }
302
303    /// Get an iterator over all the tokens contained in the document.
304    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
305        self.tokens.iter()
306    }
307
308    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
309        fn is_np_member(t: &Token) -> bool {
310            t.kind
311                .as_word()
312                .and_then(|x| x.as_ref())
313                .and_then(|w| w.np_member)
314                .unwrap_or(false)
315        }
316
317        fn trim(slice: &[Token]) -> &[Token] {
318            let mut start = 0;
319            let mut end = slice.len();
320            while start < end && slice[start].kind.is_whitespace() {
321                start += 1;
322            }
323            while end > start && slice[end - 1].kind.is_whitespace() {
324                end -= 1;
325            }
326            &slice[start..end]
327        }
328
329        self.tokens
330            .as_slice()
331            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
332            .filter_map(|s| {
333                let s = trim(s);
334                if s.iter().any(is_np_member) {
335                    Some(s)
336                } else {
337                    None
338                }
339            })
340    }
341
342    /// Get an iterator over all the tokens contained in the document.
343    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
344        self.tokens().map(|token| token.to_fat(&self.source))
345    }
346
347    /// Get the next or previous word token relative to a base index, if separated by whitespace.
348    /// Returns None if the next/previous token is not a word or does not exist.
349    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
350        // Look for whitespace at the expected offset
351        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
352            return None;
353        }
354        // Now look beyond the whitespace for a word token
355        let word_token = self.get_token_offset(base, offset + offset.signum());
356        let word_token = word_token?;
357        word_token.kind.is_word().then_some(word_token)
358    }
359
360    /// Get an iterator over all the tokens contained in the document.
361    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
362        self.fat_tokens().map(|t| t.into())
363    }
364
365    pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
366        span.get_content(&self.source)
367    }
368
369    pub fn get_span_content_str(&self, span: &Span<char>) -> String {
370        String::from_iter(self.get_span_content(span))
371    }
372
373    pub fn get_full_string(&self) -> String {
374        self.get_span_content_str(&Span::new(0, self.source.len()))
375    }
376
377    pub fn get_full_content(&self) -> &[char] {
378        &self.source
379    }
380
381    pub fn get_source(&self) -> &[char] {
382        &self.source
383    }
384
385    pub fn get_tokens(&self) -> &[Token] {
386        &self.tokens
387    }
388
389    /// Searches for quotation marks and fills the
390    /// [`Punctuation::Quote::twin_loc`] field. This is on a best-effort
391    /// basis.
392    ///
393    /// Current algorithm is based on https://leancrew.com/all-this/2025/03/a-mac-smart-quote-curiosity
394    fn match_quotes(&mut self) {
395        let mut pg_indices: Vec<_> = vec![0];
396        pg_indices.extend(self.iter_paragraph_break_indices());
397        pg_indices.push(self.tokens.len());
398
399        // Avoid allocation in loop
400        let mut quote_indices = Vec::new();
401        let mut open_quote_indices = Vec::new();
402
403        for (start, end) in pg_indices.into_iter().tuple_windows() {
404            let pg = &mut self.tokens[start..end];
405
406            quote_indices.clear();
407            quote_indices.extend(pg.iter_quote_indices());
408            open_quote_indices.clear();
409
410            // Find open quotes first.
411            for quote in &quote_indices {
412                let is_open = *quote == 0
413                    || pg[0..*quote].iter_word_likes().next().is_none()
414                    || pg[quote - 1].kind.is_whitespace()
415                    || matches!(
416                        pg[quote - 1].kind.as_punctuation(),
417                        Some(Punctuation::LessThan)
418                            | Some(Punctuation::OpenRound)
419                            | Some(Punctuation::OpenSquare)
420                            | Some(Punctuation::OpenCurly)
421                            | Some(Punctuation::EmDash)
422                            | Some(Punctuation::EnDash)
423                            | Some(Punctuation::Apostrophe)
424                    );
425
426                if is_open {
427                    open_quote_indices.push(*quote);
428                }
429            }
430
431            while let Some(open_idx) = open_quote_indices.pop() {
432                let Some(close_idx) = pg[open_idx + 1..].iter_quote_indices().next() else {
433                    continue;
434                };
435
436                if pg[close_idx + open_idx + 1]
437                    .kind
438                    .as_quote()
439                    .unwrap()
440                    .twin_loc
441                    .is_some()
442                {
443                    continue;
444                }
445
446                pg[open_idx].kind.as_mut_quote().unwrap().twin_loc =
447                    Some(close_idx + open_idx + start + 1);
448                pg[close_idx + open_idx + 1]
449                    .kind
450                    .as_mut_quote()
451                    .unwrap()
452                    .twin_loc = Some(open_idx + start);
453            }
454        }
455    }
456
457    /// Searches for number suffixes and condenses them down into single tokens
458    fn condense_number_suffixes(&mut self) {
459        if self.tokens.len() < 2 {
460            return;
461        }
462
463        let mut replace_starts = Vec::new();
464
465        for idx in 0..self.tokens.len() - 1 {
466            let b = &self.tokens[idx + 1];
467            let a = &self.tokens[idx];
468
469            // TODO: Allow spaces between `a` and `b`
470
471            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind)
472                && let Some(found_suffix) =
473                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
474            {
475                self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
476                replace_starts.push(idx);
477            }
478        }
479
480        self.condense_indices(&replace_starts, 2);
481    }
482
483    /// Searches for multiple sequential space tokens and condenses them down
484    /// into one.
485    fn condense_spaces(&mut self) {
486        let mut cursor = 0;
487        let copy = self.tokens.clone();
488
489        let mut remove_these = VecDeque::new();
490
491        while cursor < self.tokens.len() {
492            // Locate a stretch of one or more newline tokens.
493            let start_tok = &mut self.tokens[cursor];
494
495            if let TokenKind::Space(start_count) = &mut start_tok.kind {
496                loop {
497                    cursor += 1;
498
499                    if cursor >= copy.len() {
500                        break;
501                    }
502
503                    let child_tok = &copy[cursor];
504
505                    // Only condense adjacent spans
506                    if start_tok.span.end != child_tok.span.start {
507                        break;
508                    }
509
510                    if let TokenKind::Space(n) = child_tok.kind {
511                        *start_count += n;
512                        start_tok.span.end = child_tok.span.end;
513                        remove_these.push_back(cursor);
514                        cursor += 1;
515                    } else {
516                        break;
517                    };
518                }
519            }
520
521            cursor += 1;
522        }
523
524        self.tokens.remove_indices(remove_these);
525    }
526
    // Per-thread cache of the dotted-truncation expression, so that it is built by
    // `uncached_dotted_truncation_expr` at most once per thread instead of on every
    // `condense_dotted_truncations` call.
    thread_local! {
        static DOTTED_TRUNCATION_EXPR: Lrc<FirstMatchOf> = Document::uncached_dotted_truncation_expr();
    }
530
531    fn uncached_dotted_truncation_expr() -> Lrc<FirstMatchOf> {
532        Lrc::new(FirstMatchOf::new(vec![
533            Box::new(SequenceExpr::word_set(&["esp", "etc", "vs"]).then_period()),
534            Box::new(
535                SequenceExpr::aco("et")
536                    .then_whitespace()
537                    .t_aco("al")
538                    .then_period(),
539            ),
540        ]))
541    }
542
543    /// Assumes that the first matched token is the canonical one to be condensed into.
544    /// Takes a callback that can be used to retroactively edit the canonical token afterwards.
545    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
546    where
547        F: Fn(&mut Token),
548    {
549        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();
550
551        let mut remove_indices = VecDeque::with_capacity(matches.len());
552
553        for m in matches {
554            remove_indices.extend(m.start + 1..m.end);
555            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
556            edit(&mut self.tokens[m.start]);
557        }
558
559        self.tokens.remove_indices(remove_indices);
560    }
561
562    fn condense_dotted_truncations(&mut self) {
563        self.condense_expr(&Self::DOTTED_TRUNCATION_EXPR.with(|v| v.clone()), |_| {})
564    }
565
566    /// Searches for multiple sequential newline tokens and condenses them down
567    /// into one.
568    fn condense_newlines(&mut self) {
569        let mut cursor = 0;
570        let copy = self.tokens.clone();
571
572        let mut remove_these = VecDeque::new();
573
574        while cursor < self.tokens.len() {
575            // Locate a stretch of one or more newline tokens.
576            let start_tok = &mut self.tokens[cursor];
577
578            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
579                loop {
580                    cursor += 1;
581
582                    if cursor >= copy.len() {
583                        break;
584                    }
585
586                    let child_tok = &copy[cursor];
587                    if let TokenKind::Newline(n) = child_tok.kind {
588                        *start_count += n;
589                        start_tok.span.end = child_tok.span.end;
590                        remove_these.push_back(cursor);
591                        cursor += 1;
592                    } else {
593                        break;
594                    };
595                }
596            }
597
598            cursor += 1;
599        }
600
601        self.tokens.remove_indices(remove_these);
602    }
603
604    /// Condenses words like "i.e.", "e.g." and "N.S.A." down to single words
605    /// using a state machine.
606    fn condense_dotted_initialisms(&mut self) {
607        if self.tokens.len() < 2 {
608            return;
609        }
610
611        let mut to_remove = VecDeque::new();
612
613        let mut cursor = 1;
614
615        let mut initialism_start = None;
616
617        loop {
618            let a = &self.tokens[cursor - 1];
619            let b = &self.tokens[cursor];
620
621            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();
622
623            if is_initialism_chunk {
624                if initialism_start.is_none() {
625                    initialism_start = Some(cursor - 1);
626                } else {
627                    to_remove.push_back(cursor - 1);
628                }
629
630                to_remove.push_back(cursor);
631                cursor += 1;
632            } else {
633                if let Some(start) = initialism_start {
634                    let end = self.tokens[cursor - 2].span.end;
635                    let start_tok: &mut Token = &mut self.tokens[start];
636                    start_tok.span.end = end;
637                }
638
639                initialism_start = None;
640            }
641
642            cursor += 1;
643
644            if cursor >= self.tokens.len() - 1 {
645                break;
646            }
647        }
648
649        self.tokens.remove_indices(to_remove);
650    }
651
652    /// Condenses likely filename extensions down to single tokens.
653    fn condense_filename_extensions(&mut self) {
654        if self.tokens.len() < 2 {
655            return;
656        }
657
658        let mut to_remove = VecDeque::new();
659
660        let mut cursor = 1;
661
662        let mut ext_start = None;
663
664        loop {
665            // left context, dot, extension, right context
666            let l = self.get_token_offset(cursor, -2);
667            let d = &self.tokens[cursor - 1];
668            let x = &self.tokens[cursor];
669            let r = self.get_token_offset(cursor, 1);
670
671            let is_ext_chunk = d.kind.is_period()
672                && x.kind.is_word()
673                && x.span.len() <= 3
674                && ((l.is_none_or(|t| t.kind.is_whitespace())
675                    && r.is_none_or(|t| t.kind.is_whitespace()))
676                    || (l.is_some_and(|t| t.kind.is_open_round())
677                        && r.is_some_and(|t| t.kind.is_close_round())))
678                && {
679                    let ext_chars = x.get_ch(&self.source);
680                    ext_chars.iter().all(|c| c.is_ascii_lowercase())
681                        || ext_chars.iter().all(|c| c.is_ascii_uppercase())
682                };
683
684            if is_ext_chunk {
685                if ext_start.is_none() {
686                    ext_start = Some(cursor - 1);
687                    self.tokens[cursor - 1].kind = TokenKind::Unlintable;
688                } else {
689                    to_remove.push_back(cursor - 1);
690                }
691
692                to_remove.push_back(cursor);
693                cursor += 1;
694            } else {
695                if let Some(start) = ext_start {
696                    let end = self.tokens[cursor - 2].span.end;
697                    let start_tok: &mut Token = &mut self.tokens[start];
698                    start_tok.span.end = end;
699                }
700
701                ext_start = None;
702            }
703
704            cursor += 1;
705
706            if cursor >= self.tokens.len() {
707                break;
708            }
709        }
710
711        self.tokens.remove_indices(to_remove);
712    }
713
714    /// Condenses common top-level domains (for example: `.blog`, `.com`) down to single tokens.
715    fn condense_common_top_level_domains(&mut self) {
716        const COMMON_TOP_LEVEL_DOMAINS: &[&str; 106] = &[
717            "ai", "app", "blog", "co", "com", "dev", "edu", "gov", "info", "io", "me", "mil",
718            "net", "org", "shop", "tech", "uk", "us", "xyz", "jp", "de", "fr", "br", "it", "ru",
719            "es", "pl", "ca", "au", "cn", "in", "nl", "eu", "ch", "id", "at", "kr", "cz", "mx",
720            "be", "tv", "se", "tr", "tw", "al", "ua", "ir", "vn", "cl", "sk", "ly", "cc", "to",
721            "no", "fi", "pt", "dk", "ar", "hu", "tk", "gr", "il", "news", "ro", "my", "biz", "ie",
722            "za", "nz", "sg", "ee", "th", "pe", "bg", "hk", "rs", "lt", "link", "ph", "club", "si",
723            "site", "mobi", "by", "cat", "wiki", "la", "ga", "xxx", "cf", "hr", "ng", "jobs",
724            "online", "kz", "ug", "gq", "ae", "is", "lv", "pro", "fm", "tips", "ms", "sa", "int",
725        ];
726
727        if self.tokens.len() < 2 {
728            return;
729        }
730
731        let mut to_remove = VecDeque::new();
732        for cursor in 1..self.tokens.len() {
733            // left context, dot, tld, right context
734            let l = self.get_token_offset(cursor, -2);
735            let d = &self.tokens[cursor - 1];
736            let tld = &self.tokens[cursor];
737            let r = self.get_token_offset(cursor, 1);
738
739            let is_tld_chunk = d.kind.is_period()
740                && tld.kind.is_word()
741                && tld
742                    .get_ch(&self.source)
743                    .iter()
744                    .all(|c| c.is_ascii_alphabetic())
745                && tld
746                    .get_ch(&self.source)
747                    .eq_any_ignore_ascii_case_str(COMMON_TOP_LEVEL_DOMAINS)
748                && ((l.is_none_or(|t| t.kind.is_whitespace())
749                    && r.is_none_or(|t| t.kind.is_whitespace()))
750                    || (l.is_some_and(|t| t.kind.is_open_round())
751                        && r.is_some_and(|t| t.kind.is_close_round())));
752
753            if is_tld_chunk {
754                self.tokens[cursor - 1].kind = TokenKind::Unlintable;
755                self.tokens[cursor - 1].span.end = self.tokens[cursor].span.end;
756                to_remove.push_back(cursor);
757            }
758        }
759
760        self.tokens.remove_indices(to_remove);
761    }
762
763    /// Condenses "tl;dr" down to a single word token.
764    fn condense_tldr(&mut self) {
765        if self.tokens.len() < 3 {
766            return;
767        }
768
769        let mut to_remove = VecDeque::new();
770        let mut cursor = 2;
771
772        loop {
773            let tl = &self.tokens[cursor - 2];
774            let simicolon = &self.tokens[cursor - 1];
775            let dr = &self.tokens[cursor];
776
777            let is_tldr_chunk = tl.kind.is_word()
778                && tl.span.len() == 2
779                && tl.get_ch(&self.source).eq_ch(&['t', 'l'])
780                && simicolon.kind.is_semicolon()
781                && dr.kind.is_word()
782                && dr.span.len() >= 2
783                && dr.span.len() <= 3
784                && dr
785                    .get_ch(&self.source)
786                    .eq_any_ignore_ascii_case_chars(&[&['d', 'r'], &['d', 'r', 's']]);
787
788            if is_tldr_chunk {
789                // Update the first token to be the full "tl;dr" as a word
790                self.tokens[cursor - 2].span = Span::new(
791                    self.tokens[cursor - 2].span.start,
792                    self.tokens[cursor].span.end,
793                );
794
795                // Mark the semicolon and "dr" tokens for removal
796                to_remove.push_back(cursor - 1);
797                to_remove.push_back(cursor);
798            }
799
800            // Skip ahead since we've processed these tokens
801            cursor += 1;
802
803            if cursor >= self.tokens.len() {
804                break;
805            }
806        }
807
808        // Remove the marked tokens in reverse order to maintain correct indices
809        self.tokens.remove_indices(to_remove);
810    }
811
812    /// Allows condensing of delimited pairs of tokens into a single token.
813    ///
814    /// # Arguments
815    ///
816    /// * `is_delimiter` - A function that returns `true` if the token is a delimiter.
817    /// * `valid_pairs` - A slice of tuples representing the valid pairs of tokens to condense.
818    ///
819    fn condense_delimited_pairs<F>(&mut self, is_delimiter: F, valid_pairs: &[(char, char)])
820    where
821        F: Fn(&TokenKind) -> bool,
822    {
823        if self.tokens.len() < 3 {
824            return;
825        }
826
827        let mut to_remove = VecDeque::new();
828        let mut cursor = 2;
829
830        loop {
831            let l1 = &self.tokens[cursor - 2];
832            let delim = &self.tokens[cursor - 1];
833            let l2 = &self.tokens[cursor];
834
835            let is_delimited_chunk = l1.kind.is_word()
836                && l1.span.len() == 1
837                && is_delimiter(&delim.kind)
838                && l2.kind.is_word()
839                && l2.span.len() == 1;
840
841            if is_delimited_chunk {
842                let (l1, l2) = (
843                    l1.get_ch(&self.source).first(),
844                    l2.get_ch(&self.source).first(),
845                );
846
847                let is_valid_pair = match (l1, l2) {
848                    (Some(l1), Some(l2)) => {
849                        let pair = (l1.to_ascii_lowercase(), l2.to_ascii_lowercase());
850                        valid_pairs.contains(&pair)
851                    }
852                    _ => false,
853                };
854
855                if is_valid_pair {
856                    self.tokens[cursor - 2].span = Span::new(
857                        self.tokens[cursor - 2].span.start,
858                        self.tokens[cursor].span.end,
859                    );
860                    to_remove.push_back(cursor - 1);
861                    to_remove.push_back(cursor);
862                }
863            }
864
865            cursor += 1;
866            if cursor >= self.tokens.len() {
867                break;
868            }
869        }
870
871        self.tokens.remove_indices(to_remove);
872    }
873
874    // Condenses "ampersand pairs" such as "R&D" or "Q&A" into single tokens.
875    fn condense_ampersand_pairs(&mut self) {
876        self.condense_delimited_pairs(
877            |kind| kind.is_ampersand(),
878            &[
879                ('b', 'b'), // bed & breakfast
880                ('b', 'w'), // black & white
881                ('g', 't'), // gin & tonic
882                ('k', 'r'), // Kernighan & Ritchie
883                ('q', 'a'), // question & answer
884                ('r', 'b'), // rhythm & blues
885                ('r', 'd'), // research & development
886                ('r', 'r'), // rest & relaxation
887                ('s', 'p'), // Standard & Poor's
888            ],
889        );
890    }
891
892    // Condenses "slash pairs" such as "I/O" into single tokens.
893    fn condense_slash_pairs(&mut self) {
894        self.condense_delimited_pairs(
895            |kind| kind.is_slash(),
896            &[
897                ('a', 'c'), // aircon; alternating current
898                ('b', 'w'), // black and white
899                ('c', 'o'), // care of
900                ('d', 'c'), // direct current
901                ('d', 'l'), // download
902                ('i', 'o'), // input/output
903                ('j', 'k'), // just kidding
904                ('n', 'a'), // not applicable
905                ('r', 'c'), // radio control
906                ('s', 'n'), // serial number
907                ('y', 'n'), // yes/no
908                ('y', 'o'), // years old
909            ],
910        );
911    }
912
913    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
914        let period = SequenceExpr::default().then_period();
915        Lrc::new(Repeating::new(Box::new(period), 2))
916    }
917
    // Thread-local copy of the ellipsis expression, initialised from
    // `Document::uncached_ellipsis_pattern` and read by `condense_ellipsis`, so the expression
    // does not have to be rebuilt on every call.
    thread_local! {
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }
921
922    fn condense_ellipsis(&mut self) {
923        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
924        self.condense_expr(&expr, |tok| {
925            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
926        });
927    }
928}
929
930impl TokenStringExt for Document {
931    fn tokens(&self) -> &[Token] {
932        &self.tokens
933    }
934
935    fn tokens_mut(&mut self) -> &mut [Token] {
936        &mut self.tokens
937    }
938}
939
940impl Display for Document {
941    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
942        for token in &self.tokens {
943            write!(f, "{}", self.get_span_content_str(&token.span))?;
944        }
945
946        Ok(())
947    }
948}
949
// Unit tests for tokenization and the token-condensing passes of `Document`.
#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::TokenStringExt;
    use crate::{Span, parsers::MarkdownOptions};

    /// Asserts that `text` produces exactly `final_tok_count` tokens with both the plain English
    /// and the Markdown parser.
    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn simple_contraction4() {
        assert_condensed_contractions("doesn't", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    /// Asserts that `source`, parsed as plain English, produces exactly `count` tokens.
    ///
    /// On failure the token kinds are included in the panic message to ease debugging.
    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        let kinds = document.tokens().map(|t| t.kind.clone()).collect_vec();
        assert_eq!(document.tokens.len(), count, "token kinds: {kinds:?}");
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let foo = doc.get_next_word_from_offset(2, -1).unwrap();
        let foo = doc.get_span_content(&foo.span);
        assert_eq!(foo, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn condenses_filename_extensions() {
        let doc = Document::new_plain_english_curated(".c and .exe and .js");
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_filename_extension_ok_at_start_and_end() {
        let doc = Document::new_plain_english_curated(".c and .EXE");
        assert_eq!(doc.tokens.len(), 5);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_mixed_case() {
        let doc = Document::new_plain_english_curated(".c and .Exe");
        assert_eq!(doc.tokens.len(), 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_non_letters() {
        let doc = Document::new_plain_english_curated(".COM and .C0M");
        assert_eq!(doc.tokens.len(), 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_longer_than_three() {
        let doc = Document::new_plain_english_curated(".dll and .dlls");
        assert_eq!(doc.tokens.len(), 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn condense_filename_extension_in_parens() {
        let doc = Document::new_plain_english_curated(
            "true for the manual installation when trying to run the executable(.exe) after a manual download",
        );
        assert!(doc.tokens.len() > 23);
        assert!(doc.tokens[21].kind.is_open_round());
        assert!(doc.tokens[22].kind.is_unlintable());
        assert!(doc.tokens[23].kind.is_close_round());
    }

    #[test]
    fn condense_tldr_uppercase() {
        let doc = Document::new_plain_english_curated("TL;DR");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
        assert_eq!(doc.tokens[0].span.len(), 5);
    }

    #[test]
    fn condense_tldr_lowercase() {
        let doc = Document::new_plain_english_curated("tl;dr");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_1() {
        let doc = Document::new_plain_english_curated("tl;DR");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_2() {
        let doc = Document::new_plain_english_curated("TL;Dr");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_plural() {
        let doc = Document::new_plain_english_curated(
            "managing the flow between components to produce relevant TL;DRs of current news articles",
        );
        // no token is a punctuation token - only words with whitespace between
        assert!(
            doc.tokens
                .iter()
                .all(|t| t.kind.is_word() || t.kind.is_whitespace())
        );
        // one of the word tokens contains a ';' character
        let tldrs = doc
            .tokens
            .iter()
            .filter(|t| t.get_ch(&doc.source).contains(&';'))
            .collect_vec();
        assert_eq!(tldrs.len(), 1);
        assert_eq!(tldrs[0].get_str(&doc.source), "TL;DRs");
    }

    #[test]
    fn condense_common_top_level_domains() {
        let doc = Document::new_plain_english_curated(".blog and .com and .NET");
        assert_eq!(doc.tokens.len(), 9);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_common_top_level_domains_in_parens() {
        let doc = Document::new_plain_english_curated("(.blog)");
        assert_eq!(doc.tokens.len(), 3);
        assert!(doc.tokens[0].kind.is_open_round());
        assert!(doc.tokens[1].kind.is_unlintable());
        assert!(doc.tokens[2].kind.is_close_round());
    }

    #[test]
    fn doesnt_condense_unknown_top_level_domains() {
        let doc = Document::new_plain_english_curated(".harper");
        assert_eq!(doc.tokens.len(), 2);
        assert!(doc.tokens[0].kind.is_punctuation());
        assert!(doc.tokens[1].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_caps() {
        let doc = Document::new_plain_english_curated("R&D");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_mixed_case() {
        let doc = Document::new_plain_english_curated("R&d");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_lowercase() {
        let doc = Document::new_plain_english_curated("r&d");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn dont_condense_r_and_d_with_spaces() {
        let doc = Document::new_plain_english_curated("R & D");
        assert_eq!(doc.tokens.len(), 5);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[1].kind.is_whitespace());
        assert!(doc.tokens[2].kind.is_ampersand());
        assert!(doc.tokens[3].kind.is_whitespace());
        assert!(doc.tokens[4].kind.is_word());
    }

    #[test]
    fn condense_q_and_a() {
        let doc =
            Document::new_plain_english_curated("A Q&A platform software for teams at any scales.");
        assert!(doc.tokens.len() >= 3);
        assert!(doc.tokens[2].kind.is_word());
        assert_eq!(doc.tokens[2].get_str(&doc.source), "Q&A");
    }

    #[test]
    fn dont_allow_mixed_r_and_d_with_q_and_a() {
        let doc = Document::new_plain_english_curated("R&A or Q&D");
        assert_eq!(doc.tokens.len(), 9);
        // Neither "R&A" nor "Q&D" is a known pair, so both ampersands must survive.
        assert!(doc.tokens[1].kind.is_ampersand());
        assert!(doc.tokens[7].kind.is_ampersand());
    }

    #[test]
    fn condense_io() {
        let doc = Document::new_plain_english_curated("I/O");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn finds_unmatched_quotes_in_document() {
        let raw = r#"
This is a paragraph with a single word "quoted."

This is a second paragraph with no quotes.

This is a third paragraph with a single erroneous "quote.

This is a final paragraph with a weird "quote and a not-weird "quote".
            "#;

        let doc = Document::new_markdown_default_curated(raw);

        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        assert_eq!(
            quote_twins,
            vec![Some(19), Some(16), None, None, Some(89), Some(87)]
        )
    }

    #[test]
    fn issue_1901() {
        let raw = r#"
"A quoted line"
"A quote without a closing mark
"Another quoted lined"
"The last quoted line"
            "#;

        let doc = Document::new_markdown_default_curated(raw);

        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        assert_eq!(
            quote_twins,
            vec![
                Some(6),
                Some(0),
                None,
                Some(27),
                Some(21),
                Some(37),
                Some(29)
            ]
        )
    }
}