Skip to main content

harper_core/
document.rs

1use std::cmp::Ordering;
2use std::collections::VecDeque;
3use std::fmt::Display;
4
5use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
6use itertools::Itertools;
7use paste::paste;
8
9use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
10use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
11use crate::punctuation::Punctuation;
12use crate::spell::{Dictionary, FstDictionary};
13use crate::vec_ext::VecExt;
14use crate::{CharStringExt, FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
15use crate::{OrdinalSuffix, Span};
16
/// A document containing some amount of lexed and parsed English text.
#[derive(Debug, Clone)]
pub struct Document {
    /// The original text, as a shared, immutable character buffer.
    source: Lrc<[char]>,
    /// Tokens produced by the parser, then refined by the fix-up and metadata passes.
    tokens: Vec<Token>,
}
23
24impl Default for Document {
25    fn default() -> Self {
26        Self::new("", &PlainEnglish, &FstDictionary::curated())
27    }
28}
29
30impl Document {
31    /// Locate all the tokens that intersect a provided span.
32    ///
33    /// Desperately needs optimization.
34    pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
35        self.tokens()
36            .enumerate()
37            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
38            .collect()
39    }
40
41    /// Locate all the tokens that intersect a provided span and convert them to [`FatToken`]s.
42    ///
43    /// Desperately needs optimization.
44    pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
45        let indices = self.token_indices_intersecting(span);
46
47        indices
48            .into_iter()
49            .map(|i| self.tokens[i].to_fat(&self.source))
50            .collect()
51    }
52
53    /// Lexes and parses text to produce a document using a provided language
54    /// parser and dictionary.
55    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
56        let source: Lrc<_> = text.chars().collect();
57
58        Self::new_from_chars(source, parser, dictionary)
59    }
60
61    /// Lexes and parses text to produce a document using a provided language
62    /// parser and the included curated dictionary.
63    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
64        let source: Lrc<_> = text.chars().collect();
65
66        Self::new_from_chars(source, parser, &FstDictionary::curated())
67    }
68
69    /// Lexes and parses text to produce a document using a provided language
70    /// parser and dictionary.
71    pub fn new_from_chars(
72        source: Lrc<[char]>,
73        parser: &impl Parser,
74        dictionary: &impl Dictionary,
75    ) -> Self {
76        let tokens = parser.parse(&source);
77
78        let mut document = Self { source, tokens };
79        document.parse(dictionary);
80
81        document
82    }
83
84    /// Create a new document from character data using the built-in [`PlainEnglish`]
85    /// parser and curated dictionary. This avoids string-to-char conversions.
86    pub fn new_plain_english_curated_chars(source: &[char]) -> Self {
87        Self::new_from_chars(Lrc::from(source), &PlainEnglish, &FstDictionary::curated())
88    }
89
90    /// Parse text to produce a document using the built-in [`PlainEnglish`]
91    /// parser and curated dictionary.
92    pub fn new_plain_english_curated(text: &str) -> Self {
93        Self::new(text, &PlainEnglish, &FstDictionary::curated())
94    }
95
96    /// Create a new document simply by tokenizing the provided input and applying fix-ups. The
97    /// contained words will not contain any metadata.
98    ///
99    /// This avoids running potentially expensive metadata generation code, so this is more
100    /// efficient if you don't need that information.
101    pub(crate) fn new_basic_tokenize(text: &str, parser: &impl Parser) -> Self {
102        let source: Lrc<_> = text.chars().collect();
103        let tokens = parser.parse(&source);
104        let mut document = Self { source, tokens };
105        document.apply_fixups();
106        document
107    }
108
109    /// Parse text to produce a document using the built-in [`PlainEnglish`]
110    /// parser and a provided dictionary.
111    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
112        Self::new(text, &PlainEnglish, dictionary)
113    }
114
115    /// Parse text to produce a document using the built-in [`Markdown`] parser
116    /// and curated dictionary.
117    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
118        Self::new(
119            text,
120            &Markdown::new(markdown_options),
121            &FstDictionary::curated(),
122        )
123    }
124
125    /// Create a new document from character data using the built-in [`Markdown`] parser
126    /// and curated dictionary. This avoids string-to-char conversions.
127    pub fn new_markdown_default_curated_chars(chars: &[char]) -> Self {
128        Self::new_from_chars(
129            chars.to_vec().into(),
130            &Markdown::default(),
131            &FstDictionary::curated(),
132        )
133    }
134
135    /// Parse text to produce a document using the built-in [`Markdown`] parser
136    /// and curated dictionary with the default Markdown configuration.
137    pub fn new_markdown_default_curated(text: &str) -> Self {
138        Self::new_markdown_curated(text, MarkdownOptions::default())
139    }
140
    /// Parse text to produce a document using the built-in [`Markdown`]
    /// parser and a provided dictionary.
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }
150
    /// Parse text to produce a document using the built-in [`Markdown`]
    /// parser and a provided dictionary, with the default Markdown configuration.
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }
156
    /// Run every token-level fix-up pass over the freshly parsed token stream.
    ///
    /// NOTE(review): pass order appears significant (whitespace is condensed
    /// before the dot-based passes inspect their neighbors) — preserve it when
    /// adding new passes.
    fn apply_fixups(&mut self) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_dotted_truncations();
        self.condense_common_top_level_domains();
        self.condense_filename_extensions();
        self.condense_tldr();
        self.condense_ampersand_pairs();
        self.condense_slash_pairs();
        self.match_quotes();
    }
172
    /// Re-parse important language constructs.
    ///
    /// Should be run after every change to the underlying [`Self::source`].
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.apply_fixups();

        let chunker = burn_chunker();
        let tagger = brill_tagger();

        for sent in self.tokens.iter_sentences_mut() {
            // The tagger and chunker operate on the non-whitespace tokens only,
            // so their outputs are indexed by non-whitespace position (`ti` below).
            let token_strings: Vec<_> = sent
                .iter()
                .filter(|t| !t.kind.is_whitespace())
                .map(|t| t.get_str(&self.source))
                .collect();

            let token_tags = tagger.tag_sentence(&token_strings);
            let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);

            // Annotate DictWord metadata
            let word_sources: Vec<_> = sent
                .iter()
                .filter(|t| matches!(t.kind, TokenKind::Word(_)))
                .map(|t| t.get_ch(&self.source))
                .collect();

            let mut ti = 0; // Index for token_tags/np_flags (all non-whitespace tokens)
            let mut wi = 0; // Index for word_sources (only word tokens)
            for token in sent.iter_mut() {
                if let TokenKind::Word(meta) = &mut token.kind {
                    let word_source = word_sources[wi];
                    // Start from the dictionary's metadata for this word, if any.
                    let mut found_meta = dictionary
                        .get_word_metadata(word_source)
                        .map(|c| c.into_owned());

                    // Overlay the tagger/chunker output on the dictionary entry.
                    if let Some(inner) = &mut found_meta {
                        inner.pos_tag = token_tags[ti].or_else(|| inner.infer_pos_tag());
                        inner.np_member = Some(np_flags[ti]);
                    }

                    *meta = found_meta;
                    ti += 1;
                    wi += 1;
                } else if !token.kind.is_whitespace() {
                    // Punctuation, numbers, etc. consumed a tagger slot but carry no metadata.
                    ti += 1;
                }
            }
        }
    }
222
    /// Convert every newline token representing two or more newlines into a
    /// paragraph break.
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind
                && n >= 2
            {
                token.kind = TokenKind::ParagraphBreak;
            }
        }
    }
233
    /// Given a list of indices, this function removes the subsequent
    /// `stretch_len - 1` elements after each index.
    ///
    /// Will extend token spans to include removed elements.
    /// Assumes condensed tokens are contiguous in source text.
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // Update spans: each kept token absorbs the span end of the last token
        // in its stretch.
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Trim
        let old = self.tokens.clone();
        self.tokens.clear();

        // Keep first chunk.
        // When `indices` is empty, `indices.len()` is 0, so nothing is copied
        // here and the final `extend_from_slice` below restores everything.
        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        // Copy each kept (condensed) token, then the untouched run up to the
        // next condensation point.
        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        // Keep last chunk.
        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }
275
    /// Find the token whose span contains the given character index, if any.
    ///
    /// Relies on `tokens` being ordered by span start so a binary search applies.
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                // Treat any token overlapping a 1-char probe span as an exact hit;
                // otherwise order by span start to steer the search.
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }
290
291    /// Defensively attempt to grab a specific token.
292    pub fn get_token(&self, index: usize) -> Option<&Token> {
293        self.tokens.get(index)
294    }
295
296    /// Get a token at a signed offset from a base index, or None if out of bounds.
297    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
298        match base.checked_add_signed(offset) {
299            None => None,
300            Some(idx) => self.get_token(idx),
301        }
302    }
303
304    /// Get an iterator over all the tokens contained in the document.
305    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
306        self.tokens.iter()
307    }
308
    /// Iterate over contiguous runs of tokens flagged as nominal phrases by the
    /// chunker, with surrounding whitespace trimmed off.
    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
        // A token counts as an NP member only when its word metadata says so.
        fn is_np_member(t: &Token) -> bool {
            t.kind
                .as_word()
                .and_then(|x| x.as_ref())
                .and_then(|w| w.np_member)
                .unwrap_or(false)
        }

        // Strip leading and trailing whitespace tokens from a run.
        fn trim(slice: &[Token]) -> &[Token] {
            let mut start = 0;
            let mut end = slice.len();
            while start < end && slice[start].kind.is_whitespace() {
                start += 1;
            }
            while end > start && slice[end - 1].kind.is_whitespace() {
                end -= 1;
            }
            &slice[start..end]
        }

        // Split on any token that is neither an NP member nor whitespace, then
        // keep only trimmed runs that actually contain an NP member.
        self.tokens
            .as_slice()
            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
            .filter_map(|s| {
                let s = trim(s);
                if s.iter().any(is_np_member) {
                    Some(s)
                } else {
                    None
                }
            })
    }
342
    /// Get an iterator over all the tokens contained in the document, with
    /// their source text attached as [`FatToken`]s.
    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }
347
348    /// Get the next or previous word token relative to a base index, if separated by whitespace.
349    /// Returns None if the next/previous token is not a word or does not exist.
350    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
351        // Look for whitespace at the expected offset
352        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
353            return None;
354        }
355        // Now look beyond the whitespace for a word token
356        let word_token = self.get_token_offset(base, offset + offset.signum());
357        let word_token = word_token?;
358        word_token.kind.is_word().then_some(word_token)
359    }
360
    /// Get an iterator over all the tokens contained in the document, with
    /// their source text attached as owned [`FatStringToken`]s.
    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
        self.fat_tokens().map(|t| t.into())
    }
365
366    pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
367        span.get_content(&self.source)
368    }
369
370    pub fn get_span_content_str(&self, span: &Span<char>) -> String {
371        String::from_iter(self.get_span_content(span))
372    }
373
374    pub fn get_full_string(&self) -> String {
375        self.get_span_content_str(&Span::new(0, self.source.len()))
376    }
377
378    pub fn get_full_content(&self) -> &[char] {
379        &self.source
380    }
381
382    pub fn get_source(&self) -> &[char] {
383        &self.source
384    }
385
386    pub fn get_tokens(&self) -> &[Token] {
387        &self.tokens
388    }
389
    /// Searches for quotation marks and fills the
    /// [`Punctuation::Quote::twin_loc`] field. This is on a best-effort
    /// basis.
    ///
    /// Current algorithm is based on
    /// <https://leancrew.com/all-this/2025/03/a-mac-smart-quote-curiosity>
    fn match_quotes(&mut self) {
        // Paragraph boundaries, with sentinels at 0 and `len` so `tuple_windows`
        // yields every paragraph as a (start, end) pair.
        let mut pg_indices: Vec<_> = vec![0];
        pg_indices.extend(self.iter_paragraph_break_indices());
        pg_indices.push(self.tokens.len());

        // Avoid allocation in loop
        let mut quote_indices = Vec::new();
        let mut open_quote_indices = Vec::new();

        // Quotes are only matched within a single paragraph.
        for (start, end) in pg_indices.into_iter().tuple_windows() {
            let pg = &mut self.tokens[start..end];

            quote_indices.clear();
            quote_indices.extend(pg.iter_quote_indices());
            open_quote_indices.clear();

            // Find open quotes first.
            for quote in &quote_indices {
                // A quote is "open" when it starts the paragraph, has no word
                // before it, or directly follows whitespace/opening punctuation.
                let is_open = *quote == 0
                    || pg[0..*quote].iter_word_likes().next().is_none()
                    || pg[quote - 1].kind.is_whitespace()
                    || matches!(
                        pg[quote - 1].kind.as_punctuation(),
                        Some(Punctuation::LessThan)
                            | Some(Punctuation::OpenRound)
                            | Some(Punctuation::OpenSquare)
                            | Some(Punctuation::OpenCurly)
                            | Some(Punctuation::Apostrophe)
                    );

                if is_open {
                    open_quote_indices.push(*quote);
                }
            }

            // Pair open quotes innermost-first (stack pop) with the next quote
            // after them. `close_idx` is relative to `pg[open_idx + 1..]`, hence
            // the `close_idx + open_idx + 1` arithmetic below.
            while let Some(open_idx) = open_quote_indices.pop() {
                let Some(close_idx) = pg[open_idx + 1..].iter_quote_indices().next() else {
                    continue;
                };

                // Skip quotes already claimed as someone else's twin.
                if pg[close_idx + open_idx + 1]
                    .kind
                    .as_quote()
                    .unwrap()
                    .twin_loc
                    .is_some()
                {
                    continue;
                }

                // `twin_loc` values are document-absolute, hence the `start` offset.
                pg[open_idx].kind.as_mut_quote().unwrap().twin_loc =
                    Some(close_idx + open_idx + start + 1);
                pg[close_idx + open_idx + 1]
                    .kind
                    .as_mut_quote()
                    .unwrap()
                    .twin_loc = Some(open_idx + start);
            }
        }
    }
455
    /// Searches for number suffixes and condenses them down into single tokens
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            // TODO: Allow spaces between `a` and `b`

            // A number immediately followed by an ordinal word ("1" + "st") gets
            // the suffix folded into the number token itself.
            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind)
                && let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
            {
                self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                replace_starts.push(idx);
            }
        }

        // Each match merges exactly two tokens: the number and its suffix word.
        self.condense_indices(&replace_starts, 2);
    }
481
    /// Searches for multiple sequential space tokens and condenses them down
    /// into one.
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        // Snapshot used for read-only lookahead while stretch heads in
        // `self.tokens` are being mutated.
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more space tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    // Only condense adjacent spans
                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        // Fold the child's count and span into the stretch head.
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        // NOTE(review): this increment plus the one at the top of
                        // the loop advances the cursor by two per merged token,
                        // which skips every other token in a run of 3+ adjacent
                        // space tokens — confirm such runs cannot occur here.
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }
525
    thread_local! {
        // One cached copy of the expression per thread; handing out `Lrc`
        // clones of it is cheap.
        static DOTTED_TRUNCATION_EXPR: Lrc<FirstMatchOf> = Document::uncached_dotted_truncation_expr();
    }

    /// Build the expression matching dotted truncations: `esp.`, `etc.`, `vs.`,
    /// and the two-word `et al.`.
    fn uncached_dotted_truncation_expr() -> Lrc<FirstMatchOf> {
        Lrc::new(FirstMatchOf::new(vec![
            Box::new(SequenceExpr::word_set(&["esp", "etc", "vs"]).then_period()),
            // "et" + whitespace + "al" + "." — NOTE(review): `aco`/`t_aco`
            // presumably match any capitalization; confirm against SequenceExpr.
            Box::new(
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }
541
    /// Assumes that the first matched token is the canonical one to be condensed into.
    /// Takes a callback that can be used to retroactively edit the canonical token afterwards.
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        // Collect matches up front: mutating `self.tokens` below would otherwise
        // alias an iterator borrowed from `self`.
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            // Drop every matched token except the first…
            remove_indices.extend(m.start + 1..m.end);
            // …which absorbs the span of the whole match.
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }
560
561    fn condense_dotted_truncations(&mut self) {
562        self.condense_expr(&Self::DOTTED_TRUNCATION_EXPR.with(|v| v.clone()), |_| {})
563    }
564
    /// Searches for multiple sequential newline tokens and condenses them down
    /// into one.
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        // Snapshot used for read-only lookahead while stretch heads in
        // `self.tokens` are being mutated.
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more newline tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        // Fold the child's count and span into the stretch head.
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        // NOTE(review): this increment plus the one at the top of
                        // the loop advances the cursor by two per merged token,
                        // which skips every other token in a run of 3+ adjacent
                        // newline tokens — confirm such runs cannot occur here.
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }
602
    /// Condenses words like "i.e.", "e.g." and "N.S.A." down to single words
    /// using a state machine.
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        // Periods and follow-up letters to delete once scanning is done.
        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        // Index of the first letter of the initialism currently being scanned.
        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            // A "chunk" is a single-letter word immediately followed by a period.
            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    // Later letters get absorbed into the head token.
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                // Skip over the period so the next window starts on the next letter.
                cursor += 1;
            } else {
                // Stretch ended: extend the head token's span over the final period.
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            // NOTE(review): an initialism running right up to the end of the
            // stream exits here without the span-extension step above — confirm
            // trailing tokens make that unreachable in practice.
            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
650
    /// Condenses likely filename extensions down to single tokens.
    fn condense_filename_extensions(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut ext_start = None;

        loop {
            // left context, dot, extension, right context
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let x = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            // A dot followed by a short word (at most 3 chars), isolated by
            // whitespace/document edges or wrapped in round brackets, and
            // written in a single case (all-lower or all-upper, not mixed).
            let is_ext_chunk = d.kind.is_period()
                && x.kind.is_word()
                && x.span.len() <= 3
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())))
                && {
                    let ext_chars = x.get_ch(&self.source);
                    ext_chars.iter().all(|c| c.is_ascii_lowercase())
                        || ext_chars.iter().all(|c| c.is_ascii_uppercase())
                };

            if is_ext_chunk {
                if ext_start.is_none() {
                    ext_start = Some(cursor - 1);
                    // Extensions should not be spell-checked or linted.
                    self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                // Stretch ended: extend the head token's span over what was merged.
                if let Some(start) = ext_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                ext_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
712
    /// Condenses common top-level domains (for example: `.blog`, `.com`) down to single tokens.
    fn condense_common_top_level_domains(&mut self) {
        // Allow-list of TLDs worth condensing; matched case-insensitively below.
        const COMMON_TOP_LEVEL_DOMAINS: &[&str; 106] = &[
            "ai", "app", "blog", "co", "com", "dev", "edu", "gov", "info", "io", "me", "mil",
            "net", "org", "shop", "tech", "uk", "us", "xyz", "jp", "de", "fr", "br", "it", "ru",
            "es", "pl", "ca", "au", "cn", "in", "nl", "eu", "ch", "id", "at", "kr", "cz", "mx",
            "be", "tv", "se", "tr", "tw", "al", "ua", "ir", "vn", "cl", "sk", "ly", "cc", "to",
            "no", "fi", "pt", "dk", "ar", "hu", "tk", "gr", "il", "news", "ro", "my", "biz", "ie",
            "za", "nz", "sg", "ee", "th", "pe", "bg", "hk", "rs", "lt", "link", "ph", "club", "si",
            "site", "mobi", "by", "cat", "wiki", "la", "ga", "xxx", "cf", "hr", "ng", "jobs",
            "online", "kz", "ug", "gq", "ae", "is", "lv", "pro", "fm", "tips", "ms", "sa", "int",
        ];

        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();
        for cursor in 1..self.tokens.len() {
            // left context, dot, tld, right context
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let tld = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            // A dot plus a known TLD word, isolated by whitespace/document edges
            // or wrapped in round brackets.
            let is_tld_chunk = d.kind.is_period()
                && tld.kind.is_word()
                && tld
                    .get_ch(&self.source)
                    .iter()
                    .all(|c| c.is_ascii_alphabetic())
                && tld
                    .get_ch(&self.source)
                    .eq_any_ignore_ascii_case_str(COMMON_TOP_LEVEL_DOMAINS)
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())));

            if is_tld_chunk {
                // The dot token becomes an unlintable token covering ".tld".
                self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                self.tokens[cursor - 1].span.end = self.tokens[cursor].span.end;
                to_remove.push_back(cursor);
            }
        }

        self.tokens.remove_indices(to_remove);
    }
761
762    /// Condenses "tl;dr" down to a single word token.
763    fn condense_tldr(&mut self) {
764        if self.tokens.len() < 3 {
765            return;
766        }
767
768        let mut to_remove = VecDeque::new();
769        let mut cursor = 2;
770
771        loop {
772            let tl = &self.tokens[cursor - 2];
773            let simicolon = &self.tokens[cursor - 1];
774            let dr = &self.tokens[cursor];
775
776            let is_tldr_chunk = tl.kind.is_word()
777                && tl.span.len() == 2
778                && tl.get_ch(&self.source).eq_ch(&['t', 'l'])
779                && simicolon.kind.is_semicolon()
780                && dr.kind.is_word()
781                && dr.span.len() >= 2
782                && dr.span.len() <= 3
783                && dr
784                    .get_ch(&self.source)
785                    .eq_any_ignore_ascii_case_chars(&[&['d', 'r'], &['d', 'r', 's']]);
786
787            if is_tldr_chunk {
788                // Update the first token to be the full "tl;dr" as a word
789                self.tokens[cursor - 2].span = Span::new(
790                    self.tokens[cursor - 2].span.start,
791                    self.tokens[cursor].span.end,
792                );
793
794                // Mark the semicolon and "dr" tokens for removal
795                to_remove.push_back(cursor - 1);
796                to_remove.push_back(cursor);
797            }
798
799            // Skip ahead since we've processed these tokens
800            cursor += 1;
801
802            if cursor >= self.tokens.len() {
803                break;
804            }
805        }
806
807        // Remove the marked tokens in reverse order to maintain correct indices
808        self.tokens.remove_indices(to_remove);
809    }
810
    /// Allows condensing of delimited pairs of tokens into a single token.
    ///
    /// # Arguments
    ///
    /// * `is_delimiter` - A function that returns `true` if the token is a delimiter.
    /// * `valid_pairs` - A slice of tuples representing the valid pairs of tokens to condense.
    ///
    fn condense_delimited_pairs<F>(&mut self, is_delimiter: F, valid_pairs: &[(char, char)])
    where
        F: Fn(&TokenKind) -> bool,
    {
        if self.tokens.len() < 3 {
            return;
        }

        let mut to_remove = VecDeque::new();
        let mut cursor = 2;

        loop {
            // Candidate window: single letter, delimiter, single letter.
            let l1 = &self.tokens[cursor - 2];
            let delim = &self.tokens[cursor - 1];
            let l2 = &self.tokens[cursor];

            let is_delimited_chunk = l1.kind.is_word()
                && l1.span.len() == 1
                && is_delimiter(&delim.kind)
                && l2.kind.is_word()
                && l2.span.len() == 1;

            if is_delimited_chunk {
                let (l1, l2) = (
                    l1.get_ch(&self.source).first(),
                    l2.get_ch(&self.source).first(),
                );

                // Pairs are matched case-insensitively against the allow-list.
                let is_valid_pair = match (l1, l2) {
                    (Some(l1), Some(l2)) => {
                        let pair = (l1.to_ascii_lowercase(), l2.to_ascii_lowercase());
                        valid_pairs.contains(&pair)
                    }
                    _ => false,
                };

                if is_valid_pair {
                    // The first letter's token absorbs the delimiter and second letter.
                    self.tokens[cursor - 2].span = Span::new(
                        self.tokens[cursor - 2].span.start,
                        self.tokens[cursor].span.end,
                    );
                    to_remove.push_back(cursor - 1);
                    to_remove.push_back(cursor);
                }
            }

            cursor += 1;
            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
872
873    // Condenses "ampersand pairs" such as "R&D" or "Q&A" into single tokens.
874    fn condense_ampersand_pairs(&mut self) {
875        self.condense_delimited_pairs(
876            |kind| kind.is_ampersand(),
877            &[
878                ('b', 'b'), // bed & breakfast
879                ('b', 'w'), // black & white
880                ('g', 't'), // gin & tonic
881                ('k', 'r'), // Kernighan & Ritchie
882                ('q', 'a'), // question & answer
883                ('r', 'b'), // rhythm & blues
884                ('r', 'd'), // research & development
885                ('r', 'r'), // rest & relaxation
886                ('s', 'p'), // Standard & Poor's
887            ],
888        );
889    }
890
891    // Condenses "slash pairs" such as "I/O" into single tokens.
892    fn condense_slash_pairs(&mut self) {
893        self.condense_delimited_pairs(
894            |kind| kind.is_slash(),
895            &[
896                ('a', 'c'), // aircon; alternating current
897                ('b', 'w'), // black and white
898                ('c', 'o'), // care of
899                ('d', 'c'), // direct current
900                ('d', 'l'), // download
901                ('i', 'o'), // input/output
902                ('j', 'k'), // just kidding
903                ('n', 'a'), // not applicable
904                ('r', 'c'), // radio control
905                ('s', 'n'), // serial number
906                ('y', 'n'), // yes/no
907                ('y', 'o'), // years old
908            ],
909        );
910    }
911
912    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
913        let period = SequenceExpr::default().then_period();
914        Lrc::new(Repeating::new(Box::new(period), 2))
915    }
916
    thread_local! {
        // Per-thread cache of the ellipsis expression, built lazily on first use
        // so the pattern isn't reconstructed on every call to `condense_ellipsis`.
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }
920
921    fn condense_ellipsis(&mut self) {
922        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
923        self.condense_expr(&expr, |tok| {
924            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
925        });
926    }
927}
928
/// Creates functions necessary to implement [`TokenStringExt`] on a document.
///
/// For a given token category `$thing`, generates the delegating methods
/// `first_$thing`, `last_$thing`, `last_$thing_index`, `iter_$thing_indices`,
/// and `iter_$things`, each forwarding to the same-named method on `self.tokens`.
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            // First token of this category, if any.
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            // Last token of this category, if any.
            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            // Index of the last token of this category, if any.
            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            // Iterator over the indices of all tokens of this category.
            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            // Iterator over all tokens of this category.
            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}
955
// Implements `TokenStringExt` for `Document` by delegating every method to the
// underlying token buffer (`self.tokens`). The per-category accessors are
// generated by `create_fns_on_doc!`; the remaining methods are forwarded by hand.
impl TokenStringExt for Document {
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);
    create_fns_on_doc!(heading_start);

    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    // Span covering all tokens, if the document is non-empty.
    fn span(&self) -> Option<Span<char>> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_headings(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_headings()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }

    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
        self.tokens.iter_sentences_mut()
    }
}
1022
1023impl Display for Document {
1024    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1025        for token in &self.tokens {
1026            write!(f, "{}", self.get_span_content_str(&token.span))?;
1027        }
1028
1029        Ok(())
1030    }
1031}
1032
#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::TokenStringExt;
    use crate::{Span, parsers::MarkdownOptions};

    /// Asserts that `text` lexes to exactly `final_tok_count` tokens under
    /// both the plain-English and Markdown parsers — contraction condensing
    /// should behave identically in both.
    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn simple_contraction4() {
        assert_condensed_contractions("doesn't", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    /// Asserts the token count of `source` under the plain-English parser,
    /// dumping the token kinds to aid debugging on failure.
    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        // Printed only on test failure; helps diagnose which tokens were produced.
        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn condenses_filename_extensions() {
        let doc = Document::new_plain_english_curated(".c and .exe and .js");
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_filename_extension_ok_at_start_and_end() {
        let doc = Document::new_plain_english_curated(".c and .EXE");
        assert_eq!(doc.tokens.len(), 5);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_mixed_case() {
        let doc = Document::new_plain_english_curated(".c and .Exe");
        assert_eq!(doc.tokens.len(), 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_non_letters() {
        let doc = Document::new_plain_english_curated(".COM and .C0M");
        assert_eq!(doc.tokens.len(), 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_longer_than_three() {
        let doc = Document::new_plain_english_curated(".dll and .dlls");
        assert_eq!(doc.tokens.len(), 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn condense_filename_extension_in_parens() {
        let doc = Document::new_plain_english_curated(
            "true for the manual installation when trying to run the executable(.exe) after a manual download",
        );
        assert!(doc.tokens.len() > 23);
        assert!(doc.tokens[21].kind.is_open_round());
        assert!(doc.tokens[22].kind.is_unlintable());
        assert!(doc.tokens[23].kind.is_close_round());
    }

    #[test]
    fn condense_tldr_uppercase() {
        let doc = Document::new_plain_english_curated("TL;DR");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
        assert_eq!(doc.tokens[0].span.len(), 5);
    }

    #[test]
    fn condense_tldr_lowercase() {
        let doc = Document::new_plain_english_curated("tl;dr");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_1() {
        let doc = Document::new_plain_english_curated("tl;DR");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_2() {
        let doc = Document::new_plain_english_curated("TL;Dr");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_plural() {
        let doc = Document::new_plain_english_curated(
            "managing the flow between components to produce relevant TL;DRs of current news articles",
        );
        // no token is a punctuation token - only words with whitespace between
        assert!(
            doc.tokens
                .iter()
                .all(|t| t.kind.is_word() || t.kind.is_whitespace())
        );
        // one of the word tokens contains a ';' character
        let tldrs = doc
            .tokens
            .iter()
            .filter(|t| t.get_ch(&doc.source).contains(&';'))
            .collect_vec();
        assert_eq!(tldrs.len(), 1);
        assert_eq!(tldrs[0].get_str(&doc.source), "TL;DRs");
    }

    #[test]
    fn condense_common_top_level_domains() {
        let doc = Document::new_plain_english_curated(".blog and .com and .NET");
        assert_eq!(doc.tokens.len(), 9);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_common_top_level_domains_in_parens() {
        let doc = Document::new_plain_english_curated("(.blog)");
        assert_eq!(doc.tokens.len(), 3);
        assert!(doc.tokens[0].kind.is_open_round());
        assert!(doc.tokens[1].kind.is_unlintable());
        assert!(doc.tokens[2].kind.is_close_round());
    }

    #[test]
    fn doesnt_condense_unknown_top_level_domains() {
        let doc = Document::new_plain_english_curated(".harper");
        assert_eq!(doc.tokens.len(), 2);
        assert!(doc.tokens[0].kind.is_punctuation());
        assert!(doc.tokens[1].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_caps() {
        let doc = Document::new_plain_english_curated("R&D");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_mixed_case() {
        let doc = Document::new_plain_english_curated("R&d");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_lowercase() {
        let doc = Document::new_plain_english_curated("r&d");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn dont_condense_r_and_d_with_spaces() {
        let doc = Document::new_plain_english_curated("R & D");
        assert_eq!(doc.tokens.len(), 5);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[1].kind.is_whitespace());
        assert!(doc.tokens[2].kind.is_ampersand());
        assert!(doc.tokens[3].kind.is_whitespace());
        assert!(doc.tokens[4].kind.is_word());
    }

    #[test]
    fn condense_q_and_a() {
        let doc =
            Document::new_plain_english_curated("A Q&A platform software for teams at any scales.");
        assert!(doc.tokens.len() >= 3);
        assert!(doc.tokens[2].kind.is_word());
        assert_eq!(doc.tokens[2].get_str(&doc.source), "Q&A");
    }

    #[test]
    fn dont_allow_mixed_r_and_d_with_q_and_a() {
        let doc = Document::new_plain_english_curated("R&A or Q&D");
        assert_eq!(doc.tokens.len(), 9);
        assert!(doc.tokens[1].kind.is_ampersand() || doc.tokens[7].kind.is_ampersand());
    }

    #[test]
    fn condense_io() {
        let doc = Document::new_plain_english_curated("I/O");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn finds_unmatched_quotes_in_document() {
        let raw = r#"
This is a paragraph with a single word "quoted."

This is a second paragraph with no quotes.

This is a third paragraph with a single erroneous "quote.

This is a final paragraph with a weird "quote and a not-weird "quote".
            "#;

        let doc = Document::new_markdown_default_curated(raw);

        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        assert_eq!(
            quote_twins,
            vec![Some(19), Some(16), None, None, Some(89), Some(87)]
        )
    }

    #[test]
    fn issue_1901() {
        let raw = r#"
"A quoted line"
"A quote without a closing mark
"Another quoted lined"
"The last quoted line"
            "#;

        let doc = Document::new_markdown_default_curated(raw);

        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        assert_eq!(
            quote_twins,
            vec![
                Some(6),
                Some(0),
                None,
                Some(27),
                Some(21),
                Some(37),
                Some(29)
            ]
        )
    }
}