//! harper_core/document.rs — a document of lexed and parsed English text.
1use std::cmp::Ordering;
2use std::collections::VecDeque;
3use std::fmt::Display;
4
5use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
6use itertools::Itertools;
7use paste::paste;
8
9use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
10use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
11use crate::patterns::WordSet;
12use crate::punctuation::Punctuation;
13use crate::spell::{Dictionary, FstDictionary};
14use crate::vec_ext::VecExt;
15use crate::{CharStringExt, FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
16use crate::{OrdinalSuffix, Span};
17
/// A document containing some amount of lexed and parsed English text.
#[derive(Debug, Clone)]
pub struct Document {
    // The raw source text as characters, shared cheaply via `Lrc`.
    source: Lrc<Vec<char>>,
    // Tokens produced by parsing `source`, kept in source order.
    tokens: Vec<Token>,
}
24
impl Default for Document {
    /// An empty document, parsed as plain English with the curated dictionary.
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}
30
31impl Document {
32    /// Locate all the tokens that intersect a provided span.
33    ///
34    /// Desperately needs optimization.
35    pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
36        self.tokens()
37            .enumerate()
38            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
39            .collect()
40    }
41
42    /// Locate all the tokens that intersect a provided span and convert them to [`FatToken`]s.
43    ///
44    /// Desperately needs optimization.
45    pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
46        let indices = self.token_indices_intersecting(span);
47
48        indices
49            .into_iter()
50            .map(|i| self.tokens[i].to_fat(&self.source))
51            .collect()
52    }
53
    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        // Work on a character buffer; all spans in the crate index characters.
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }
61
62    /// Lexes and parses text to produce a document using a provided language
63    /// parser and the included curated dictionary.
64    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
65        let source: Vec<_> = text.chars().collect();
66
67        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
68    }
69
    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
    ///
    /// Accepts an already-collected character buffer, saving a re-collection
    /// when the caller has one on hand.
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        // Run fix-ups and attach dictionary/tagger metadata to word tokens.
        document.parse(dictionary);

        document
    }
84
    /// Parse text to produce a document using the built-in [`PlainEnglish`]
    /// parser and curated dictionary.
    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

    /// Create a new document simply by tokenizing the provided input and applying fix-ups. The
    /// contained words will not contain any metadata.
    ///
    /// This avoids running potentially expensive metadata generation code, so this is more
    /// efficient if you don't need that information.
    pub(crate) fn new_basic_tokenize(text: &str, parser: &impl Parser) -> Self {
        let source = Lrc::new(text.chars().collect_vec());
        let tokens = parser.parse(&source);
        let mut document = Self { source, tokens };
        // Only fix-ups run here — no tagging, chunking, or dictionary lookups.
        document.apply_fixups();
        document
    }

    /// Parse text to produce a document using the built-in [`PlainEnglish`]
    /// parser and a provided dictionary.
    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }
109
    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and the curated dictionary.
    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and the curated dictionary, with the default Markdown configuration.
    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }
125
    /// Parse text to produce a document using the built-in [`Markdown`]
    /// parser and a provided dictionary.
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }
135
    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and a provided dictionary, with the default Markdown configuration.
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }
141
    /// Apply all token-level fix-ups: condensing multi-token constructs into
    /// single tokens, converting newline runs to paragraph breaks, and
    /// pairing quotation marks.
    ///
    /// NOTE(review): ordering appears significant — e.g. newlines are
    /// condensed before being converted to paragraph breaks — so keep these
    /// calls in sequence.
    fn apply_fixups(&mut self) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.condense_common_top_level_domains();
        self.condense_filename_extensions();
        self.condense_tldr();
        self.condense_ampersand_pairs();
        self.condense_slash_pairs();
        self.match_quotes();
    }
157
    /// Re-parse important language constructs.
    ///
    /// Should be run after every change to the underlying [`Self::source`].
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.apply_fixups();

        let chunker = burn_chunker();
        let tagger = brill_tagger();

        for sent in self.tokens.iter_sentences_mut() {
            // Collect the text of each non-whitespace token; the tagger and
            // chunker operate positionally on these strings.
            let token_strings: Vec<_> = sent
                .iter()
                .filter(|t| !t.kind.is_whitespace())
                .map(|t| t.span.get_content_string(&self.source))
                .collect();

            let token_tags = tagger.tag_sentence(&token_strings);
            let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);

            // Index into `token_tags`/`np_flags`; advanced for every
            // non-whitespace token so the sequences stay aligned.
            let mut i = 0;

            // Annotate DictWord metadata
            for token in sent.iter_mut() {
                if let TokenKind::Word(meta) = &mut token.kind {
                    let word_source = token.span.get_content(&self.source);
                    let mut found_meta = dictionary
                        .get_word_metadata(word_source)
                        .map(|c| c.into_owned());

                    if let Some(inner) = &mut found_meta {
                        // Prefer the tagger's POS tag; fall back to inference.
                        inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
                        inner.np_member = Some(np_flags[i]);
                    }

                    *meta = found_meta;
                    i += 1;
                } else if !token.kind.is_whitespace() {
                    // Non-word, non-whitespace tokens still occupy a slot in
                    // the tag/chunk arrays; keep the index in step.
                    i += 1;
                }
            }
        }
    }
200
201    /// Convert all sets of newlines greater than 2 to paragraph breaks.
202    fn newlines_to_breaks(&mut self) {
203        for token in &mut self.tokens {
204            if let TokenKind::Newline(n) = token.kind
205                && n >= 2
206            {
207                token.kind = TokenKind::ParagraphBreak;
208            }
209        }
210    }
211
    /// Given a list of indices, this function removes the subsequent
    /// `stretch_len - 1` elements after each index.
    ///
    /// Will extend token spans to include removed elements.
    /// Assumes condensed tokens are contiguous in source text.
    ///
    /// NOTE(review): assumes `indices` is sorted ascending and that each
    /// `idx + stretch_len - 1` is in bounds — confirm at call sites.
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // Update spans: each start token grows to cover its whole stretch.
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Trim: rebuild the token list, skipping the condensed tails.
        let old = self.tokens.clone();
        self.tokens.clear();

        // Keep first chunk. (When `indices` is empty, `indices.len()` is 0,
        // so this keeps nothing here and the final extend copies everything.)
        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        // For each condensed index, keep its (already widened) start token,
        // then copy the untouched run up to the next condensed index.
        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        // Keep last chunk.
        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }
253
    /// Find the token whose span contains the given character index, if any.
    ///
    /// Relies on `tokens` being sorted by span position; the comparator
    /// reports `Equal` for any token overlapping the one-character probe
    /// span, so the binary search lands on a containing token.
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }
268
    /// Defensively attempt to grab a specific token.
    ///
    /// Returns `None` rather than panicking when `index` is out of bounds.
    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }
273
274    /// Get a token at a signed offset from a base index, or None if out of bounds.
275    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
276        match base.checked_add_signed(offset) {
277            None => None,
278            Some(idx) => self.get_token(idx),
279        }
280    }
281
    /// Get an iterator over all the tokens contained in the document.
    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }
286
    /// Iterate over maximal runs of tokens flagged as noun-phrase members,
    /// with surrounding whitespace trimmed off each run.
    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
        // Whether the chunker marked this word as part of a noun phrase.
        fn is_np_member(t: &Token) -> bool {
            t.kind
                .as_word()
                .and_then(|x| x.as_ref())
                .and_then(|w| w.np_member)
                .unwrap_or(false)
        }

        // Strip leading and trailing whitespace tokens from a slice.
        fn trim(slice: &[Token]) -> &[Token] {
            let mut start = 0;
            let mut end = slice.len();
            while start < end && slice[start].kind.is_whitespace() {
                start += 1;
            }
            while end > start && slice[end - 1].kind.is_whitespace() {
                end -= 1;
            }
            &slice[start..end]
        }

        // Split on tokens that are neither NP members nor whitespace, then
        // drop fragments containing no actual NP member (e.g. pure whitespace).
        self.tokens
            .as_slice()
            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
            .filter_map(|s| {
                let s = trim(s);
                if s.iter().any(is_np_member) {
                    Some(s)
                } else {
                    None
                }
            })
    }
320
    /// Get an iterator over all the tokens in the document, converted to
    /// owned [`FatToken`]s carrying their source content.
    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }
325
326    /// Get the next or previous word token relative to a base index, if separated by whitespace.
327    /// Returns None if the next/previous token is not a word or does not exist.
328    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
329        // Look for whitespace at the expected offset
330        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
331            return None;
332        }
333        // Now look beyond the whitespace for a word token
334        let word_token = self.get_token_offset(base, offset + offset.signum());
335        let word_token = word_token?;
336        word_token.kind.is_word().then_some(word_token)
337    }
338
    /// Get an iterator over all the tokens in the document, converted to
    /// [`FatStringToken`]s (content as owned strings).
    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
        self.fat_tokens().map(|t| t.into())
    }
343
    /// Borrow the characters covered by `span`.
    pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
        span.get_content(&self.source)
    }

    /// Copy the characters covered by `span` into an owned `String`.
    pub fn get_span_content_str(&self, span: &Span<char>) -> String {
        String::from_iter(self.get_span_content(span))
    }

    /// Reconstruct the entire source text as an owned `String`.
    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(&Span::new(0, self.source.len()))
    }

    /// Borrow the entire source text as characters.
    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    /// Borrow the entire source text as characters.
    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    /// Borrow the document's token list.
    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }
367
    /// Searches for quotation marks and fills the
    /// [`Punctuation::Quote::twin_loc`] field. This is on a best-effort
    /// basis.
    ///
    /// Current algorithm is based on https://leancrew.com/all-this/2025/03/a-mac-smart-quote-curiosity
    fn match_quotes(&mut self) {
        // Paragraph boundaries: the list start, every paragraph break, and
        // the list end. Quotes are only paired within a single paragraph.
        let mut pg_indices: Vec<_> = vec![0];
        pg_indices.extend(self.iter_paragraph_break_indices());
        pg_indices.push(self.tokens.len());

        // Avoid allocation in loop
        let mut quote_indices = Vec::new();
        let mut open_quote_indices = Vec::new();

        for (start, end) in pg_indices.into_iter().tuple_windows() {
            // All indices below are relative to this paragraph slice.
            let pg = &mut self.tokens[start..end];

            quote_indices.clear();
            quote_indices.extend(pg.iter_quote_indices());
            open_quote_indices.clear();

            // Find open quotes first.
            for quote in &quote_indices {
                // A quote "opens" if it begins the paragraph, has no word-like
                // token before it, or follows whitespace/opening punctuation.
                let is_open = *quote == 0
                    || pg[0..*quote].iter_word_likes().next().is_none()
                    || pg[quote - 1].kind.is_whitespace()
                    || matches!(
                        pg[quote - 1].kind.as_punctuation(),
                        Some(Punctuation::LessThan)
                            | Some(Punctuation::OpenRound)
                            | Some(Punctuation::OpenSquare)
                            | Some(Punctuation::OpenCurly)
                            | Some(Punctuation::Apostrophe)
                    );

                if is_open {
                    open_quote_indices.push(*quote);
                }
            }

            // Pair open quotes innermost-first (popping from the back) with
            // the first quote that follows each.
            while let Some(open_idx) = open_quote_indices.pop() {
                // `close_idx` is relative to the sub-slice after the open
                // quote, so `close_idx + open_idx + 1` is paragraph-relative.
                let Some(close_idx) = pg[open_idx + 1..].iter_quote_indices().next() else {
                    continue;
                };

                // Skip candidates that were already paired in a prior pass.
                if pg[close_idx + open_idx + 1]
                    .kind
                    .as_quote()
                    .unwrap()
                    .twin_loc
                    .is_some()
                {
                    continue;
                }

                // Adding `start` converts paragraph-relative indices back to
                // document-level token indices.
                pg[open_idx].kind.as_mut_quote().unwrap().twin_loc =
                    Some(close_idx + open_idx + start + 1);
                pg[close_idx + open_idx + 1]
                    .kind
                    .as_mut_quote()
                    .unwrap()
                    .twin_loc = Some(open_idx + start);
            }
        }
    }
433
    /// Searches for number suffixes and condenses them down into single tokens
    /// (e.g. "1" + "st" becomes a single number token with an ordinal suffix).
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            // TODO: Allow spaces between `a` and `b`

            // A number immediately followed by a word that parses as an
            // ordinal suffix gets the suffix folded into the number token.
            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind)
                && let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
            {
                self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                replace_starts.push(idx);
            }
        }

        // Merge each number token with the suffix word that follows it.
        self.condense_indices(&replace_starts, 2);
    }
459
460    /// Searches for multiple sequential space tokens and condenses them down
461    /// into one.
462    fn condense_spaces(&mut self) {
463        let mut cursor = 0;
464        let copy = self.tokens.clone();
465
466        let mut remove_these = VecDeque::new();
467
468        while cursor < self.tokens.len() {
469            // Locate a stretch of one or more newline tokens.
470            let start_tok = &mut self.tokens[cursor];
471
472            if let TokenKind::Space(start_count) = &mut start_tok.kind {
473                loop {
474                    cursor += 1;
475
476                    if cursor >= copy.len() {
477                        break;
478                    }
479
480                    let child_tok = &copy[cursor];
481
482                    // Only condense adjacent spans
483                    if start_tok.span.end != child_tok.span.start {
484                        break;
485                    }
486
487                    if let TokenKind::Space(n) = child_tok.kind {
488                        *start_count += n;
489                        start_tok.span.end = child_tok.span.end;
490                        remove_these.push_back(cursor);
491                        cursor += 1;
492                    } else {
493                        break;
494                    };
495                }
496            }
497
498            cursor += 1;
499        }
500
501        self.tokens.remove_indices(remove_these);
502    }
503
    thread_local! {
        // Per-thread cache of the Latin-abbreviation expression, built once.
        static LATIN_EXPR: Lrc<FirstMatchOf> = Document::uncached_latin_expr();
    }
507
    /// Build the expression matching Latin abbreviations that should be
    /// condensed into single tokens: "etc.", "vs.", and "et al.".
    fn uncached_latin_expr() -> Lrc<FirstMatchOf> {
        Lrc::new(FirstMatchOf::new(vec![
            // "etc." / "vs."
            Box::new(
                SequenceExpr::default()
                    .then(WordSet::new(&["etc", "vs"]))
                    .then_period(),
            ),
            // "et al."
            Box::new(
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }
523
    /// Assumes that the first matched token is the canonical one to be condensed into.
    /// Takes a callback that can be used to retroactively edit the canonical token afterwards.
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        // Materialize all matches up front, since condensing mutates the
        // token list the expression was evaluated against.
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            // Every token after the first in the match gets removed...
            remove_indices.extend(m.start + 1..m.end);
            // ...and the first token's span grows to cover the whole match.
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }
542
    /// Condense Latin abbreviations ("etc.", "vs.", "et al.") into single
    /// tokens, leaving the canonical token's kind untouched.
    fn condense_latin(&mut self) {
        self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
    }
546
547    /// Searches for multiple sequential newline tokens and condenses them down
548    /// into one.
549    fn condense_newlines(&mut self) {
550        let mut cursor = 0;
551        let copy = self.tokens.clone();
552
553        let mut remove_these = VecDeque::new();
554
555        while cursor < self.tokens.len() {
556            // Locate a stretch of one or more newline tokens.
557            let start_tok = &mut self.tokens[cursor];
558
559            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
560                loop {
561                    cursor += 1;
562
563                    if cursor >= copy.len() {
564                        break;
565                    }
566
567                    let child_tok = &copy[cursor];
568                    if let TokenKind::Newline(n) = child_tok.kind {
569                        *start_count += n;
570                        start_tok.span.end = child_tok.span.end;
571                        remove_these.push_back(cursor);
572                        cursor += 1;
573                    } else {
574                        break;
575                    };
576                }
577            }
578
579            cursor += 1;
580        }
581
582        self.tokens.remove_indices(remove_these);
583    }
584
    /// Condenses words like "i.e.", "e.g." and "N.S.A." down to single words
    /// using a state machine.
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        // Examines token pairs (cursor - 1, cursor).
        let mut cursor = 1;

        // Index of the first single-letter word in the current run, if any.
        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            // A "chunk" is a single-character word followed by a period.
            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    // First chunk: its word token becomes the canonical token.
                    initialism_start = Some(cursor - 1);
                } else {
                    // Later chunks: the letter is removed along with its period.
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                // Extra advance so the next pair starts after this period.
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    // Run ended: stretch the canonical token over the run.
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            // NOTE(review): a run reaching the very end of the token list
            // exits here without the span-stretching step in the else branch
            // above — verify trailing initialisms are handled as intended.
            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
632
    /// Condenses likely filename extensions (a period plus a short word,
    /// e.g. ".rs", ".txt") down to single [`TokenKind::Unlintable`] tokens.
    fn condense_filename_extensions(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        // Index of the period token that anchors the current extension, if any.
        let mut ext_start = None;

        loop {
            // left context, dot, extension, right context
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let x = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            // An extension chunk: ".xyz" (word of 1-3 chars) surrounded by
            // whitespace/list edges, or wrapped in round brackets, and with
            // uniform letter case (all lowercase or all uppercase).
            let is_ext_chunk = d.kind.is_period()
                && x.kind.is_word()
                && x.span.len() <= 3
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())))
                && {
                    let ext_chars = x.span.get_content(&self.source);
                    ext_chars.iter().all(|c| c.is_ascii_lowercase())
                        || ext_chars.iter().all(|c| c.is_ascii_uppercase())
                };

            if is_ext_chunk {
                if ext_start.is_none() {
                    // The period becomes the canonical (unlintable) token.
                    ext_start = Some(cursor - 1);
                    self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                // Extra advance so the next pair starts on a fresh token.
                cursor += 1;
            } else {
                if let Some(start) = ext_start {
                    // Run ended: stretch the canonical token over the run.
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                ext_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
694
    /// Condenses common top-level domains (for example: `.blog`, `.com`) down to single tokens.
    ///
    /// The period token absorbs the TLD word and is marked
    /// [`TokenKind::Unlintable`].
    fn condense_common_top_level_domains(&mut self) {
        const COMMON_TOP_LEVEL_DOMAINS: &[&str; 106] = &[
            "ai", "app", "blog", "co", "com", "dev", "edu", "gov", "info", "io", "me", "mil",
            "net", "org", "shop", "tech", "uk", "us", "xyz", "jp", "de", "fr", "br", "it", "ru",
            "es", "pl", "ca", "au", "cn", "in", "nl", "eu", "ch", "id", "at", "kr", "cz", "mx",
            "be", "tv", "se", "tr", "tw", "al", "ua", "ir", "vn", "cl", "sk", "ly", "cc", "to",
            "no", "fi", "pt", "dk", "ar", "hu", "tk", "gr", "il", "news", "ro", "my", "biz", "ie",
            "za", "nz", "sg", "ee", "th", "pe", "bg", "hk", "rs", "lt", "link", "ph", "club", "si",
            "site", "mobi", "by", "cat", "wiki", "la", "ga", "xxx", "cf", "hr", "ng", "jobs",
            "online", "kz", "ug", "gq", "ae", "is", "lv", "pro", "fm", "tips", "ms", "sa", "int",
        ];

        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();
        for cursor in 1..self.tokens.len() {
            // left context, dot, tld, right context
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let tld = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            // A TLD chunk: ".xyz" where the word is alphabetic, matches a
            // known TLD, and is surrounded by whitespace/list edges or
            // wrapped in round brackets.
            let is_tld_chunk = d.kind.is_period()
                && tld.kind.is_word()
                && tld
                    .span
                    .get_content(&self.source)
                    .iter()
                    .all(|c| c.is_ascii_alphabetic())
                && tld
                    .span
                    .get_content(&self.source)
                    .eq_any_ignore_ascii_case_str(COMMON_TOP_LEVEL_DOMAINS)
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())));

            if is_tld_chunk {
                // The period absorbs the TLD and becomes unlintable.
                self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                self.tokens[cursor - 1].span.end = self.tokens[cursor].span.end;
                to_remove.push_back(cursor);
            }
        }

        self.tokens.remove_indices(to_remove);
    }
745
746    /// Condenses "tl;dr" down to a single word token.
747    fn condense_tldr(&mut self) {
748        if self.tokens.len() < 3 {
749            return;
750        }
751
752        let mut to_remove = VecDeque::new();
753        let mut cursor = 2;
754
755        loop {
756            let tl = &self.tokens[cursor - 2];
757            let simicolon = &self.tokens[cursor - 1];
758            let dr = &self.tokens[cursor];
759
760            let is_tldr_chunk = tl.kind.is_word()
761                && tl.span.len() == 2
762                && tl
763                    .span
764                    .get_content(&self.source)
765                    .eq_ignore_ascii_case_chars(&['t', 'l'])
766                && simicolon.kind.is_semicolon()
767                && dr.kind.is_word()
768                && dr.span.len() >= 2
769                && dr.span.len() <= 3
770                && dr
771                    .span
772                    .get_content(&self.source)
773                    .eq_any_ignore_ascii_case_chars(&[&['d', 'r'], &['d', 'r', 's']]);
774
775            if is_tldr_chunk {
776                // Update the first token to be the full "tl;dr" as a word
777                self.tokens[cursor - 2].span = Span::new(
778                    self.tokens[cursor - 2].span.start,
779                    self.tokens[cursor].span.end,
780                );
781
782                // Mark the semicolon and "dr" tokens for removal
783                to_remove.push_back(cursor - 1);
784                to_remove.push_back(cursor);
785            }
786
787            // Skip ahead since we've processed these tokens
788            cursor += 1;
789
790            if cursor >= self.tokens.len() {
791                break;
792            }
793        }
794
795        // Remove the marked tokens in reverse order to maintain correct indices
796        self.tokens.remove_indices(to_remove);
797    }
798
    /// Allows condensing of delimited pairs of tokens into a single token.
    ///
    /// # Arguments
    ///
    /// * `is_delimiter` - A function that returns `true` if the token is a delimiter.
    /// * `valid_pairs` - A slice of tuples representing the valid pairs of tokens to condense.
    ///   Pairs are matched case-insensitively; entries must be lowercase.
    ///
    fn condense_delimited_pairs<F>(&mut self, is_delimiter: F, valid_pairs: &[(char, char)])
    where
        F: Fn(&TokenKind) -> bool,
    {
        if self.tokens.len() < 3 {
            return;
        }

        let mut to_remove = VecDeque::new();
        let mut cursor = 2;

        loop {
            // Three-token window: letter, delimiter, letter.
            let l1 = &self.tokens[cursor - 2];
            let delim = &self.tokens[cursor - 1];
            let l2 = &self.tokens[cursor];

            let is_delimited_chunk = l1.kind.is_word()
                && l1.span.len() == 1
                && is_delimiter(&delim.kind)
                && l2.kind.is_word()
                && l2.span.len() == 1;

            if is_delimited_chunk {
                let (l1, l2) = (
                    l1.span.get_content(&self.source).first(),
                    l2.span.get_content(&self.source).first(),
                );

                // Only whitelisted letter pairs condense.
                let is_valid_pair = match (l1, l2) {
                    (Some(l1), Some(l2)) => {
                        let pair = (l1.to_ascii_lowercase(), l2.to_ascii_lowercase());
                        valid_pairs.contains(&pair)
                    }
                    _ => false,
                };

                if is_valid_pair {
                    // The first letter's span grows over the whole trigram;
                    // the delimiter and second letter are absorbed.
                    self.tokens[cursor - 2].span = Span::new(
                        self.tokens[cursor - 2].span.start,
                        self.tokens[cursor].span.end,
                    );
                    to_remove.push_back(cursor - 1);
                    to_remove.push_back(cursor);
                }
            }

            cursor += 1;
            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
860
861    // Condenses "ampersand pairs" such as "R&D" or "Q&A" into single tokens.
862    fn condense_ampersand_pairs(&mut self) {
863        self.condense_delimited_pairs(
864            |kind| kind.is_ampersand(),
865            &[
866                ('b', 'b'), // bed & breakfast
867                ('b', 'w'), // black & white
868                ('g', 't'), // gin & tonic
869                ('k', 'r'), // Kernighan & Ritchie
870                ('q', 'a'), // question & answer
871                ('r', 'b'), // rhythm & blues
872                ('r', 'd'), // research & development
873                ('r', 'r'), // rest & relaxation
874                ('s', 'p'), // Standard & Poor's
875            ],
876        );
877    }
878
879    // Condenses "slash pairs" such as "I/O" into single tokens.
880    fn condense_slash_pairs(&mut self) {
881        self.condense_delimited_pairs(
882            |kind| kind.is_slash(),
883            &[
884                ('a', 'c'), // aircon; alternating current
885                ('b', 'w'), // black and white
886                ('c', 'o'), // care of
887                ('d', 'c'), // direct current
888                ('d', 'l'), // download
889                ('i', 'o'), // input/output
890                ('j', 'k'), // just kidding
891                ('n', 'a'), // not applicable
892                ('r', 'c'), // radio control
893                ('s', 'n'), // serial number
894                ('y', 'n'), // yes/no
895                ('y', 'o'), // years old
896            ],
897        );
898    }
899
900    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
901        let period = SequenceExpr::default().then_period();
902        Lrc::new(Repeating::new(Box::new(period), 2))
903    }
904
    thread_local! {
        // Cached per thread so the ellipsis expression is built only once
        // per thread rather than on every call to `condense_ellipsis`.
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }
908
909    fn condense_ellipsis(&mut self) {
910        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
911        self.condense_expr(&expr, |tok| {
912            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
913        });
914    }
915}
916
/// Creates functions necessary to implement [`TokenStringExt`] on a document.
///
/// For a token category `$thing`, generates the `first_$thing`,
/// `last_$thing`, `last_$thing_index`, `iter_$thing_indices`, and
/// `iter_$things` accessors, each delegating to the same-named method on the
/// inner token buffer.
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}
943
impl TokenStringExt for Document {
    // Generate the standard first/last/index/iterator accessors for each
    // token category by delegating to the inner token buffer.
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);
    create_fns_on_doc!(heading_start);

    // The remaining methods have no macro-generated counterpart; each
    // delegates directly to the `Vec<Token>` implementation.
    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span<char>> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_headings(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_headings()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }

    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
        self.tokens.iter_sentences_mut()
    }
}
1010
1011impl Display for Document {
1012    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1013        for token in &self.tokens {
1014            write!(f, "{}", self.get_span_content_str(&token.span))?;
1015        }
1016
1017        Ok(())
1018    }
1019}
1020
#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::TokenStringExt;
    use crate::{Span, parsers::MarkdownOptions};

    /// Asserts that `text` lexes to exactly `final_tok_count` tokens under
    /// both the plain-English and Markdown parsers.
    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn simple_contraction4() {
        assert_condensed_contractions("doesn't", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    /// Asserts the token count of `source`, dumping the token kinds so
    /// failures are easy to diagnose.
    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn condenses_filename_extensions() {
        let doc = Document::new_plain_english_curated(".c and .exe and .js");
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_filename_extension_ok_at_start_and_end() {
        let doc = Document::new_plain_english_curated(".c and .EXE");
        assert_eq!(doc.tokens.len(), 5);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_mixed_case() {
        let doc = Document::new_plain_english_curated(".c and .Exe");
        assert_eq!(doc.tokens.len(), 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_non_letters() {
        let doc = Document::new_plain_english_curated(".COM and .C0M");
        assert_eq!(doc.tokens.len(), 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_longer_than_three() {
        let doc = Document::new_plain_english_curated(".dll and .dlls");
        assert_eq!(doc.tokens.len(), 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn condense_filename_extension_in_parens() {
        let doc = Document::new_plain_english_curated(
            "true for the manual installation when trying to run the executable(.exe) after a manual download",
        );
        assert!(doc.tokens.len() > 23);
        assert!(doc.tokens[21].kind.is_open_round());
        assert!(doc.tokens[22].kind.is_unlintable());
        assert!(doc.tokens[23].kind.is_close_round());
    }

    #[test]
    fn condense_tldr_uppercase() {
        let doc = Document::new_plain_english_curated("TL;DR");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
        assert_eq!(doc.tokens[0].span.len(), 5);
    }

    #[test]
    fn condense_tldr_lowercase() {
        let doc = Document::new_plain_english_curated("tl;dr");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_1() {
        let doc = Document::new_plain_english_curated("tl;DR");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_2() {
        let doc = Document::new_plain_english_curated("TL;Dr");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_plural() {
        let doc = Document::new_plain_english_curated(
            "managing the flow between components to produce relevant TL;DRs of current news articles",
        );
        // no token is a punctuation token - only words with whitespace between
        assert!(
            doc.tokens
                .iter()
                .all(|t| t.kind.is_word() || t.kind.is_whitespace())
        );
        // one of the word tokens contains a ';' character
        let tldrs = doc
            .tokens
            .iter()
            .filter(|t| t.span.get_content(&doc.source).contains(&';'))
            .collect_vec();
        assert_eq!(tldrs.len(), 1);
        assert_eq!(tldrs[0].span.get_content_string(&doc.source), "TL;DRs");
    }

    #[test]
    fn condense_common_top_level_domains() {
        let doc = Document::new_plain_english_curated(".blog and .com and .NET");
        assert_eq!(doc.tokens.len(), 9);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_common_top_level_domains_in_parens() {
        let doc = Document::new_plain_english_curated("(.blog)");
        assert_eq!(doc.tokens.len(), 3);
        assert!(doc.tokens[0].kind.is_open_round());
        assert!(doc.tokens[1].kind.is_unlintable());
        assert!(doc.tokens[2].kind.is_close_round());
    }

    #[test]
    fn doesnt_condense_unknown_top_level_domains() {
        let doc = Document::new_plain_english_curated(".harper");
        assert_eq!(doc.tokens.len(), 2);
        assert!(doc.tokens[0].kind.is_punctuation());
        assert!(doc.tokens[1].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_caps() {
        let doc = Document::new_plain_english_curated("R&D");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_mixed_case() {
        let doc = Document::new_plain_english_curated("R&d");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_lowercase() {
        let doc = Document::new_plain_english_curated("r&d");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn dont_condense_r_and_d_with_spaces() {
        let doc = Document::new_plain_english_curated("R & D");
        assert_eq!(doc.tokens.len(), 5);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[1].kind.is_whitespace());
        assert!(doc.tokens[2].kind.is_ampersand());
        assert!(doc.tokens[3].kind.is_whitespace());
        assert!(doc.tokens[4].kind.is_word());
    }

    #[test]
    fn condense_q_and_a() {
        let doc =
            Document::new_plain_english_curated("A Q&A platform software for teams at any scales.");
        assert!(doc.tokens.len() >= 3);
        assert!(doc.tokens[2].kind.is_word());
        assert_eq!(doc.tokens[2].span.get_content_string(&doc.source), "Q&A");
    }

    #[test]
    fn dont_allow_mixed_r_and_d_with_q_and_a() {
        let doc = Document::new_plain_english_curated("R&A or Q&D");
        assert_eq!(doc.tokens.len(), 9);
        assert!(doc.tokens[1].kind.is_ampersand() || doc.tokens[7].kind.is_ampersand());
    }

    #[test]
    fn condense_io() {
        let doc = Document::new_plain_english_curated("I/O");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn finds_unmatched_quotes_in_document() {
        let raw = r#"
This is a paragraph with a single word "quoted."

This is a second paragraph with no quotes.

This is a third paragraph with a single erroneous "quote.

This is a final paragraph with a weird "quote and a not-weird "quote".
            "#;

        let doc = Document::new_markdown_default_curated(raw);

        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        assert_eq!(
            quote_twins,
            vec![Some(19), Some(16), None, None, Some(89), Some(87)]
        )
    }

    #[test]
    fn issue_1901() {
        let raw = r#"
"A quoted line"
"A quote without a closing mark
"Another quoted lined"
"The last quoted line"
            "#;

        let doc = Document::new_markdown_default_curated(raw);

        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        assert_eq!(
            quote_twins,
            vec![
                Some(6),
                Some(0),
                None,
                Some(27),
                Some(21),
                Some(37),
                Some(29)
            ]
        )
    }
}