Skip to main content

harper_core/
document.rs

1use std::cmp::Ordering;
2use std::collections::VecDeque;
3use std::fmt::Display;
4
5use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
6use itertools::Itertools;
7use paste::paste;
8
9use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
10use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
11use crate::punctuation::Punctuation;
12use crate::spell::{Dictionary, FstDictionary};
13use crate::vec_ext::VecExt;
14use crate::{CharStringExt, FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
15use crate::{OrdinalSuffix, Span};
16
/// A document containing some amount of lexed and parsed English text.
#[derive(Debug, Clone)]
pub struct Document {
    /// The raw text as characters, so token spans can index by char position.
    source: Lrc<Vec<char>>,
    /// The parsed tokens; each token's span indexes into `source`.
    tokens: Vec<Token>,
}
23
24impl Default for Document {
25    fn default() -> Self {
26        Self::new("", &PlainEnglish, &FstDictionary::curated())
27    }
28}
29
30impl Document {
31    /// Locate all the tokens that intersect a provided span.
32    ///
33    /// Desperately needs optimization.
34    pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
35        self.tokens()
36            .enumerate()
37            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
38            .collect()
39    }
40
41    /// Locate all the tokens that intersect a provided span and convert them to [`FatToken`]s.
42    ///
43    /// Desperately needs optimization.
44    pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
45        let indices = self.token_indices_intersecting(span);
46
47        indices
48            .into_iter()
49            .map(|i| self.tokens[i].to_fat(&self.source))
50            .collect()
51    }
52
53    /// Lexes and parses text to produce a document using a provided language
54    /// parser and dictionary.
55    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
56        let source: Vec<_> = text.chars().collect();
57
58        Self::new_from_vec(Lrc::new(source), parser, dictionary)
59    }
60
61    /// Lexes and parses text to produce a document using a provided language
62    /// parser and the included curated dictionary.
63    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
64        let source: Vec<_> = text.chars().collect();
65
66        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
67    }
68
69    /// Lexes and parses text to produce a document using a provided language
70    /// parser and dictionary.
71    pub fn new_from_vec(
72        source: Lrc<Vec<char>>,
73        parser: &impl Parser,
74        dictionary: &impl Dictionary,
75    ) -> Self {
76        let tokens = parser.parse(&source);
77
78        let mut document = Self { source, tokens };
79        document.parse(dictionary);
80
81        document
82    }
83
84    /// Parse text to produce a document using the built-in [`PlainEnglish`]
85    /// parser and curated dictionary.
86    pub fn new_plain_english_curated(text: &str) -> Self {
87        Self::new(text, &PlainEnglish, &FstDictionary::curated())
88    }
89
90    /// Create a new document simply by tokenizing the provided input and applying fix-ups. The
91    /// contained words will not contain any metadata.
92    ///
93    /// This avoids running potentially expensive metadata generation code, so this is more
94    /// efficient if you don't need that information.
95    pub(crate) fn new_basic_tokenize(text: &str, parser: &impl Parser) -> Self {
96        let source = Lrc::new(text.chars().collect_vec());
97        let tokens = parser.parse(&source);
98        let mut document = Self { source, tokens };
99        document.apply_fixups();
100        document
101    }
102
103    /// Parse text to produce a document using the built-in [`PlainEnglish`]
104    /// parser and a provided dictionary.
105    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
106        Self::new(text, &PlainEnglish, dictionary)
107    }
108
109    /// Parse text to produce a document using the built-in [`Markdown`] parser
110    /// and curated dictionary.
111    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
112        Self::new(
113            text,
114            &Markdown::new(markdown_options),
115            &FstDictionary::curated(),
116        )
117    }
118
119    /// Parse text to produce a document using the built-in [`Markdown`] parser
120    /// and curated dictionary with the default Markdown configuration.
121    pub fn new_markdown_default_curated(text: &str) -> Self {
122        Self::new_markdown_curated(text, MarkdownOptions::default())
123    }
124
    /// Parse text to produce a document using the built-in [`Markdown`]
    /// parser and a provided dictionary.
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }
134
    /// Parse text to produce a document using the built-in [`Markdown`]
    /// parser and a provided dictionary, with the default Markdown configuration.
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }
140
    /// Run the chain of token fix-up passes: condensing multi-token
    /// constructions (runs of spaces/newlines, dotted initialisms, ordinal
    /// suffixes, ellipses, Latin abbreviations, TLDs, filename extensions,
    /// "tl;dr", "R&D"/"I/O"-style pairs) and pairing quotation marks.
    ///
    /// NOTE(review): the order of these passes appears significant (e.g.
    /// whitespace is condensed before passes that inspect token adjacency) —
    /// preserve it when adding new passes.
    fn apply_fixups(&mut self) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.condense_common_top_level_domains();
        self.condense_filename_extensions();
        self.condense_tldr();
        self.condense_ampersand_pairs();
        self.condense_slash_pairs();
        self.match_quotes();
    }
156
    /// Re-parse important language constructs.
    ///
    /// Should be run after every change to the underlying [`Self::source`].
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.apply_fixups();

        // Part-of-speech tagger and noun-phrase chunker; both operate one
        // sentence at a time.
        let chunker = burn_chunker();
        let tagger = brill_tagger();

        for sent in self.tokens.iter_sentences_mut() {
            // The tagger/chunker only see non-whitespace tokens, so their
            // outputs are indexed by position in this filtered list.
            let token_strings: Vec<_> = sent
                .iter()
                .filter(|t| !t.kind.is_whitespace())
                .map(|t| t.span.get_content_string(&self.source))
                .collect();

            let token_tags = tagger.tag_sentence(&token_strings);
            let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);

            // Position in `token_tags`/`np_flags`; bumped for every
            // non-whitespace token to stay aligned with `token_strings`.
            let mut i = 0;

            // Annotate DictWord metadata
            for token in sent.iter_mut() {
                if let TokenKind::Word(meta) = &mut token.kind {
                    let word_source = token.span.get_content(&self.source);
                    let mut found_meta = dictionary
                        .get_word_metadata(word_source)
                        .map(|c| c.into_owned());

                    if let Some(inner) = &mut found_meta {
                        // Prefer the tagger's POS tag; fall back to inference
                        // from the dictionary metadata itself.
                        inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
                        inner.np_member = Some(np_flags[i]);
                    }

                    *meta = found_meta;
                    i += 1;
                } else if !token.kind.is_whitespace() {
                    // Non-word, non-whitespace tokens (punctuation, numbers…)
                    // still occupy a slot in the tagger/chunker output.
                    i += 1;
                }
            }
        }
    }
199
200    /// Convert all sets of newlines greater than 2 to paragraph breaks.
201    fn newlines_to_breaks(&mut self) {
202        for token in &mut self.tokens {
203            if let TokenKind::Newline(n) = token.kind
204                && n >= 2
205            {
206                token.kind = TokenKind::ParagraphBreak;
207            }
208        }
209    }
210
    /// Given a list of indices, this function removes the subsequent
    /// `stretch_len - 1` elements after each index.
    ///
    /// Will extend token spans to include removed elements.
    /// Assumes condensed tokens are contiguous in source text.
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // Update spans: each kept head token grows to cover its whole stretch.
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Trim: rebuild `self.tokens`, skipping the condensed-away elements.
        let old = self.tokens.clone();
        self.tokens.clear();

        // Keep first chunk.
        // (When `indices` is empty this keeps the empty range `0..0`.)
        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        // For each stretch: keep its head token, then copy the untouched
        // tokens between the end of this stretch and the start of the next.
        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        // Keep last chunk (everything after the final stretch; the whole
        // token list when `indices` is empty).
        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }
252
    /// Find the token whose span contains the provided char index, if any.
    ///
    /// NOTE(review): the comparator returns `Equal` on span overlap and
    /// otherwise orders by span start — this is only a valid
    /// `binary_search_by` comparator if token spans are sorted and
    /// non-overlapping; confirm that invariant holds for every parser.
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }
267
268    /// Defensively attempt to grab a specific token.
269    pub fn get_token(&self, index: usize) -> Option<&Token> {
270        self.tokens.get(index)
271    }
272
273    /// Get a token at a signed offset from a base index, or None if out of bounds.
274    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
275        match base.checked_add_signed(offset) {
276            None => None,
277            Some(idx) => self.get_token(idx),
278        }
279    }
280
281    /// Get an iterator over all the tokens contained in the document.
282    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
283        self.tokens.iter()
284    }
285
    /// Iterate over the contiguous runs of tokens that form nominal (noun)
    /// phrases, as flagged by the chunker during [`Self::parse`].
    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
        // A word token the chunker marked as part of a nominal phrase.
        fn is_np_member(t: &Token) -> bool {
            t.kind
                .as_word()
                .and_then(|x| x.as_ref())
                .and_then(|w| w.np_member)
                .unwrap_or(false)
        }

        // Strip leading and trailing whitespace tokens from a slice.
        fn trim(slice: &[Token]) -> &[Token] {
            let mut start = 0;
            let mut end = slice.len();
            while start < end && slice[start].kind.is_whitespace() {
                start += 1;
            }
            while end > start && slice[end - 1].kind.is_whitespace() {
                end -= 1;
            }
            &slice[start..end]
        }

        // Split on any token that is neither an NP member nor whitespace,
        // then keep only trimmed runs containing at least one member.
        self.tokens
            .as_slice()
            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
            .filter_map(|s| {
                let s = trim(s);
                if s.iter().any(is_np_member) {
                    Some(s)
                } else {
                    None
                }
            })
    }
319
320    /// Get an iterator over all the tokens contained in the document.
321    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
322        self.tokens().map(|token| token.to_fat(&self.source))
323    }
324
325    /// Get the next or previous word token relative to a base index, if separated by whitespace.
326    /// Returns None if the next/previous token is not a word or does not exist.
327    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
328        // Look for whitespace at the expected offset
329        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
330            return None;
331        }
332        // Now look beyond the whitespace for a word token
333        let word_token = self.get_token_offset(base, offset + offset.signum());
334        let word_token = word_token?;
335        word_token.kind.is_word().then_some(word_token)
336    }
337
338    /// Get an iterator over all the tokens contained in the document.
339    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
340        self.fat_tokens().map(|t| t.into())
341    }
342
343    pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
344        span.get_content(&self.source)
345    }
346
347    pub fn get_span_content_str(&self, span: &Span<char>) -> String {
348        String::from_iter(self.get_span_content(span))
349    }
350
351    pub fn get_full_string(&self) -> String {
352        self.get_span_content_str(&Span::new(0, self.source.len()))
353    }
354
355    pub fn get_full_content(&self) -> &[char] {
356        &self.source
357    }
358
359    pub fn get_source(&self) -> &[char] {
360        &self.source
361    }
362
363    pub fn get_tokens(&self) -> &[Token] {
364        &self.tokens
365    }
366
    /// Searches for quotation marks and fills the
    /// [`Punctuation::Quote::twin_loc`] field. This is on a best-effort
    /// basis.
    ///
    /// Current algorithm is based on https://leancrew.com/all-this/2025/03/a-mac-smart-quote-curiosity
    fn match_quotes(&mut self) {
        // Quotes are only paired within a single paragraph; collect the
        // paragraph boundaries, including both ends of the document.
        let mut pg_indices: Vec<_> = vec![0];
        pg_indices.extend(self.iter_paragraph_break_indices());
        pg_indices.push(self.tokens.len());

        // Avoid allocation in loop
        let mut quote_indices = Vec::new();
        let mut open_quote_indices = Vec::new();

        for (start, end) in pg_indices.into_iter().tuple_windows() {
            // All indices below are relative to this paragraph slice.
            let pg = &mut self.tokens[start..end];

            quote_indices.clear();
            quote_indices.extend(pg.iter_quote_indices());
            open_quote_indices.clear();

            // Find open quotes first. A quote counts as "open" if it starts
            // the paragraph, no word precedes it, or it directly follows
            // whitespace or opening punctuation.
            for quote in &quote_indices {
                let is_open = *quote == 0
                    || pg[0..*quote].iter_word_likes().next().is_none()
                    || pg[quote - 1].kind.is_whitespace()
                    || matches!(
                        pg[quote - 1].kind.as_punctuation(),
                        Some(Punctuation::LessThan)
                            | Some(Punctuation::OpenRound)
                            | Some(Punctuation::OpenSquare)
                            | Some(Punctuation::OpenCurly)
                            | Some(Punctuation::Apostrophe)
                    );

                if is_open {
                    open_quote_indices.push(*quote);
                }
            }

            // Pair open quotes innermost-first (LIFO) with the next quote
            // after them that isn't already paired.
            while let Some(open_idx) = open_quote_indices.pop() {
                // `close_idx` is relative to the subslice starting at
                // `open_idx + 1`, hence the offset arithmetic below.
                let Some(close_idx) = pg[open_idx + 1..].iter_quote_indices().next() else {
                    continue;
                };

                if pg[close_idx + open_idx + 1]
                    .kind
                    .as_quote()
                    .unwrap()
                    .twin_loc
                    .is_some()
                {
                    continue;
                }

                // Record document-relative (not paragraph-relative) twin
                // locations on both halves of the pair.
                pg[open_idx].kind.as_mut_quote().unwrap().twin_loc =
                    Some(close_idx + open_idx + start + 1);
                pg[close_idx + open_idx + 1]
                    .kind
                    .as_mut_quote()
                    .unwrap()
                    .twin_loc = Some(open_idx + start);
            }
        }
    }
432
    /// Searches for number suffixes and condenses them down into single tokens
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            // TODO: Allow spaces between `a` and `b`

            // A number token directly followed by a word that spells an
            // ordinal suffix is folded into a single number token.
            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind)
                && let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
            {
                self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                replace_starts.push(idx);
            }
        }

        // Merge each suffix token into its preceding number token.
        self.condense_indices(&replace_starts, 2);
    }
458
    /// Searches for multiple sequential space tokens and condenses them down
    /// into one.
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        // Immutable snapshot to read from while `self.tokens` is mutably
        // borrowed below.
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more space tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    // Only condense adjacent spans
                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        // Fold the child's count and span into the head token.
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        // NOTE(review): together with the `cursor += 1` at the
                        // top of this loop, the cursor advances by two here, so
                        // the token immediately after each merged space token
                        // is never inspected — confirm this is intentional.
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }
502
    thread_local! {
        // Per-thread cache of the Latin-abbreviation expression, built once
        // via [`Document::uncached_latin_expr`].
        static LATIN_EXPR: Lrc<FirstMatchOf> = Document::uncached_latin_expr();
    }
506
    /// Build the (uncached) expression matching Latin abbreviations.
    /// See [`Self::LATIN_EXPR`] for the cached version.
    fn uncached_latin_expr() -> Lrc<FirstMatchOf> {
        Lrc::new(FirstMatchOf::new(vec![
            // "etc." or "vs."
            Box::new(SequenceExpr::word_set(&["etc", "vs"]).then_period()),
            // "et al."
            Box::new(
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }
518
    /// Assumes that the first matched token is the canonical one to be condensed into.
    /// Takes a callback that can be used to retroactively edit the canonical token afterwards.
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        // Collect all matches up front so `self.tokens` can be mutated below.
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            // Drop every matched token except the first…
            remove_indices.extend(m.start + 1..m.end);
            // …and grow the first token's span to cover the whole match.
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }
537
538    fn condense_latin(&mut self) {
539        self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
540    }
541
    /// Searches for multiple sequential newline tokens and condenses them down
    /// into one.
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        // Immutable snapshot to read from while `self.tokens` is mutably
        // borrowed below.
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more newline tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        // Fold the child's count and span into the head token.
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        // NOTE(review): together with the `cursor += 1` at the
                        // top of this loop, the cursor advances by two here, so
                        // the token immediately after each merged newline token
                        // is never inspected — confirm this is intentional.
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }
579
    /// Condenses words like "i.e.", "e.g." and "N.S.A." down to single words
    /// using a state machine.
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        // Each iteration inspects the pair (tokens[cursor - 1], tokens[cursor]).
        let mut cursor = 1;

        // Index of the head token of the initialism currently being built.
        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            // One "chunk" is a single-character word followed by a period.
            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    // Keep the first letter token; it becomes the whole word.
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                // Skip past the period so the next pair starts on a letter.
                cursor += 1;
            } else {
                // Run ended: stretch the head token's span over the whole
                // initialism that was just consumed.
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            // NOTE(review): this bound stops one pair short of the end, and an
            // initialism that terminates the document is never flushed by the
            // `else` branch above — confirm the trailing-token behavior.
            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
627
    /// Condenses likely filename extensions down to single tokens.
    fn condense_filename_extensions(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        // Each iteration inspects (tokens[cursor - 1], tokens[cursor]) plus
        // one token of context on each side.
        let mut cursor = 1;

        // Index of the dot token that began the current extension run.
        let mut ext_start = None;

        loop {
            // left context, dot, extension, right context
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let x = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            // A period followed by a short (≤ 3 chars) all-lowercase or
            // all-uppercase word, either isolated by whitespace (or the
            // document edge) or wrapped in parentheses.
            let is_ext_chunk = d.kind.is_period()
                && x.kind.is_word()
                && x.span.len() <= 3
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())))
                && {
                    let ext_chars = x.span.get_content(&self.source);
                    ext_chars.iter().all(|c| c.is_ascii_lowercase())
                        || ext_chars.iter().all(|c| c.is_ascii_uppercase())
                };

            if is_ext_chunk {
                if ext_start.is_none() {
                    ext_start = Some(cursor - 1);
                    // The dot token becomes an unlintable stand-in covering
                    // the whole extension.
                    self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                // Skip past the extension word.
                cursor += 1;
            } else {
                // Run ended: stretch the head token's span over the run.
                if let Some(start) = ext_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                ext_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
689
    /// Condenses common top-level domains (for example: `.blog`, `.com`) down to single tokens.
    fn condense_common_top_level_domains(&mut self) {
        const COMMON_TOP_LEVEL_DOMAINS: &[&str; 106] = &[
            "ai", "app", "blog", "co", "com", "dev", "edu", "gov", "info", "io", "me", "mil",
            "net", "org", "shop", "tech", "uk", "us", "xyz", "jp", "de", "fr", "br", "it", "ru",
            "es", "pl", "ca", "au", "cn", "in", "nl", "eu", "ch", "id", "at", "kr", "cz", "mx",
            "be", "tv", "se", "tr", "tw", "al", "ua", "ir", "vn", "cl", "sk", "ly", "cc", "to",
            "no", "fi", "pt", "dk", "ar", "hu", "tk", "gr", "il", "news", "ro", "my", "biz", "ie",
            "za", "nz", "sg", "ee", "th", "pe", "bg", "hk", "rs", "lt", "link", "ph", "club", "si",
            "site", "mobi", "by", "cat", "wiki", "la", "ga", "xxx", "cf", "hr", "ng", "jobs",
            "online", "kz", "ug", "gq", "ae", "is", "lv", "pro", "fm", "tips", "ms", "sa", "int",
        ];

        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();
        for cursor in 1..self.tokens.len() {
            // left context, dot, tld, right context
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let tld = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            // A period followed by an alphabetic word from the TLD list,
            // either isolated by whitespace (or the document edge) or wrapped
            // in parentheses.
            let is_tld_chunk = d.kind.is_period()
                && tld.kind.is_word()
                && tld
                    .span
                    .get_content(&self.source)
                    .iter()
                    .all(|c| c.is_ascii_alphabetic())
                && tld
                    .span
                    .get_content(&self.source)
                    .eq_any_ignore_ascii_case_str(COMMON_TOP_LEVEL_DOMAINS)
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())));

            if is_tld_chunk {
                // The dot token absorbs the TLD and becomes unlintable.
                self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                self.tokens[cursor - 1].span.end = self.tokens[cursor].span.end;
                to_remove.push_back(cursor);
            }
        }

        self.tokens.remove_indices(to_remove);
    }
740
741    /// Condenses "tl;dr" down to a single word token.
742    fn condense_tldr(&mut self) {
743        if self.tokens.len() < 3 {
744            return;
745        }
746
747        let mut to_remove = VecDeque::new();
748        let mut cursor = 2;
749
750        loop {
751            let tl = &self.tokens[cursor - 2];
752            let simicolon = &self.tokens[cursor - 1];
753            let dr = &self.tokens[cursor];
754
755            let is_tldr_chunk = tl.kind.is_word()
756                && tl.span.len() == 2
757                && tl
758                    .span
759                    .get_content(&self.source)
760                    .eq_ignore_ascii_case_chars(&['t', 'l'])
761                && simicolon.kind.is_semicolon()
762                && dr.kind.is_word()
763                && dr.span.len() >= 2
764                && dr.span.len() <= 3
765                && dr
766                    .span
767                    .get_content(&self.source)
768                    .eq_any_ignore_ascii_case_chars(&[&['d', 'r'], &['d', 'r', 's']]);
769
770            if is_tldr_chunk {
771                // Update the first token to be the full "tl;dr" as a word
772                self.tokens[cursor - 2].span = Span::new(
773                    self.tokens[cursor - 2].span.start,
774                    self.tokens[cursor].span.end,
775                );
776
777                // Mark the semicolon and "dr" tokens for removal
778                to_remove.push_back(cursor - 1);
779                to_remove.push_back(cursor);
780            }
781
782            // Skip ahead since we've processed these tokens
783            cursor += 1;
784
785            if cursor >= self.tokens.len() {
786                break;
787            }
788        }
789
790        // Remove the marked tokens in reverse order to maintain correct indices
791        self.tokens.remove_indices(to_remove);
792    }
793
    /// Allows condensing of delimited pairs of tokens into a single token.
    ///
    /// # Arguments
    ///
    /// * `is_delimiter` - A function that returns `true` if the token is a delimiter.
    /// * `valid_pairs` - A slice of tuples representing the valid pairs of tokens to condense.
    ///
    fn condense_delimited_pairs<F>(&mut self, is_delimiter: F, valid_pairs: &[(char, char)])
    where
        F: Fn(&TokenKind) -> bool,
    {
        if self.tokens.len() < 3 {
            return;
        }

        let mut to_remove = VecDeque::new();
        // Each iteration inspects the triple ending at `cursor`.
        let mut cursor = 2;

        loop {
            let l1 = &self.tokens[cursor - 2];
            let delim = &self.tokens[cursor - 1];
            let l2 = &self.tokens[cursor];

            // Structural check: single letter, delimiter, single letter.
            let is_delimited_chunk = l1.kind.is_word()
                && l1.span.len() == 1
                && is_delimiter(&delim.kind)
                && l2.kind.is_word()
                && l2.span.len() == 1;

            if is_delimited_chunk {
                let (l1, l2) = (
                    l1.span.get_content(&self.source).first(),
                    l2.span.get_content(&self.source).first(),
                );

                // Only condense letter pairs from the allow-list
                // (case-insensitive).
                let is_valid_pair = match (l1, l2) {
                    (Some(l1), Some(l2)) => {
                        let pair = (l1.to_ascii_lowercase(), l2.to_ascii_lowercase());
                        valid_pairs.contains(&pair)
                    }
                    _ => false,
                };

                if is_valid_pair {
                    // Grow the first letter's span over the whole pair and
                    // drop the delimiter and second letter.
                    self.tokens[cursor - 2].span = Span::new(
                        self.tokens[cursor - 2].span.start,
                        self.tokens[cursor].span.end,
                    );
                    to_remove.push_back(cursor - 1);
                    to_remove.push_back(cursor);
                }
            }

            cursor += 1;
            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
855
    /// Condenses "ampersand pairs" such as "R&D" or "Q&A" into single tokens.
    fn condense_ampersand_pairs(&mut self) {
        self.condense_delimited_pairs(
            |kind| kind.is_ampersand(),
            &[
                ('b', 'b'), // bed & breakfast
                ('b', 'w'), // black & white
                ('g', 't'), // gin & tonic
                ('k', 'r'), // Kernighan & Ritchie
                ('q', 'a'), // question & answer
                ('r', 'b'), // rhythm & blues
                ('r', 'd'), // research & development
                ('r', 'r'), // rest & relaxation
                ('s', 'p'), // Standard & Poor's
            ],
        );
    }
873
874    // Condenses "slash pairs" such as "I/O" into single tokens.
875    fn condense_slash_pairs(&mut self) {
876        self.condense_delimited_pairs(
877            |kind| kind.is_slash(),
878            &[
879                ('a', 'c'), // aircon; alternating current
880                ('b', 'w'), // black and white
881                ('c', 'o'), // care of
882                ('d', 'c'), // direct current
883                ('d', 'l'), // download
884                ('i', 'o'), // input/output
885                ('j', 'k'), // just kidding
886                ('n', 'a'), // not applicable
887                ('r', 'c'), // radio control
888                ('s', 'n'), // serial number
889                ('y', 'n'), // yes/no
890                ('y', 'o'), // years old
891            ],
892        );
893    }
894
895    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
896        let period = SequenceExpr::default().then_period();
897        Lrc::new(Repeating::new(Box::new(period), 2))
898    }
899
    thread_local! {
        // Per-thread cache of the compiled ellipsis expression, so it is
        // built once per thread instead of on every `condense_ellipsis` call.
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }
903
904    fn condense_ellipsis(&mut self) {
905        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
906        self.condense_expr(&expr, |tok| {
907            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
908        });
909    }
910}
911
/// Creates functions necessary to implement [`TokenStringExt`] on a document.
///
/// For a token category `$thing`, expands to the `first_*`, `last_*`,
/// `last_*_index`, `iter_*_indices`, and `iter_*s` trait methods, each
/// delegating to the equivalently-named method on the inner token buffer.
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}
938
impl TokenStringExt for Document {
    // Generated delegations: each invocation below emits the `first_*`,
    // `last_*`, `last_*_index`, `iter_*_indices`, and `iter_*s` methods for
    // that token category, forwarding to the inner token buffer.
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);
    create_fns_on_doc!(heading_start);

    // The remaining methods don't follow the macro's naming scheme, so they
    // are delegated to the token buffer by hand.
    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span<char>> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_headings(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_headings()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }

    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
        self.tokens.iter_sentences_mut()
    }
}
1005
1006impl Display for Document {
1007    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1008        for token in &self.tokens {
1009            write!(f, "{}", self.get_span_content_str(&token.span))?;
1010        }
1011
1012        Ok(())
1013    }
1014}
1015
#[cfg(test)]
mod tests {
    //! Unit tests covering token condensation (contractions, ellipses,
    //! abbreviations, TLDs, delimited pairs) and positional token lookup.

    use itertools::Itertools;

    use super::Document;
    use crate::TokenStringExt;
    use crate::{Span, parsers::MarkdownOptions};

    /// Asserts that `text` lexes to exactly `final_tok_count` tokens under
    /// both the plain-English and Markdown parsers.
    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn simple_contraction4() {
        assert_condensed_contractions("doesn't", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    /// Asserts that `source` lexes to exactly `count` tokens under the
    /// plain-English parser, dumping the token kinds on the way for easier
    /// debugging of failures.
    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn condenses_filename_extensions() {
        let doc = Document::new_plain_english_curated(".c and .exe and .js");
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_filename_extension_ok_at_start_and_end() {
        let doc = Document::new_plain_english_curated(".c and .EXE");
        assert_eq!(doc.tokens.len(), 5);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_mixed_case() {
        let doc = Document::new_plain_english_curated(".c and .Exe");
        assert_eq!(doc.tokens.len(), 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_non_letters() {
        let doc = Document::new_plain_english_curated(".COM and .C0M");
        assert_eq!(doc.tokens.len(), 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_longer_than_three() {
        let doc = Document::new_plain_english_curated(".dll and .dlls");
        assert_eq!(doc.tokens.len(), 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn condense_filename_extension_in_parens() {
        let doc = Document::new_plain_english_curated(
            "true for the manual installation when trying to run the executable(.exe) after a manual download",
        );
        assert!(doc.tokens.len() > 23);
        assert!(doc.tokens[21].kind.is_open_round());
        assert!(doc.tokens[22].kind.is_unlintable());
        assert!(doc.tokens[23].kind.is_close_round());
    }

    #[test]
    fn condense_tldr_uppercase() {
        let doc = Document::new_plain_english_curated("TL;DR");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
        assert_eq!(doc.tokens[0].span.len(), 5);
    }

    #[test]
    fn condense_tldr_lowercase() {
        let doc = Document::new_plain_english_curated("tl;dr");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_1() {
        let doc = Document::new_plain_english_curated("tl;DR");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_2() {
        let doc = Document::new_plain_english_curated("TL;Dr");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_plural() {
        let doc = Document::new_plain_english_curated(
            "managing the flow between components to produce relevant TL;DRs of current news articles",
        );
        // no token is a punctuation token - only words with whitespace between
        assert!(
            doc.tokens
                .iter()
                .all(|t| t.kind.is_word() || t.kind.is_whitespace())
        );
        // one of the word tokens contains a ';' character
        let tldrs = doc
            .tokens
            .iter()
            .filter(|t| t.span.get_content(&doc.source).contains(&';'))
            .collect_vec();
        assert_eq!(tldrs.len(), 1);
        assert_eq!(tldrs[0].span.get_content_string(&doc.source), "TL;DRs");
    }

    #[test]
    fn condense_common_top_level_domains() {
        let doc = Document::new_plain_english_curated(".blog and .com and .NET");
        assert_eq!(doc.tokens.len(), 9);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_common_top_level_domains_in_parens() {
        let doc = Document::new_plain_english_curated("(.blog)");
        assert_eq!(doc.tokens.len(), 3);
        assert!(doc.tokens[0].kind.is_open_round());
        assert!(doc.tokens[1].kind.is_unlintable());
        assert!(doc.tokens[2].kind.is_close_round());
    }

    #[test]
    fn doesnt_condense_unknown_top_level_domains() {
        let doc = Document::new_plain_english_curated(".harper");
        assert_eq!(doc.tokens.len(), 2);
        assert!(doc.tokens[0].kind.is_punctuation());
        assert!(doc.tokens[1].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_caps() {
        let doc = Document::new_plain_english_curated("R&D");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_mixed_case() {
        let doc = Document::new_plain_english_curated("R&d");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_lowercase() {
        let doc = Document::new_plain_english_curated("r&d");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn dont_condense_r_and_d_with_spaces() {
        let doc = Document::new_plain_english_curated("R & D");
        assert_eq!(doc.tokens.len(), 5);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[1].kind.is_whitespace());
        assert!(doc.tokens[2].kind.is_ampersand());
        assert!(doc.tokens[3].kind.is_whitespace());
        assert!(doc.tokens[4].kind.is_word());
    }

    #[test]
    fn condense_q_and_a() {
        let doc =
            Document::new_plain_english_curated("A Q&A platform software for teams at any scales.");
        assert!(doc.tokens.len() >= 3);
        assert!(doc.tokens[2].kind.is_word());
        assert_eq!(doc.tokens[2].span.get_content_string(&doc.source), "Q&A");
    }

    #[test]
    fn dont_allow_mixed_r_and_d_with_q_and_a() {
        let doc = Document::new_plain_english_curated("R&A or Q&D");
        assert_eq!(doc.tokens.len(), 9);
        assert!(doc.tokens[1].kind.is_ampersand() || doc.tokens[7].kind.is_ampersand());
    }

    #[test]
    fn condense_io() {
        let doc = Document::new_plain_english_curated("I/O");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn finds_unmatched_quotes_in_document() {
        let raw = r#"
This is a paragraph with a single word "quoted."

This is a second paragraph with no quotes.

This is a third paragraph with a single erroneous "quote.

This is a final paragraph with a weird "quote and a not-weird "quote".
            "#;

        let doc = Document::new_markdown_default_curated(raw);

        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        assert_eq!(
            quote_twins,
            vec![Some(19), Some(16), None, None, Some(89), Some(87)]
        )
    }

    #[test]
    fn issue_1901() {
        let raw = r#"
"A quoted line"
"A quote without a closing mark
"Another quoted lined"
"The last quoted line"
            "#;

        let doc = Document::new_markdown_default_curated(raw);

        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        assert_eq!(
            quote_twins,
            vec![
                Some(6),
                Some(0),
                None,
                Some(27),
                Some(21),
                Some(37),
                Some(29)
            ]
        )
    }
}