Skip to main content

harper_core/
document.rs

1use std::cmp::Ordering;
2use std::collections::VecDeque;
3use std::fmt::Display;
4
5use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
6use itertools::Itertools;
7use paste::paste;
8
9use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
10use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
11use crate::punctuation::Punctuation;
12use crate::spell::{Dictionary, FstDictionary};
13use crate::vec_ext::VecExt;
14use crate::{CharStringExt, FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
15use crate::{OrdinalSuffix, Span};
16
17/// A document containing some amount of lexed and parsed English text.
18#[derive(Debug, Clone)]
19pub struct Document {
20    source: Lrc<Vec<char>>,
21    tokens: Vec<Token>,
22}
23
impl Default for Document {
    /// An empty document, parsed as plain English against the curated dictionary.
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}
29
30impl Document {
31    /// Locate all the tokens that intersect a provided span.
32    ///
33    /// Desperately needs optimization.
34    pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
35        self.tokens()
36            .enumerate()
37            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
38            .collect()
39    }
40
41    /// Locate all the tokens that intersect a provided span and convert them to [`FatToken`]s.
42    ///
43    /// Desperately needs optimization.
44    pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
45        let indices = self.token_indices_intersecting(span);
46
47        indices
48            .into_iter()
49            .map(|i| self.tokens[i].to_fat(&self.source))
50            .collect()
51    }
52
53    /// Lexes and parses text to produce a document using a provided language
54    /// parser and dictionary.
55    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
56        let source: Vec<_> = text.chars().collect();
57
58        Self::new_from_vec(Lrc::new(source), parser, dictionary)
59    }
60
61    /// Lexes and parses text to produce a document using a provided language
62    /// parser and the included curated dictionary.
63    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
64        let source: Vec<_> = text.chars().collect();
65
66        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
67    }
68
    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
    ///
    /// This is the core constructor every other `new_*` convenience eventually
    /// funnels into.
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        // Run token fix-ups and attach dictionary/tagger metadata.
        document.parse(dictionary);

        document
    }
83
84    /// Create a new document from character data using the built-in [`PlainEnglish`]
85    /// parser and curated dictionary. This avoids string-to-char conversions.
86    pub fn new_plain_english_curated_chars(source: &[char]) -> Self {
87        Self::new_from_vec(
88            Lrc::new(source.to_vec()),
89            &PlainEnglish,
90            &FstDictionary::curated(),
91        )
92    }
93
94    /// Parse text to produce a document using the built-in [`PlainEnglish`]
95    /// parser and curated dictionary.
96    pub fn new_plain_english_curated(text: &str) -> Self {
97        Self::new(text, &PlainEnglish, &FstDictionary::curated())
98    }
99
100    /// Create a new document simply by tokenizing the provided input and applying fix-ups. The
101    /// contained words will not contain any metadata.
102    ///
103    /// This avoids running potentially expensive metadata generation code, so this is more
104    /// efficient if you don't need that information.
105    pub(crate) fn new_basic_tokenize(text: &str, parser: &impl Parser) -> Self {
106        let source = Lrc::new(text.chars().collect_vec());
107        let tokens = parser.parse(&source);
108        let mut document = Self { source, tokens };
109        document.apply_fixups();
110        document
111    }
112
    /// Parse text to produce a document using the built-in [`PlainEnglish`]
    /// parser and a provided dictionary.
    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }
118
119    /// Parse text to produce a document using the built-in [`Markdown`] parser
120    /// and curated dictionary.
121    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
122        Self::new(
123            text,
124            &Markdown::new(markdown_options),
125            &FstDictionary::curated(),
126        )
127    }
128
129    /// Create a new document from character data using the built-in [`Markdown`] parser
130    /// and curated dictionary. This avoids string-to-char conversions.
131    pub fn new_markdown_default_curated_chars(chars: &[char]) -> Self {
132        Self::new_from_vec(
133            chars.to_vec().into(),
134            &Markdown::default(),
135            &FstDictionary::curated(),
136        )
137    }
138
139    /// Parse text to produce a document using the built-in [`Markdown`] parser
140    /// and curated dictionary with the default Markdown configuration.
141    pub fn new_markdown_default_curated(text: &str) -> Self {
142        Self::new_markdown_curated(text, MarkdownOptions::default())
143    }
144
    /// Parse text to produce a document using the built-in [`Markdown`]
    /// parser and a provided dictionary.
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }
154
    /// Parse text to produce a document using the built-in [`Markdown`]
    /// parser and a provided dictionary, with the default Markdown configuration.
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }
160
    /// Apply every token-stream fix-up: condensing multi-token constructs
    /// (spaces, newlines, initialisms, TLDs, "tl;dr", letter pairs, …) and
    /// pairing quotation marks. Attaches no dictionary metadata.
    fn apply_fixups(&mut self) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_dotted_truncations();
        self.condense_common_top_level_domains();
        self.condense_filename_extensions();
        self.condense_tldr();
        self.condense_ampersand_pairs();
        self.condense_slash_pairs();
        self.match_quotes();
    }
176
    /// Re-parse important language constructs.
    ///
    /// Should be run after every change to the underlying [`Self::source`].
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.apply_fixups();

        let chunker = burn_chunker();
        let tagger = brill_tagger();

        for sent in self.tokens.iter_sentences_mut() {
            // The tagger/chunker operate on the sentence's non-whitespace tokens only.
            let token_strings: Vec<_> = sent
                .iter()
                .filter(|t| !t.kind.is_whitespace())
                .map(|t| t.span.get_content_string(&self.source))
                .collect();

            let token_tags = tagger.tag_sentence(&token_strings);
            let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);

            // Index into `token_tags`/`np_flags`, which skip whitespace tokens.
            let mut i = 0;

            // Annotate DictWord metadata
            for token in sent.iter_mut() {
                if let TokenKind::Word(meta) = &mut token.kind {
                    let word_source = token.span.get_content(&self.source);
                    let mut found_meta = dictionary
                        .get_word_metadata(word_source)
                        .map(|c| c.into_owned());

                    if let Some(inner) = &mut found_meta {
                        // Prefer the tagger's POS tag; fall back to the dictionary's inference.
                        inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
                        inner.np_member = Some(np_flags[i]);
                    }

                    *meta = found_meta;
                    i += 1;
                } else if !token.kind.is_whitespace() {
                    // Non-word, non-whitespace tokens still occupy a slot in the tag arrays.
                    i += 1;
                }
            }
        }
    }
219
    /// Convert all newline tokens counting 2 or more newlines to paragraph breaks.
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind
                && n >= 2
            {
                token.kind = TokenKind::ParagraphBreak;
            }
        }
    }
230
    /// Given a list of indices, this function removes the subsequent
    /// `stretch_len - 1` elements after each index.
    ///
    /// Will extend token spans to include removed elements.
    /// Assumes condensed tokens are contiguous in source text.
    ///
    /// `indices` is assumed to be sorted ascending with non-overlapping groups.
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // Update spans
        for idx in indices {
            // Stretch each group's first token over the whole group.
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Trim
        let old = self.tokens.clone();
        self.tokens.clear();

        // Keep first chunk.
        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        // For each group: keep its (stretched) start token, then everything
        // between the end of this group and the start of the next.
        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        // Keep last chunk.
        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }
272
    /// Find a token whose span contains the given char index, if any.
    ///
    /// NOTE: the comparator is not a total order (any overlapping span compares
    /// `Equal`), which `binary_search_by` tolerates for locating one match;
    /// it relies on `tokens` being sorted by span position.
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }
287
    /// Defensively attempt to grab a specific token.
    ///
    /// Returns `None` when `index` is out of bounds.
    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }
292
293    /// Get a token at a signed offset from a base index, or None if out of bounds.
294    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
295        match base.checked_add_signed(offset) {
296            None => None,
297            Some(idx) => self.get_token(idx),
298        }
299    }
300
    /// Get a borrowing iterator over all the tokens contained in the document.
    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }
305
306    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
307        fn is_np_member(t: &Token) -> bool {
308            t.kind
309                .as_word()
310                .and_then(|x| x.as_ref())
311                .and_then(|w| w.np_member)
312                .unwrap_or(false)
313        }
314
315        fn trim(slice: &[Token]) -> &[Token] {
316            let mut start = 0;
317            let mut end = slice.len();
318            while start < end && slice[start].kind.is_whitespace() {
319                start += 1;
320            }
321            while end > start && slice[end - 1].kind.is_whitespace() {
322                end -= 1;
323            }
324            &slice[start..end]
325        }
326
327        self.tokens
328            .as_slice()
329            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
330            .filter_map(|s| {
331                let s = trim(s);
332                if s.iter().any(is_np_member) {
333                    Some(s)
334                } else {
335                    None
336                }
337            })
338    }
339
340    /// Get an iterator over all the tokens contained in the document.
341    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
342        self.tokens().map(|token| token.to_fat(&self.source))
343    }
344
345    /// Get the next or previous word token relative to a base index, if separated by whitespace.
346    /// Returns None if the next/previous token is not a word or does not exist.
347    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
348        // Look for whitespace at the expected offset
349        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
350            return None;
351        }
352        // Now look beyond the whitespace for a word token
353        let word_token = self.get_token_offset(base, offset + offset.signum());
354        let word_token = word_token?;
355        word_token.kind.is_word().then_some(word_token)
356    }
357
358    /// Get an iterator over all the tokens contained in the document.
359    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
360        self.fat_tokens().map(|t| t.into())
361    }
362
    /// Borrow the characters the given span covers.
    pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
        span.get_content(&self.source)
    }
366
367    pub fn get_span_content_str(&self, span: &Span<char>) -> String {
368        String::from_iter(self.get_span_content(span))
369    }
370
371    pub fn get_full_string(&self) -> String {
372        self.get_span_content_str(&Span::new(0, self.source.len()))
373    }
374
    /// Borrow the document's entire source text.
    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }
378
    /// Borrow the document's source characters.
    pub fn get_source(&self) -> &[char] {
        &self.source
    }
382
    /// Borrow the document's token stream.
    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }
386
    /// Searches for quotation marks and fills the
    /// [`Punctuation::Quote::twin_loc`] field. This is on a best-effort
    /// basis.
    ///
    /// Current algorithm is based on https://leancrew.com/all-this/2025/03/a-mac-smart-quote-curiosity
    fn match_quotes(&mut self) {
        // Quote pairs are matched within a single paragraph only.
        let mut pg_indices: Vec<_> = vec![0];
        pg_indices.extend(self.iter_paragraph_break_indices());
        pg_indices.push(self.tokens.len());

        // Avoid allocation in loop
        let mut quote_indices = Vec::new();
        let mut open_quote_indices = Vec::new();

        for (start, end) in pg_indices.into_iter().tuple_windows() {
            let pg = &mut self.tokens[start..end];

            quote_indices.clear();
            quote_indices.extend(pg.iter_quote_indices());
            open_quote_indices.clear();

            // Find open quotes first.
            for quote in &quote_indices {
                // A quote "opens" when it starts the paragraph, no word-like token
                // precedes it, or it directly follows whitespace or opening punctuation.
                let is_open = *quote == 0
                    || pg[0..*quote].iter_word_likes().next().is_none()
                    || pg[quote - 1].kind.is_whitespace()
                    || matches!(
                        pg[quote - 1].kind.as_punctuation(),
                        Some(Punctuation::LessThan)
                            | Some(Punctuation::OpenRound)
                            | Some(Punctuation::OpenSquare)
                            | Some(Punctuation::OpenCurly)
                            | Some(Punctuation::Apostrophe)
                    );

                if is_open {
                    open_quote_indices.push(*quote);
                }
            }

            // Pair the most recent (innermost) open quote first, popping from the back.
            while let Some(open_idx) = open_quote_indices.pop() {
                // `close_idx` is relative to the sub-slice after the open quote.
                let Some(close_idx) = pg[open_idx + 1..].iter_quote_indices().next() else {
                    continue;
                };

                // Skip closing quotes already claimed by an earlier pairing.
                if pg[close_idx + open_idx + 1]
                    .kind
                    .as_quote()
                    .unwrap()
                    .twin_loc
                    .is_some()
                {
                    continue;
                }

                // Record twin positions in document (not paragraph-relative) coordinates.
                pg[open_idx].kind.as_mut_quote().unwrap().twin_loc =
                    Some(close_idx + open_idx + start + 1);
                pg[close_idx + open_idx + 1]
                    .kind
                    .as_mut_quote()
                    .unwrap()
                    .twin_loc = Some(open_idx + start);
            }
        }
    }
452
    /// Searches for number suffixes and condenses them down into single tokens
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            // TODO: Allow spaces between `a` and `b`

            // A number immediately followed by a word spelling an ordinal suffix
            // (e.g. "1" + "st") gets the suffix folded into the number token.
            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind)
                && let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
            {
                self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                replace_starts.push(idx);
            }
        }

        // Each start token absorbs the single suffix token that follows it.
        self.condense_indices(&replace_starts, 2);
    }
478
479    /// Searches for multiple sequential space tokens and condenses them down
480    /// into one.
481    fn condense_spaces(&mut self) {
482        let mut cursor = 0;
483        let copy = self.tokens.clone();
484
485        let mut remove_these = VecDeque::new();
486
487        while cursor < self.tokens.len() {
488            // Locate a stretch of one or more newline tokens.
489            let start_tok = &mut self.tokens[cursor];
490
491            if let TokenKind::Space(start_count) = &mut start_tok.kind {
492                loop {
493                    cursor += 1;
494
495                    if cursor >= copy.len() {
496                        break;
497                    }
498
499                    let child_tok = &copy[cursor];
500
501                    // Only condense adjacent spans
502                    if start_tok.span.end != child_tok.span.start {
503                        break;
504                    }
505
506                    if let TokenKind::Space(n) = child_tok.kind {
507                        *start_count += n;
508                        start_tok.span.end = child_tok.span.end;
509                        remove_these.push_back(cursor);
510                        cursor += 1;
511                    } else {
512                        break;
513                    };
514                }
515            }
516
517            cursor += 1;
518        }
519
520        self.tokens.remove_indices(remove_these);
521    }
522
    thread_local! {
        // Cached per thread so the expression is only built once per thread.
        static DOTTED_TRUNCATION_EXPR: Lrc<FirstMatchOf> = Document::uncached_dotted_truncation_expr();
    }

    /// Build the expression matching dotted truncations such as "etc." or "et al.".
    fn uncached_dotted_truncation_expr() -> Lrc<FirstMatchOf> {
        Lrc::new(FirstMatchOf::new(vec![
            Box::new(SequenceExpr::word_set(&["esp", "etc", "vs"]).then_period()),
            Box::new(
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }
538
    /// Assumes that the first matched token is the canonical one to be condensed into.
    /// Takes a callback that can be used to retroactively edit the canonical token afterwards.
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            // Drop every matched token except the first...
            remove_indices.extend(m.start + 1..m.end);
            // ...and stretch the first token's span across the whole match.
            // NOTE(review): indexing by `m.into_iter()` presumably yields the
            // matched token slice whose combined `span()` is taken — confirm
            // against the `Span`/`TokenStringExt` definitions.
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }
557
    /// Condenses dotted truncations (e.g. "etc.", "et al.") into single tokens.
    fn condense_dotted_truncations(&mut self) {
        self.condense_expr(&Self::DOTTED_TRUNCATION_EXPR.with(|v| v.clone()), |_| {})
    }
561
562    /// Searches for multiple sequential newline tokens and condenses them down
563    /// into one.
564    fn condense_newlines(&mut self) {
565        let mut cursor = 0;
566        let copy = self.tokens.clone();
567
568        let mut remove_these = VecDeque::new();
569
570        while cursor < self.tokens.len() {
571            // Locate a stretch of one or more newline tokens.
572            let start_tok = &mut self.tokens[cursor];
573
574            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
575                loop {
576                    cursor += 1;
577
578                    if cursor >= copy.len() {
579                        break;
580                    }
581
582                    let child_tok = &copy[cursor];
583                    if let TokenKind::Newline(n) = child_tok.kind {
584                        *start_count += n;
585                        start_tok.span.end = child_tok.span.end;
586                        remove_these.push_back(cursor);
587                        cursor += 1;
588                    } else {
589                        break;
590                    };
591                }
592            }
593
594            cursor += 1;
595        }
596
597        self.tokens.remove_indices(remove_these);
598    }
599
    /// Condenses words like "i.e.", "e.g." and "N.S.A." down to single words
    /// using a state machine.
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        // Index of the first single-letter word of the run currently being condensed.
        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            // One chunk is a single-letter word immediately followed by a period.
            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    // First chunk: its letter token becomes the canonical token.
                    initialism_start = Some(cursor - 1);
                } else {
                    // Later chunks get merged into the canonical token.
                    to_remove.push_back(cursor - 1);
                }

                // The period is always removed.
                to_remove.push_back(cursor);
                // Together with the increment below, skip past the consumed period.
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    // Run ended: stretch the canonical token over the whole initialism.
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
647
    /// Condenses likely filename extensions down to single tokens.
    fn condense_filename_extensions(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        // Index of the dot token of the extension run currently being condensed.
        let mut ext_start = None;

        loop {
            // left context, dot, extension, right context
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let x = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            // A chunk is `.ext` (a word of at most 3 chars, uniformly lower- or
            // uppercase ASCII) bounded by whitespace/document edges on both sides,
            // or wrapped in round brackets.
            let is_ext_chunk = d.kind.is_period()
                && x.kind.is_word()
                && x.span.len() <= 3
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())))
                && {
                    let ext_chars = x.span.get_content(&self.source);
                    ext_chars.iter().all(|c| c.is_ascii_lowercase())
                        || ext_chars.iter().all(|c| c.is_ascii_uppercase())
                };

            if is_ext_chunk {
                if ext_start.is_none() {
                    // The dot becomes the canonical token and is marked unlintable.
                    ext_start = Some(cursor - 1);
                    self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                // Together with the increment below, skip past the consumed extension.
                cursor += 1;
            } else {
                if let Some(start) = ext_start {
                    // Run ended: stretch the canonical token over all condensed chunks.
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                ext_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
709
710    /// Condenses common top-level domains (for example: `.blog`, `.com`) down to single tokens.
711    fn condense_common_top_level_domains(&mut self) {
712        const COMMON_TOP_LEVEL_DOMAINS: &[&str; 106] = &[
713            "ai", "app", "blog", "co", "com", "dev", "edu", "gov", "info", "io", "me", "mil",
714            "net", "org", "shop", "tech", "uk", "us", "xyz", "jp", "de", "fr", "br", "it", "ru",
715            "es", "pl", "ca", "au", "cn", "in", "nl", "eu", "ch", "id", "at", "kr", "cz", "mx",
716            "be", "tv", "se", "tr", "tw", "al", "ua", "ir", "vn", "cl", "sk", "ly", "cc", "to",
717            "no", "fi", "pt", "dk", "ar", "hu", "tk", "gr", "il", "news", "ro", "my", "biz", "ie",
718            "za", "nz", "sg", "ee", "th", "pe", "bg", "hk", "rs", "lt", "link", "ph", "club", "si",
719            "site", "mobi", "by", "cat", "wiki", "la", "ga", "xxx", "cf", "hr", "ng", "jobs",
720            "online", "kz", "ug", "gq", "ae", "is", "lv", "pro", "fm", "tips", "ms", "sa", "int",
721        ];
722
723        if self.tokens.len() < 2 {
724            return;
725        }
726
727        let mut to_remove = VecDeque::new();
728        for cursor in 1..self.tokens.len() {
729            // left context, dot, tld, right context
730            let l = self.get_token_offset(cursor, -2);
731            let d = &self.tokens[cursor - 1];
732            let tld = &self.tokens[cursor];
733            let r = self.get_token_offset(cursor, 1);
734
735            let is_tld_chunk = d.kind.is_period()
736                && tld.kind.is_word()
737                && tld
738                    .span
739                    .get_content(&self.source)
740                    .iter()
741                    .all(|c| c.is_ascii_alphabetic())
742                && tld
743                    .span
744                    .get_content(&self.source)
745                    .eq_any_ignore_ascii_case_str(COMMON_TOP_LEVEL_DOMAINS)
746                && ((l.is_none_or(|t| t.kind.is_whitespace())
747                    && r.is_none_or(|t| t.kind.is_whitespace()))
748                    || (l.is_some_and(|t| t.kind.is_open_round())
749                        && r.is_some_and(|t| t.kind.is_close_round())));
750
751            if is_tld_chunk {
752                self.tokens[cursor - 1].kind = TokenKind::Unlintable;
753                self.tokens[cursor - 1].span.end = self.tokens[cursor].span.end;
754                to_remove.push_back(cursor);
755            }
756        }
757
758        self.tokens.remove_indices(to_remove);
759    }
760
761    /// Condenses "tl;dr" down to a single word token.
762    fn condense_tldr(&mut self) {
763        if self.tokens.len() < 3 {
764            return;
765        }
766
767        let mut to_remove = VecDeque::new();
768        let mut cursor = 2;
769
770        loop {
771            let tl = &self.tokens[cursor - 2];
772            let simicolon = &self.tokens[cursor - 1];
773            let dr = &self.tokens[cursor];
774
775            let is_tldr_chunk = tl.kind.is_word()
776                && tl.span.len() == 2
777                && tl
778                    .span
779                    .get_content(&self.source)
780                    .eq_ignore_ascii_case_chars(&['t', 'l'])
781                && simicolon.kind.is_semicolon()
782                && dr.kind.is_word()
783                && dr.span.len() >= 2
784                && dr.span.len() <= 3
785                && dr
786                    .span
787                    .get_content(&self.source)
788                    .eq_any_ignore_ascii_case_chars(&[&['d', 'r'], &['d', 'r', 's']]);
789
790            if is_tldr_chunk {
791                // Update the first token to be the full "tl;dr" as a word
792                self.tokens[cursor - 2].span = Span::new(
793                    self.tokens[cursor - 2].span.start,
794                    self.tokens[cursor].span.end,
795                );
796
797                // Mark the semicolon and "dr" tokens for removal
798                to_remove.push_back(cursor - 1);
799                to_remove.push_back(cursor);
800            }
801
802            // Skip ahead since we've processed these tokens
803            cursor += 1;
804
805            if cursor >= self.tokens.len() {
806                break;
807            }
808        }
809
810        // Remove the marked tokens in reverse order to maintain correct indices
811        self.tokens.remove_indices(to_remove);
812    }
813
    /// Allows condensing of delimited pairs of tokens into a single token.
    ///
    /// # Arguments
    ///
    /// * `is_delimiter` - A function that returns `true` if the token is a delimiter.
    /// * `valid_pairs` - A slice of tuples representing the valid pairs of tokens to condense.
    ///
    fn condense_delimited_pairs<F>(&mut self, is_delimiter: F, valid_pairs: &[(char, char)])
    where
        F: Fn(&TokenKind) -> bool,
    {
        if self.tokens.len() < 3 {
            return;
        }

        let mut to_remove = VecDeque::new();
        let mut cursor = 2;

        loop {
            // Candidate shape: single-letter word, delimiter, single-letter word.
            let l1 = &self.tokens[cursor - 2];
            let delim = &self.tokens[cursor - 1];
            let l2 = &self.tokens[cursor];

            let is_delimited_chunk = l1.kind.is_word()
                && l1.span.len() == 1
                && is_delimiter(&delim.kind)
                && l2.kind.is_word()
                && l2.span.len() == 1;

            if is_delimited_chunk {
                let (l1, l2) = (
                    l1.span.get_content(&self.source).first(),
                    l2.span.get_content(&self.source).first(),
                );

                // Case-insensitive membership check against the allow-list.
                let is_valid_pair = match (l1, l2) {
                    (Some(l1), Some(l2)) => {
                        let pair = (l1.to_ascii_lowercase(), l2.to_ascii_lowercase());
                        valid_pairs.contains(&pair)
                    }
                    _ => false,
                };

                if is_valid_pair {
                    // Stretch the first letter's span over the whole pair...
                    self.tokens[cursor - 2].span = Span::new(
                        self.tokens[cursor - 2].span.start,
                        self.tokens[cursor].span.end,
                    );
                    // ...and drop the delimiter and the second letter.
                    to_remove.push_back(cursor - 1);
                    to_remove.push_back(cursor);
                }
            }

            cursor += 1;
            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }
875
876    // Condenses "ampersand pairs" such as "R&D" or "Q&A" into single tokens.
877    fn condense_ampersand_pairs(&mut self) {
878        self.condense_delimited_pairs(
879            |kind| kind.is_ampersand(),
880            &[
881                ('b', 'b'), // bed & breakfast
882                ('b', 'w'), // black & white
883                ('g', 't'), // gin & tonic
884                ('k', 'r'), // Kernighan & Ritchie
885                ('q', 'a'), // question & answer
886                ('r', 'b'), // rhythm & blues
887                ('r', 'd'), // research & development
888                ('r', 'r'), // rest & relaxation
889                ('s', 'p'), // Standard & Poor's
890            ],
891        );
892    }
893
894    // Condenses "slash pairs" such as "I/O" into single tokens.
895    fn condense_slash_pairs(&mut self) {
896        self.condense_delimited_pairs(
897            |kind| kind.is_slash(),
898            &[
899                ('a', 'c'), // aircon; alternating current
900                ('b', 'w'), // black and white
901                ('c', 'o'), // care of
902                ('d', 'c'), // direct current
903                ('d', 'l'), // download
904                ('i', 'o'), // input/output
905                ('j', 'k'), // just kidding
906                ('n', 'a'), // not applicable
907                ('r', 'c'), // radio control
908                ('s', 'n'), // serial number
909                ('y', 'n'), // yes/no
910                ('y', 'o'), // years old
911            ],
912        );
913    }
914
915    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
916        let period = SequenceExpr::default().then_period();
917        Lrc::new(Repeating::new(Box::new(period), 2))
918    }
919
    thread_local! {
        // Per-thread cache of the ellipsis expression, built lazily on first use.
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }
923
924    fn condense_ellipsis(&mut self) {
925        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
926        self.condense_expr(&expr, |tok| {
927            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
928        });
929    }
930}
931
932/// Creates functions necessary to implement [`TokenStringExt]` on a document.
/// Creates functions necessary to implement [`TokenStringExt`] on a document.
///
/// Given a token-kind identifier (e.g. `word`), expands to the five standard
/// accessor methods (`first_*`, `last_*`, `last_*_index`, `iter_*_indices`,
/// `iter_*s`), each of which simply forwards to the same-named method on the
/// inner token buffer. Uses [`paste!`] to splice the identifier into the
/// generated method names.
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}
958
959impl TokenStringExt for Document {
960    create_fns_on_doc!(adjective);
961    create_fns_on_doc!(apostrophe);
962    create_fns_on_doc!(at);
963    create_fns_on_doc!(chunk_terminator);
964    create_fns_on_doc!(comma);
965    create_fns_on_doc!(conjunction);
966    create_fns_on_doc!(currency);
967    create_fns_on_doc!(ellipsis);
968    create_fns_on_doc!(hostname);
969    create_fns_on_doc!(likely_homograph);
970    create_fns_on_doc!(noun);
971    create_fns_on_doc!(number);
972    create_fns_on_doc!(paragraph_break);
973    create_fns_on_doc!(pipe);
974    create_fns_on_doc!(preposition);
975    create_fns_on_doc!(punctuation);
976    create_fns_on_doc!(quote);
977    create_fns_on_doc!(sentence_terminator);
978    create_fns_on_doc!(space);
979    create_fns_on_doc!(unlintable);
980    create_fns_on_doc!(verb);
981    create_fns_on_doc!(word);
982    create_fns_on_doc!(word_like);
983    create_fns_on_doc!(heading_start);
984
985    fn first_sentence_word(&self) -> Option<&Token> {
986        self.tokens.first_sentence_word()
987    }
988
989    fn first_non_whitespace(&self) -> Option<&Token> {
990        self.tokens.first_non_whitespace()
991    }
992
993    fn span(&self) -> Option<Span<char>> {
994        self.tokens.span()
995    }
996
997    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
998        self.tokens.iter_linking_verb_indices()
999    }
1000
1001    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
1002        self.tokens.iter_linking_verbs()
1003    }
1004
1005    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
1006        self.tokens.iter_chunks()
1007    }
1008
1009    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
1010        self.tokens.iter_paragraphs()
1011    }
1012
1013    fn iter_headings(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
1014        self.tokens.iter_headings()
1015    }
1016
1017    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
1018        self.tokens.iter_sentences()
1019    }
1020
1021    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
1022        self.tokens.iter_sentences_mut()
1023    }
1024}
1025
1026impl Display for Document {
1027    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1028        for token in &self.tokens {
1029            write!(f, "{}", self.get_span_content_str(&token.span))?;
1030        }
1031
1032        Ok(())
1033    }
1034}
1035
#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::TokenStringExt;
    use crate::{Span, parsers::MarkdownOptions};

    /// Asserts that `text` lexes to exactly `final_tok_count` tokens under both
    /// the plain-English and Markdown parsers (contraction condensing must agree).
    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn simple_contraction4() {
        assert_condensed_contractions("doesn't", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    /// Asserts that `source` lexes to exactly `count` tokens, dumping the token
    /// kinds to aid debugging on failure.
    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn condenses_filename_extensions() {
        let doc = Document::new_plain_english_curated(".c and .exe and .js");
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_filename_extension_ok_at_start_and_end() {
        let doc = Document::new_plain_english_curated(".c and .EXE");
        assert_eq!(doc.tokens.len(), 5);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_mixed_case() {
        let doc = Document::new_plain_english_curated(".c and .Exe");
        assert_eq!(doc.tokens.len(), 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_non_letters() {
        let doc = Document::new_plain_english_curated(".COM and .C0M");
        assert_eq!(doc.tokens.len(), 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_longer_than_three() {
        let doc = Document::new_plain_english_curated(".dll and .dlls");
        assert_eq!(doc.tokens.len(), 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn condense_filename_extension_in_parens() {
        let doc = Document::new_plain_english_curated(
            "true for the manual installation when trying to run the executable(.exe) after a manual download",
        );
        assert!(doc.tokens.len() > 23);
        assert!(doc.tokens[21].kind.is_open_round());
        assert!(doc.tokens[22].kind.is_unlintable());
        assert!(doc.tokens[23].kind.is_close_round());
    }

    #[test]
    fn condense_tldr_uppercase() {
        let doc = Document::new_plain_english_curated("TL;DR");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
        assert_eq!(doc.tokens[0].span.len(), 5);
    }

    #[test]
    fn condense_tldr_lowercase() {
        let doc = Document::new_plain_english_curated("tl;dr");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_1() {
        let doc = Document::new_plain_english_curated("tl;DR");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_2() {
        let doc = Document::new_plain_english_curated("TL;Dr");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_plural() {
        let doc = Document::new_plain_english_curated(
            "managing the flow between components to produce relevant TL;DRs of current news articles",
        );
        // no token is a punctuation token - only words with whitespace between
        assert!(
            doc.tokens
                .iter()
                .all(|t| t.kind.is_word() || t.kind.is_whitespace())
        );
        // one of the word tokens contains a ';' character
        let tldrs = doc
            .tokens
            .iter()
            .filter(|t| t.span.get_content(&doc.source).contains(&';'))
            .collect_vec();
        assert_eq!(tldrs.len(), 1);
        assert_eq!(tldrs[0].span.get_content_string(&doc.source), "TL;DRs");
    }

    #[test]
    fn condense_common_top_level_domains() {
        let doc = Document::new_plain_english_curated(".blog and .com and .NET");
        assert_eq!(doc.tokens.len(), 9);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_common_top_level_domains_in_parens() {
        let doc = Document::new_plain_english_curated("(.blog)");
        assert_eq!(doc.tokens.len(), 3);
        assert!(doc.tokens[0].kind.is_open_round());
        assert!(doc.tokens[1].kind.is_unlintable());
        assert!(doc.tokens[2].kind.is_close_round());
    }

    #[test]
    fn doesnt_condense_unknown_top_level_domains() {
        let doc = Document::new_plain_english_curated(".harper");
        assert_eq!(doc.tokens.len(), 2);
        assert!(doc.tokens[0].kind.is_punctuation());
        assert!(doc.tokens[1].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_caps() {
        let doc = Document::new_plain_english_curated("R&D");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_mixed_case() {
        let doc = Document::new_plain_english_curated("R&d");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_r_and_d_lowercase() {
        let doc = Document::new_plain_english_curated("r&d");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn dont_condense_r_and_d_with_spaces() {
        let doc = Document::new_plain_english_curated("R & D");
        assert_eq!(doc.tokens.len(), 5);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[1].kind.is_whitespace());
        assert!(doc.tokens[2].kind.is_ampersand());
        assert!(doc.tokens[3].kind.is_whitespace());
        assert!(doc.tokens[4].kind.is_word());
    }

    #[test]
    fn condense_q_and_a() {
        let doc =
            Document::new_plain_english_curated("A Q&A platform software for teams at any scales.");
        assert!(doc.tokens.len() >= 3);
        assert!(doc.tokens[2].kind.is_word());
        assert_eq!(doc.tokens[2].span.get_content_string(&doc.source), "Q&A");
    }

    #[test]
    fn dont_allow_mixed_r_and_d_with_q_and_a() {
        let doc = Document::new_plain_english_curated("R&A or Q&D");
        assert_eq!(doc.tokens.len(), 9);
        assert!(doc.tokens[1].kind.is_ampersand() || doc.tokens[7].kind.is_ampersand());
    }

    #[test]
    fn condense_io() {
        let doc = Document::new_plain_english_curated("I/O");
        assert_eq!(doc.tokens.len(), 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn finds_unmatched_quotes_in_document() {
        let raw = r#"
This is a paragraph with a single word "quoted."

This is a second paragraph with no quotes.

This is a third paragraph with a single erroneous "quote.

This is a final paragraph with a weird "quote and a not-weird "quote".
            "#;

        let doc = Document::new_markdown_default_curated(raw);

        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        assert_eq!(
            quote_twins,
            vec![Some(19), Some(16), None, None, Some(89), Some(87)]
        )
    }

    #[test]
    fn issue_1901() {
        let raw = r#"
"A quoted line"
"A quote without a closing mark
"Another quoted lined"
"The last quoted line"
            "#;

        let doc = Document::new_markdown_default_curated(raw);

        let quote_twins: Vec<_> = doc
            .iter_quotes()
            .map(|t| t.kind.as_quote().unwrap().twin_loc)
            .collect();

        assert_eq!(
            quote_twins,
            vec![
                Some(6),
                Some(0),
                None,
                Some(27),
                Some(21),
                Some(37),
                Some(29)
            ]
        )
    }
}