harper_core/
document.rs

1use std::cmp::Ordering;
2use std::collections::VecDeque;
3use std::fmt::Display;
4
5use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
6use paste::paste;
7
8use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
9use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
10use crate::patterns::WordSet;
11use crate::punctuation::Punctuation;
12use crate::spell::{Dictionary, FstDictionary};
13use crate::vec_ext::VecExt;
14use crate::{CharStringExt, FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
15use crate::{OrdinalSuffix, Span};
16
17/// A document containing some amount of lexed and parsed English text.
18#[derive(Debug, Clone)]
19pub struct Document {
20    source: Lrc<Vec<char>>,
21    tokens: Vec<Token>,
22}
23
24impl Default for Document {
25    fn default() -> Self {
26        Self::new("", &PlainEnglish, &FstDictionary::curated())
27    }
28}
29
30impl Document {
    /// Locate all the tokens that intersect a provided span.
    ///
    /// Desperately needs optimization.
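    ///
    /// # Example
    ///
    /// A minimal sketch of looking tokens up by span (assumes `Document` and `Span`
    /// are re-exported at the crate root, as they are elsewhere in `harper_core`):
    ///
    /// ```
    /// use harper_core::{Document, Span};
    ///
    /// let doc = Document::new_plain_english_curated("Hello world.");
    /// // Any token whose span overlaps chars 0..5 ("Hello") is returned.
    /// let indices = doc.token_indices_intersecting(Span::new(0, 5));
    /// assert!(!indices.is_empty());
    /// ```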
34    pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
35        self.tokens()
36            .enumerate()
37            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
38            .collect()
39    }
40
41    /// Locate all the tokens that intersect a provided span and convert them to [`FatToken`]s.
42    ///
43    /// Desperately needs optimization.
44    pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
45        let indices = self.token_indices_intersecting(span);
46
47        indices
48            .into_iter()
49            .map(|i| self.tokens[i].to_fat(&self.source))
50            .collect()
51    }
52
    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
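    ///
    /// # Example
    ///
    /// A minimal sketch (assuming the `parsers` and `spell` modules are public,
    /// matching the imports at the top of this file):
    ///
    /// ```
    /// use harper_core::Document;
    /// use harper_core::parsers::PlainEnglish;
    /// use harper_core::spell::FstDictionary;
    ///
    /// let doc = Document::new("Hello world.", &PlainEnglish, &FstDictionary::curated());
    /// assert_eq!(doc.get_full_string(), "Hello world.");
    /// ```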
55    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
56        let source: Vec<_> = text.chars().collect();
57
58        Self::new_from_vec(Lrc::new(source), parser, dictionary)
59    }
60
61    /// Lexes and parses text to produce a document using a provided language
62    /// parser and the included curated dictionary.
63    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
64        let source: Vec<_> = text.chars().collect();
65
66        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
67    }
68
69    /// Lexes and parses text to produce a document using a provided language
70    /// parser and dictionary.
71    pub fn new_from_vec(
72        source: Lrc<Vec<char>>,
73        parser: &impl Parser,
74        dictionary: &impl Dictionary,
75    ) -> Self {
76        let tokens = parser.parse(&source);
77
78        let mut document = Self { source, tokens };
79        document.parse(dictionary);
80
81        document
82    }
83
84    /// Parse text to produce a document using the built-in [`PlainEnglish`]
85    /// parser and curated dictionary.
86    pub fn new_plain_english_curated(text: &str) -> Self {
87        Self::new(text, &PlainEnglish, &FstDictionary::curated())
88    }
89
90    /// Parse text to produce a document using the built-in [`PlainEnglish`]
91    /// parser and a provided dictionary.
92    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
93        Self::new(text, &PlainEnglish, dictionary)
94    }
95
96    /// Parse text to produce a document using the built-in [`Markdown`] parser
97    /// and curated dictionary.
98    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
99        Self::new(
100            text,
101            &Markdown::new(markdown_options),
102            &FstDictionary::curated(),
103        )
104    }
105
    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and curated dictionary with the default Markdown configuration.
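    ///
    /// # Example
    ///
    /// A minimal sketch (assuming `Document` is re-exported at the crate root):
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_markdown_default_curated("*Hello* world.");
    /// // The raw source, markup included, is still available on the document.
    /// assert_eq!(doc.get_full_string(), "*Hello* world.");
    /// ```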
108    pub fn new_markdown_default_curated(text: &str) -> Self {
109        Self::new_markdown_curated(text, MarkdownOptions::default())
110    }
111
    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and a provided dictionary.
114    pub fn new_markdown(
115        text: &str,
116        markdown_options: MarkdownOptions,
117        dictionary: &impl Dictionary,
118    ) -> Self {
119        Self::new(text, &Markdown::new(markdown_options), dictionary)
120    }
121
    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and a provided dictionary, with the default Markdown configuration.
124    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
125        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
126    }
127
128    /// Re-parse important language constructs.
129    ///
130    /// Should be run after every change to the underlying [`Self::source`].
131    fn parse(&mut self, dictionary: &impl Dictionary) {
132        self.condense_spaces();
133        self.condense_newlines();
134        self.newlines_to_breaks();
135        self.condense_contractions();
136        self.condense_dotted_initialisms();
137        self.condense_number_suffixes();
138        self.condense_ellipsis();
139        self.condense_latin();
140        self.condense_filename_extensions();
141        self.condense_tldr();
142        self.condense_ampersand_pairs();
143        self.condense_slash_pairs();
144        self.match_quotes();
145
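        // Tag parts of speech and noun-phrase membership once per sentence, then
        // copy the results onto each non-whitespace token's word metadata below.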
146        let chunker = burn_chunker();
147        let tagger = brill_tagger();
148
149        for sent in self.tokens.iter_sentences_mut() {
150            let token_strings: Vec<_> = sent
151                .iter()
152                .filter(|t| !t.kind.is_whitespace())
153                .map(|t| t.span.get_content_string(&self.source))
154                .collect();
155
156            let token_tags = tagger.tag_sentence(&token_strings);
157            let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);
158
159            let mut i = 0;
160
161            // Annotate word metadata
162            for token in sent.iter_mut() {
163                if let TokenKind::Word(meta) = &mut token.kind {
164                    let word_source = token.span.get_content(&self.source);
165                    let mut found_meta = dictionary.get_word_metadata(word_source).cloned();
166
167                    if let Some(inner) = &mut found_meta {
168                        inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
169                        inner.np_member = Some(np_flags[i]);
170                    }
171
172                    *meta = found_meta;
173                    i += 1;
174                } else if !token.kind.is_whitespace() {
175                    i += 1;
176                }
177            }
178        }
179    }
180
    /// Convert every run of two or more newlines into a paragraph break.
182    fn newlines_to_breaks(&mut self) {
183        for token in &mut self.tokens {
184            if let TokenKind::Newline(n) = token.kind
185                && n >= 2
186            {
187                token.kind = TokenKind::ParagraphBreak;
188            }
189        }
190    }
191
192    /// Given a list of indices, this function removes the subsequent
193    /// `stretch_len - 1` elements after each index.
194    ///
195    /// Will extend token spans to include removed elements.
196    /// Assumes condensed tokens are contiguous in source text.
197    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
198        // Update spans
199        for idx in indices {
200            let end_tok = self.tokens[idx + stretch_len - 1].clone();
201            let start_tok = &mut self.tokens[*idx];
202
203            start_tok.span.end = end_tok.span.end;
204        }
205
206        // Trim
207        let old = self.tokens.clone();
208        self.tokens.clear();
209
210        // Keep first chunk.
211        self.tokens
212            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);
213
214        let mut iter = indices.iter().peekable();
215
216        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
217            self.tokens.push(old[*a_idx].clone());
218
219            if let Some(b_idx) = b {
220                self.tokens
221                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
222            }
223        }
224
225        // Keep last chunk.
226        self.tokens.extend_from_slice(
227            &old[indices
228                .last()
229                .map(|v| v + stretch_len)
230                .unwrap_or(indices.len())..],
231        );
232    }
233
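    /// Get the token whose span contains the given char index, if any.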
234    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
235        let index = self
236            .tokens
237            .binary_search_by(|t| {
238                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
239                    Ordering::Equal
240                } else {
241                    t.span.start.cmp(&char_index)
242                }
243            })
244            .ok()?;
245
246        Some(&self.tokens[index])
247    }
248
249    /// Defensively attempt to grab a specific token.
250    pub fn get_token(&self, index: usize) -> Option<&Token> {
251        self.tokens.get(index)
252    }
253
254    /// Get a token at a signed offset from a base index, or None if out of bounds.
255    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
256        match base.checked_add_signed(offset) {
257            None => None,
258            Some(idx) => self.get_token(idx),
259        }
260    }
261
262    /// Get an iterator over all the tokens contained in the document.
263    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
264        self.tokens.iter()
265    }
266
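    /// Get an iterator over the document's nominal (noun) phrases, yielded as
    /// contiguous token slices with leading and trailing whitespace trimmed.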
267    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
268        fn is_np_member(t: &Token) -> bool {
269            t.kind
270                .as_word()
271                .and_then(|x| x.as_ref())
272                .and_then(|w| w.np_member)
273                .unwrap_or(false)
274        }
275
276        fn trim(slice: &[Token]) -> &[Token] {
277            let mut start = 0;
278            let mut end = slice.len();
279            while start < end && slice[start].kind.is_whitespace() {
280                start += 1;
281            }
282            while end > start && slice[end - 1].kind.is_whitespace() {
283                end -= 1;
284            }
285            &slice[start..end]
286        }
287
288        self.tokens
289            .as_slice()
290            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
291            .filter_map(|s| {
292                let s = trim(s);
293                if s.iter().any(is_np_member) {
294                    Some(s)
295                } else {
296                    None
297                }
298            })
299    }
300
    /// Get an iterator over all the tokens contained in the document,
    /// converted to [`FatToken`]s.
302    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
303        self.tokens().map(|token| token.to_fat(&self.source))
304    }
305
    /// Get the next or previous word token relative to a base index, if separated by whitespace.
    /// Returns `None` if the next/previous token is not a word or does not exist.
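    ///
    /// # Example
    ///
    /// A minimal sketch (assuming `Document` is re-exported at the crate root):
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Foo bar baz");
    /// // Starting from "Foo" (token 0), step over the space to reach "bar".
    /// let bar = doc.get_next_word_from_offset(0, 1).unwrap();
    /// assert_eq!(doc.get_span_content(&bar.span), ['b', 'a', 'r']);
    /// ```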
308    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
309        // Look for whitespace at the expected offset
310        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
311            return None;
312        }
313        // Now look beyond the whitespace for a word token
314        let word_token = self.get_token_offset(base, offset + offset.signum());
315        let word_token = word_token?;
316        word_token.kind.is_word().then_some(word_token)
317    }
318
    /// Get an iterator over all the tokens contained in the document,
    /// converted to [`FatStringToken`]s.
320    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
321        self.fat_tokens().map(|t| t.into())
322    }
323
324    pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
325        span.get_content(&self.source)
326    }
327
328    pub fn get_span_content_str(&self, span: &Span<char>) -> String {
329        String::from_iter(self.get_span_content(span))
330    }
331
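    /// Get the entire source text of the document as a [`String`].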
332    pub fn get_full_string(&self) -> String {
333        self.get_span_content_str(&Span::new(0, self.source.len()))
334    }
335
336    pub fn get_full_content(&self) -> &[char] {
337        &self.source
338    }
339
340    pub fn get_source(&self) -> &[char] {
341        &self.source
342    }
343
344    pub fn get_tokens(&self) -> &[Token] {
345        &self.tokens
346    }
347
348    /// Searches for quotation marks and fills the
349    /// [`Punctuation::Quote::twin_loc`] field. This is on a best-effort
350    /// basis.
351    ///
352    /// Current algorithm is basic and could use some work.
353    fn match_quotes(&mut self) {
354        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();
355
356        for i in 0..quote_indices.len() / 2 {
357            let a_i = quote_indices[i * 2];
358            let b_i = quote_indices[i * 2 + 1];
359
360            {
361                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
362                a.twin_loc = Some(b_i);
363            }
364
365            {
366                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
367                b.twin_loc = Some(a_i);
368            }
369        }
370    }
371
372    /// Searches for number suffixes and condenses them down into single tokens
373    fn condense_number_suffixes(&mut self) {
374        if self.tokens.len() < 2 {
375            return;
376        }
377
378        let mut replace_starts = Vec::new();
379
380        for idx in 0..self.tokens.len() - 1 {
381            let b = &self.tokens[idx + 1];
382            let a = &self.tokens[idx];
383
384            // TODO: Allow spaces between `a` and `b`
385
386            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind)
387                && let Some(found_suffix) =
388                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
389            {
390                self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
391                replace_starts.push(idx);
392            }
393        }
394
395        self.condense_indices(&replace_starts, 2);
396    }
397
398    /// Searches for multiple sequential space tokens and condenses them down
399    /// into one.
400    fn condense_spaces(&mut self) {
401        let mut cursor = 0;
402        let copy = self.tokens.clone();
403
404        let mut remove_these = VecDeque::new();
405
406        while cursor < self.tokens.len() {
            // Locate a stretch of one or more space tokens.
408            let start_tok = &mut self.tokens[cursor];
409
410            if let TokenKind::Space(start_count) = &mut start_tok.kind {
411                loop {
412                    cursor += 1;
413
414                    if cursor >= copy.len() {
415                        break;
416                    }
417
418                    let child_tok = &copy[cursor];
419
420                    // Only condense adjacent spans
421                    if start_tok.span.end != child_tok.span.start {
422                        break;
423                    }
424
425                    if let TokenKind::Space(n) = child_tok.kind {
426                        *start_count += n;
427                        start_tok.span.end = child_tok.span.end;
428                        remove_these.push_back(cursor);
429                        cursor += 1;
430                    } else {
431                        break;
432                    };
433                }
434            }
435
436            cursor += 1;
437        }
438
439        self.tokens.remove_indices(remove_these);
440    }
441
442    thread_local! {
443        static LATIN_EXPR: Lrc<FirstMatchOf> = Document::uncached_latin_expr();
444    }
445
446    fn uncached_latin_expr() -> Lrc<FirstMatchOf> {
447        Lrc::new(FirstMatchOf::new(vec![
448            Box::new(
449                SequenceExpr::default()
450                    .then(WordSet::new(&["etc", "vs"]))
451                    .then_period(),
452            ),
453            Box::new(
454                SequenceExpr::aco("et")
455                    .then_whitespace()
456                    .t_aco("al")
457                    .then_period(),
458            ),
459        ]))
460    }
461
462    /// Assumes that the first matched token is the canonical one to be condensed into.
463    /// Takes a callback that can be used to retroactively edit the canonical token afterwards.
464    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
465    where
466        F: Fn(&mut Token),
467    {
468        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();
469
470        let mut remove_indices = VecDeque::with_capacity(matches.len());
471
472        for m in matches {
473            remove_indices.extend(m.start + 1..m.end);
474            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
475            edit(&mut self.tokens[m.start]);
476        }
477
478        self.tokens.remove_indices(remove_indices);
479    }
480
481    fn condense_latin(&mut self) {
482        self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
483    }
484
485    /// Searches for multiple sequential newline tokens and condenses them down
486    /// into one.
487    fn condense_newlines(&mut self) {
488        let mut cursor = 0;
489        let copy = self.tokens.clone();
490
491        let mut remove_these = VecDeque::new();
492
493        while cursor < self.tokens.len() {
494            // Locate a stretch of one or more newline tokens.
495            let start_tok = &mut self.tokens[cursor];
496
497            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
498                loop {
499                    cursor += 1;
500
501                    if cursor >= copy.len() {
502                        break;
503                    }
504
505                    let child_tok = &copy[cursor];
506                    if let TokenKind::Newline(n) = child_tok.kind {
507                        *start_count += n;
508                        start_tok.span.end = child_tok.span.end;
509                        remove_these.push_back(cursor);
510                        cursor += 1;
511                    } else {
512                        break;
513                    };
514                }
515            }
516
517            cursor += 1;
518        }
519
520        self.tokens.remove_indices(remove_these);
521    }
522
523    /// Condenses words like "i.e.", "e.g." and "N.S.A." down to single words
524    /// using a state machine.
525    fn condense_dotted_initialisms(&mut self) {
526        if self.tokens.len() < 2 {
527            return;
528        }
529
530        let mut to_remove = VecDeque::new();
531
532        let mut cursor = 1;
533
534        let mut initialism_start = None;
535
536        loop {
537            let a = &self.tokens[cursor - 1];
538            let b = &self.tokens[cursor];
539
540            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();
541
542            if is_initialism_chunk {
543                if initialism_start.is_none() {
544                    initialism_start = Some(cursor - 1);
545                } else {
546                    to_remove.push_back(cursor - 1);
547                }
548
549                to_remove.push_back(cursor);
550                cursor += 1;
551            } else {
552                if let Some(start) = initialism_start {
553                    let end = self.tokens[cursor - 2].span.end;
554                    let start_tok: &mut Token = &mut self.tokens[start];
555                    start_tok.span.end = end;
556                }
557
558                initialism_start = None;
559            }
560
561            cursor += 1;
562
563            if cursor >= self.tokens.len() - 1 {
564                break;
565            }
566        }
567
568        self.tokens.remove_indices(to_remove);
569    }
570
571    /// Condenses likely filename extensions down to single tokens.
572    fn condense_filename_extensions(&mut self) {
573        if self.tokens.len() < 2 {
574            return;
575        }
576
577        let mut to_remove = VecDeque::new();
578
579        let mut cursor = 1;
580
581        let mut ext_start = None;
582
583        loop {
584            // left context, dot, extension, right context
585            let l = self.get_token_offset(cursor, -2);
586            let d = &self.tokens[cursor - 1];
587            let x = &self.tokens[cursor];
588            let r = self.get_token_offset(cursor, 1);
589
590            let is_ext_chunk = d.kind.is_period()
591                && x.kind.is_word()
592                && x.span.len() <= 3
593                && ((l.is_none_or(|t| t.kind.is_whitespace())
594                    && r.is_none_or(|t| t.kind.is_whitespace()))
595                    || (l.is_some_and(|t| t.kind.is_open_round())
596                        && r.is_some_and(|t| t.kind.is_close_round())))
597                && {
598                    let ext_chars = x.span.get_content(&self.source);
599                    ext_chars.iter().all(|c| c.is_ascii_lowercase())
600                        || ext_chars.iter().all(|c| c.is_ascii_uppercase())
601                };
602
603            if is_ext_chunk {
604                if ext_start.is_none() {
605                    ext_start = Some(cursor - 1);
606                    self.tokens[cursor - 1].kind = TokenKind::Unlintable;
607                } else {
608                    to_remove.push_back(cursor - 1);
609                }
610
611                to_remove.push_back(cursor);
612                cursor += 1;
613            } else {
614                if let Some(start) = ext_start {
615                    let end = self.tokens[cursor - 2].span.end;
616                    let start_tok: &mut Token = &mut self.tokens[start];
617                    start_tok.span.end = end;
618                }
619
620                ext_start = None;
621            }
622
623            cursor += 1;
624
625            if cursor >= self.tokens.len() {
626                break;
627            }
628        }
629
630        self.tokens.remove_indices(to_remove);
631    }
632
633    /// Condenses "tl;dr" down to a single word token.
634    fn condense_tldr(&mut self) {
635        if self.tokens.len() < 3 {
636            return;
637        }
638
639        let mut to_remove = VecDeque::new();
640        let mut cursor = 2;
641
642        loop {
643            let tl = &self.tokens[cursor - 2];
            let semicolon = &self.tokens[cursor - 1];
            let dr = &self.tokens[cursor];

            let is_tldr_chunk = tl.kind.is_word()
                && tl.span.len() == 2
                && tl
                    .span
                    .get_content(&self.source)
                    .eq_ignore_ascii_case_chars(&['t', 'l'])
                && semicolon.kind.is_semicolon()
654                && dr.kind.is_word()
655                && dr.span.len() >= 2
656                && dr.span.len() <= 3
657                && dr
658                    .span
659                    .get_content(&self.source)
660                    .eq_any_ignore_ascii_case_chars(&[&['d', 'r'], &['d', 'r', 's']]);
661
662            if is_tldr_chunk {
663                // Update the first token to be the full "tl;dr" as a word
664                self.tokens[cursor - 2].span = Span::new(
665                    self.tokens[cursor - 2].span.start,
666                    self.tokens[cursor].span.end,
667                );
668
669                // Mark the semicolon and "dr" tokens for removal
670                to_remove.push_back(cursor - 1);
671                to_remove.push_back(cursor);
672            }
673
674            // Skip ahead since we've processed these tokens
675            cursor += 1;
676
677            if cursor >= self.tokens.len() {
678                break;
679            }
680        }
681
682        // Remove the marked tokens in reverse order to maintain correct indices
683        self.tokens.remove_indices(to_remove);
684    }
685
686    /// Allows condensing of delimited pairs of tokens into a single token.
687    ///
688    /// # Arguments
689    ///
690    /// * `is_delimiter` - A function that returns `true` if the token is a delimiter.
691    /// * `valid_pairs` - A slice of tuples representing the valid pairs of tokens to condense.
692    ///
693    fn condense_delimited_pairs<F>(&mut self, is_delimiter: F, valid_pairs: &[(char, char)])
694    where
695        F: Fn(&TokenKind) -> bool,
696    {
697        if self.tokens.len() < 3 {
698            return;
699        }
700
701        let mut to_remove = VecDeque::new();
702        let mut cursor = 2;
703
704        loop {
705            let l1 = &self.tokens[cursor - 2];
706            let delim = &self.tokens[cursor - 1];
707            let l2 = &self.tokens[cursor];
708
709            let is_delimited_chunk = l1.kind.is_word()
710                && l1.span.len() == 1
711                && is_delimiter(&delim.kind)
712                && l2.kind.is_word()
713                && l2.span.len() == 1;
714
715            if is_delimited_chunk {
716                let (l1, l2) = (
717                    l1.span.get_content(&self.source).first(),
718                    l2.span.get_content(&self.source).first(),
719                );
720
721                let is_valid_pair = match (l1, l2) {
722                    (Some(l1), Some(l2)) => {
723                        let pair = (l1.to_ascii_lowercase(), l2.to_ascii_lowercase());
724                        valid_pairs.contains(&pair)
725                    }
726                    _ => false,
727                };
728
729                if is_valid_pair {
730                    self.tokens[cursor - 2].span = Span::new(
731                        self.tokens[cursor - 2].span.start,
732                        self.tokens[cursor].span.end,
733                    );
734                    to_remove.push_back(cursor - 1);
735                    to_remove.push_back(cursor);
736                }
737            }
738
739            cursor += 1;
740            if cursor >= self.tokens.len() {
741                break;
742            }
743        }
744
745        self.tokens.remove_indices(to_remove);
746    }
747
    /// Condenses "ampersand pairs" such as "R&D" or "Q&A" into single tokens.
749    fn condense_ampersand_pairs(&mut self) {
750        self.condense_delimited_pairs(
751            |kind| kind.is_ampersand(),
752            &[
753                ('b', 'b'), // bed & breakfast
754                ('b', 'w'), // black & white
755                ('g', 't'), // gin & tonic
756                ('k', 'r'), // Kernighan & Ritchie
757                ('q', 'a'), // question & answer
758                ('r', 'b'), // rhythm & blues
759                ('r', 'd'), // research & development
760                ('r', 'r'), // rest & relaxation
761                ('s', 'p'), // Standard & Poor's
762            ],
763        );
764    }
765
    /// Condenses "slash pairs" such as "I/O" into single tokens.
767    fn condense_slash_pairs(&mut self) {
768        self.condense_delimited_pairs(
769            |kind| kind.is_slash(),
770            &[
771                ('a', 'c'), // aircon; alternating current
772                ('b', 'w'), // black and white
773                ('c', 'o'), // care of
774                ('d', 'c'), // direct current
775                ('d', 'l'), // download
776                ('i', 'o'), // input/output
777                ('j', 'k'), // just kidding
778                ('n', 'a'), // not applicable
779                ('r', 'c'), // radio control
780                ('s', 'n'), // serial number
781                ('y', 'n'), // yes/no
782                ('y', 'o'), // years old
783            ],
784        );
785    }
786
787    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
788        let period = SequenceExpr::default().then_period();
789        Lrc::new(Repeating::new(Box::new(period), 2))
790    }
791
792    thread_local! {
793        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
794    }
795
796    fn condense_ellipsis(&mut self) {
797        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
798        self.condense_expr(&expr, |tok| {
799            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
800        });
801    }
802
803    fn uncached_contraction_expr() -> Lrc<SequenceExpr> {
804        Lrc::new(
805            SequenceExpr::default()
806                .then_any_word()
807                .then_apostrophe()
808                .then_any_word(),
809        )
810    }
811
812    thread_local! {
813        static CONTRACTION_EXPR: Lrc<SequenceExpr> = Document::uncached_contraction_expr();
814    }
815
816    /// Searches for contractions and condenses them down into single
817    /// tokens.
818    fn condense_contractions(&mut self) {
819        let expr = Self::CONTRACTION_EXPR.with(|v| v.clone());
820
821        self.condense_expr(&expr, |_| {})
822    }
823}
824
/// Creates functions necessary to implement [`TokenStringExt`] on a document.
826macro_rules! create_fns_on_doc {
827    ($thing:ident) => {
828        paste! {
829            fn [< first_ $thing >](&self) -> Option<&Token> {
830                self.tokens.[< first_ $thing >]()
831            }
832
833            fn [< last_ $thing >](&self) -> Option<&Token> {
834                self.tokens.[< last_ $thing >]()
835            }
836
837            fn [< last_ $thing _index>](&self) -> Option<usize> {
838                self.tokens.[< last_ $thing _index >]()
839            }
840
841            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
842                self.tokens.[< iter_ $thing _indices >]()
843            }
844
845            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
846                self.tokens.[< iter_ $thing s >]()
847            }
848        }
849    };
850}
851
852impl TokenStringExt for Document {
853    create_fns_on_doc!(adjective);
854    create_fns_on_doc!(apostrophe);
855    create_fns_on_doc!(at);
856    create_fns_on_doc!(chunk_terminator);
857    create_fns_on_doc!(comma);
858    create_fns_on_doc!(conjunction);
859    create_fns_on_doc!(currency);
860    create_fns_on_doc!(ellipsis);
861    create_fns_on_doc!(hostname);
862    create_fns_on_doc!(likely_homograph);
863    create_fns_on_doc!(noun);
864    create_fns_on_doc!(number);
865    create_fns_on_doc!(paragraph_break);
866    create_fns_on_doc!(pipe);
867    create_fns_on_doc!(preposition);
868    create_fns_on_doc!(punctuation);
869    create_fns_on_doc!(quote);
870    create_fns_on_doc!(sentence_terminator);
871    create_fns_on_doc!(space);
872    create_fns_on_doc!(unlintable);
873    create_fns_on_doc!(verb);
874    create_fns_on_doc!(word);
875    create_fns_on_doc!(word_like);
876
877    fn first_sentence_word(&self) -> Option<&Token> {
878        self.tokens.first_sentence_word()
879    }
880
881    fn first_non_whitespace(&self) -> Option<&Token> {
882        self.tokens.first_non_whitespace()
883    }
884
885    fn span(&self) -> Option<Span<char>> {
886        self.tokens.span()
887    }
888
889    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
890        self.tokens.iter_linking_verb_indices()
891    }
892
893    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
894        self.tokens.iter_linking_verbs()
895    }
896
897    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
898        self.tokens.iter_chunks()
899    }
900
901    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
902        self.tokens.iter_paragraphs()
903    }
904
905    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
906        self.tokens.iter_sentences()
907    }
908
909    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
910        self.tokens.iter_sentences_mut()
911    }
912}
913
914impl Display for Document {
915    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
916        for token in &self.tokens {
917            write!(f, "{}", self.get_span_content_str(&token.span))?;
918        }
919
920        Ok(())
921    }
922}
923
924#[cfg(test)]
925mod tests {
926    use itertools::Itertools;
927
928    use super::Document;
929    use crate::{Span, parsers::MarkdownOptions};
930
931    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
932        let document = Document::new_plain_english_curated(text);
933
934        assert_eq!(document.tokens.len(), final_tok_count);
935
936        let document = Document::new_markdown_curated(text, MarkdownOptions::default());
937
938        assert_eq!(document.tokens.len(), final_tok_count);
939    }
940
941    #[test]
942    fn simple_contraction() {
943        assert_condensed_contractions("isn't", 1);
944    }
945
946    #[test]
947    fn simple_contraction2() {
948        assert_condensed_contractions("wasn't", 1);
949    }
950
951    #[test]
952    fn simple_contraction3() {
953        assert_condensed_contractions("There's", 1);
954    }
955
956    #[test]
957    fn medium_contraction() {
958        assert_condensed_contractions("isn't wasn't", 3);
959    }
960
961    #[test]
962    fn medium_contraction2() {
963        assert_condensed_contractions("There's no way", 5);
964    }
965
966    #[test]
967    fn selects_token_at_char_index() {
968        let text = "There were three little pigs. They built three little homes.";
969        let document = Document::new_plain_english_curated(text);
970
971        let got = document.get_token_at_char_index(19).unwrap();
972
973        assert!(got.kind.is_word());
974        assert_eq!(got.span, Span::new(17, 23));
975    }
976
977    fn assert_token_count(source: &str, count: usize) {
978        let document = Document::new_plain_english_curated(source);
979
980        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
981        assert_eq!(document.tokens.len(), count);
982    }
983
984    #[test]
985    fn condenses_number_suffixes() {
986        assert_token_count("1st", 1);
987        assert_token_count("This is the 2nd test", 9);
988        assert_token_count("This is the 3rd test", 9);
989        assert_token_count(
990            "It works even with weird capitalization like this: 600nD",
991            18,
992        );
993    }
994
995    #[test]
996    fn condenses_ie() {
997        assert_token_count("There is a thing (i.e. that one)", 15);
998        assert_token_count("We are trying to condense \"i.e.\"", 13);
999        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
1000    }
1001
1002    #[test]
1003    fn condenses_eg() {
1004        assert_token_count("We are trying to condense \"e.g.\"", 13);
1005        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
1006    }
1007
1008    #[test]
1009    fn condenses_nsa() {
1010        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
1011    }
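
    // A sketch exercising the Latin-abbreviation pass (`condense_latin`):
    // "etc." and "et al." should each condense down to a single token.
    #[test]
    fn condenses_latin_abbreviations() {
        assert_token_count("etc.", 1);
        assert_token_count("et al.", 1);
    }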
1012
1013    #[test]
1014    fn parses_ellipsis() {
1015        assert_token_count("...", 1);
1016    }
1017
1018    #[test]
1019    fn parses_long_ellipsis() {
1020        assert_token_count(".....", 1);
1021    }
1022
1023    #[test]
1024    fn parses_short_ellipsis() {
1025        assert_token_count("..", 1);
1026    }
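
    // A sketch of the `newlines_to_breaks` pass: two consecutive newlines are
    // condensed and re-tagged as a single paragraph break.
    #[test]
    fn condenses_double_newline_to_paragraph_break() {
        let doc = Document::new_plain_english_curated("One.\n\nTwo.");
        assert!(
            doc.tokens()
                .any(|t| matches!(t.kind, crate::TokenKind::ParagraphBreak))
        );
    }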
1027
1028    #[test]
1029    fn selects_token_at_offset() {
1030        let doc = Document::new_plain_english_curated("Foo bar baz");
1031
1032        let tok = doc.get_token_offset(1, -1).unwrap();
1033
1034        assert_eq!(tok.span, Span::new(0, 3));
1035    }
1036
1037    #[test]
1038    fn cant_select_token_before_start() {
1039        let doc = Document::new_plain_english_curated("Foo bar baz");
1040
1041        let tok = doc.get_token_offset(0, -1);
1042
1043        assert!(tok.is_none());
1044    }
1045
1046    #[test]
1047    fn select_next_word_pos_offset() {
1048        let doc = Document::new_plain_english_curated("Foo bar baz");
1049
1050        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
1051        let bar = doc.get_span_content(&bar.span);
1052        assert_eq!(bar, ['b', 'a', 'r']);
1053    }
1054
1055    #[test]
1056    fn select_next_word_neg_offset() {
1057        let doc = Document::new_plain_english_curated("Foo bar baz");
1058
1059        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
1060        let bar = doc.get_span_content(&bar.span);
1061        assert_eq!(bar, ['F', 'o', 'o']);
1062    }
1063
1064    #[test]
1065    fn cant_select_next_word_not_from_whitespace() {
1066        let doc = Document::new_plain_english_curated("Foo bar baz");
1067
1068        let tok = doc.get_next_word_from_offset(0, 2);
1069
1070        assert!(tok.is_none());
1071    }
1072
1073    #[test]
1074    fn cant_select_next_word_before_start() {
1075        let doc = Document::new_plain_english_curated("Foo bar baz");
1076
1077        let tok = doc.get_next_word_from_offset(0, -1);
1078
1079        assert!(tok.is_none());
1080    }
1081
1082    #[test]
1083    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
1084        let doc = Document::new_plain_english_curated("Foo, bar, baz");
1085
1086        let tok = doc.get_next_word_from_offset(0, 1);
1087
1088        assert!(tok.is_none());
1089    }
1090
1091    #[test]
1092    fn cant_select_next_word_with_punctuation_after_whitespace() {
1093        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");
1094
1095        let tok = doc.get_next_word_from_offset(0, 1);
1096
1097        assert!(tok.is_none());
1098    }
1099
1100    #[test]
1101    fn condenses_filename_extensions() {
1102        let doc = Document::new_plain_english_curated(".c and .exe and .js");
1103        assert!(doc.tokens[0].kind.is_unlintable());
1104        assert!(doc.tokens[4].kind.is_unlintable());
1105        assert!(doc.tokens[8].kind.is_unlintable());
1106    }
1107
1108    #[test]
1109    fn condense_filename_extension_ok_at_start_and_end() {
1110        let doc = Document::new_plain_english_curated(".c and .EXE");
1111        assert!(doc.tokens.len() == 5);
1112        assert!(doc.tokens[0].kind.is_unlintable());
1113        assert!(doc.tokens[4].kind.is_unlintable());
1114    }
1115
1116    #[test]
1117    fn doesnt_condense_filename_extensions_with_mixed_case() {
1118        let doc = Document::new_plain_english_curated(".c and .Exe");
1119        assert!(doc.tokens.len() == 6);
1120        assert!(doc.tokens[0].kind.is_unlintable());
1121        assert!(doc.tokens[4].kind.is_punctuation());
1122        assert!(doc.tokens[5].kind.is_word());
1123    }
1124
1125    #[test]
1126    fn doesnt_condense_filename_extensions_with_non_letters() {
1127        let doc = Document::new_plain_english_curated(".COM and .C0M");
1128        assert!(doc.tokens.len() == 6);
1129        assert!(doc.tokens[0].kind.is_unlintable());
1130        assert!(doc.tokens[4].kind.is_punctuation());
1131        assert!(doc.tokens[5].kind.is_word());
1132    }
1133
1134    #[test]
1135    fn doesnt_condense_filename_extensions_longer_than_three() {
1136        let doc = Document::new_plain_english_curated(".dll and .dlls");
1137        assert!(doc.tokens.len() == 6);
1138        assert!(doc.tokens[0].kind.is_unlintable());
1139        assert!(doc.tokens[4].kind.is_punctuation());
1140        assert!(doc.tokens[5].kind.is_word());
1141    }
1142
1143    #[test]
1144    fn condense_filename_extension_in_parens() {
1145        let doc = Document::new_plain_english_curated(
1146            "true for the manual installation when trying to run the executable(.exe) after a manual download",
1147        );
1148        assert!(doc.tokens.len() > 23);
1149        assert!(doc.tokens[21].kind.is_open_round());
1150        assert!(doc.tokens[22].kind.is_unlintable());
1151        assert!(doc.tokens[23].kind.is_close_round());
1152    }
1153
1154    #[test]
1155    fn condense_tldr_uppercase() {
1156        let doc = Document::new_plain_english_curated("TL;DR");
1157        assert!(doc.tokens.len() == 1);
1158        assert!(doc.tokens[0].kind.is_word());
1159        assert!(doc.tokens[0].span.len() == 5);
1160    }
1161
1162    #[test]
1163    fn condense_tldr_lowercase() {
1164        let doc = Document::new_plain_english_curated("tl;dr");
1165        assert!(doc.tokens.len() == 1);
1166        assert!(doc.tokens[0].kind.is_word());
1167    }
1168
1169    #[test]
1170    fn condense_tldr_mixed_case_1() {
1171        let doc = Document::new_plain_english_curated("tl;DR");
1172        assert!(doc.tokens.len() == 1);
1173        assert!(doc.tokens[0].kind.is_word());
1174    }
1175
1176    #[test]
1177    fn condense_tldr_mixed_case_2() {
1178        let doc = Document::new_plain_english_curated("TL;Dr");
1179        assert!(doc.tokens.len() == 1);
1180        assert!(doc.tokens[0].kind.is_word());
1181    }
1182
1183    #[test]
    fn condense_tldr_plural() {
1185        let doc = Document::new_plain_english_curated(
1186            "managing the flow between components to produce relevant TL;DRs of current news articles",
1187        );
1188        // no token is a punctuation token - only words with whitespace between
1189        assert!(
1190            doc.tokens
1191                .iter()
1192                .all(|t| t.kind.is_word() || t.kind.is_whitespace())
1193        );
1194        // one of the word tokens contains a ';' character
1195        let tldrs = doc
1196            .tokens
1197            .iter()
1198            .filter(|t| t.span.get_content(&doc.source).contains(&';'))
1199            .collect_vec();
1200        assert!(tldrs.len() == 1);
1201        assert!(tldrs[0].span.get_content_string(&doc.source) == "TL;DRs");
1202    }
1203
1204    #[test]
1205    fn condense_r_and_d_caps() {
1206        let doc = Document::new_plain_english_curated("R&D");
1207        assert!(doc.tokens.len() == 1);
1208        assert!(doc.tokens[0].kind.is_word());
1209    }
1210
1211    #[test]
1212    fn condense_r_and_d_mixed_case() {
1213        let doc = Document::new_plain_english_curated("R&d");
1214        assert!(doc.tokens.len() == 1);
1215        assert!(doc.tokens[0].kind.is_word());
1216    }
1217
1218    #[test]
1219    fn condense_r_and_d_lowercase() {
1220        let doc = Document::new_plain_english_curated("r&d");
1221        assert!(doc.tokens.len() == 1);
1222        assert!(doc.tokens[0].kind.is_word());
1223    }
1224
1225    #[test]
1226    fn dont_condense_r_and_d_with_spaces() {
1227        let doc = Document::new_plain_english_curated("R & D");
1228        assert!(doc.tokens.len() == 5);
1229        assert!(doc.tokens[0].kind.is_word());
1230        assert!(doc.tokens[1].kind.is_whitespace());
1231        assert!(doc.tokens[2].kind.is_ampersand());
1232        assert!(doc.tokens[3].kind.is_whitespace());
1233        assert!(doc.tokens[4].kind.is_word());
1234    }
1235
1236    #[test]
1237    fn condense_q_and_a() {
1238        let doc =
1239            Document::new_plain_english_curated("A Q&A platform software for teams at any scales.");
1240        assert!(doc.tokens.len() >= 3);
1241        assert!(doc.tokens[2].kind.is_word());
1242        assert!(doc.tokens[2].span.get_content_string(&doc.source) == "Q&A");
1243    }
1244
1245    #[test]
1246    fn dont_allow_mixed_r_and_d_with_q_and_a() {
1247        let doc = Document::new_plain_english_curated("R&A or Q&D");
1248        assert!(doc.tokens.len() == 9);
1249        assert!(doc.tokens[1].kind.is_ampersand() || doc.tokens[7].kind.is_ampersand());
1250    }
1251
1252    #[test]
1253    fn condense_io() {
1254        let doc = Document::new_plain_english_curated("I/O");
1255        assert!(doc.tokens.len() == 1);
1256        assert!(doc.tokens[0].kind.is_word());
1257    }
1258}