harper_core/document.rs

1use std::cmp::Ordering;
2use std::collections::VecDeque;
3use std::fmt::Display;
4
5use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
6use paste::paste;
7
8use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
9use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
10use crate::patterns::WordSet;
11use crate::punctuation::Punctuation;
12use crate::spell::{Dictionary, FstDictionary};
13use crate::vec_ext::VecExt;
14use crate::{CharStringExt, FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
15use crate::{OrdinalSuffix, Span};
16
17/// A document containing some amount of lexed and parsed English text.
18#[derive(Debug, Clone)]
19pub struct Document {
20    source: Lrc<Vec<char>>,
21    tokens: Vec<Token>,
22}
23
24impl Default for Document {
25    fn default() -> Self {
26        Self::new("", &PlainEnglish, &FstDictionary::curated())
27    }
28}
29
30impl Document {
31    /// Locate all the tokens that intersect a provided span.
32    ///
33    /// Desperately needs optimization.
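    ///
    /// A minimal usage sketch (assuming the crate-root re-exports of
    /// `Document` and `Span`):
    ///
    /// ```
    /// use harper_core::{Document, Span};
    ///
    /// let doc = Document::new_plain_english_curated("Hello world.");
    /// // Every token overlapping the first five characters is reported.
    /// let indices = doc.token_indices_intersecting(Span::new(0, 5));
    /// assert!(!indices.is_empty());
    /// ```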
34    pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
35        self.tokens()
36            .enumerate()
37            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
38            .collect()
39    }
40
41    /// Locate all the tokens that intersect a provided span and convert them to [`FatToken`]s.
42    ///
43    /// Desperately needs optimization.
44    pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
45        let indices = self.token_indices_intersecting(span);
46
47        indices
48            .into_iter()
49            .map(|i| self.tokens[i].to_fat(&self.source))
50            .collect()
51    }
52
53    /// Lexes and parses text to produce a document using a provided language
54    /// parser and dictionary.
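    ///
    /// A minimal usage sketch (assuming `PlainEnglish` and `FstDictionary`
    /// are reachable through the crate's public `parsers` and `spell` modules):
    ///
    /// ```
    /// use harper_core::Document;
    /// use harper_core::parsers::PlainEnglish;
    /// use harper_core::spell::FstDictionary;
    ///
    /// let doc = Document::new("Hello world.", &PlainEnglish, &FstDictionary::curated());
    /// assert!(!doc.get_tokens().is_empty());
    /// ```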
55    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
56        let source: Vec<_> = text.chars().collect();
57
58        Self::new_from_vec(Lrc::new(source), parser, dictionary)
59    }
60
61    /// Lexes and parses text to produce a document using a provided language
62    /// parser and the included curated dictionary.
63    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
64        let source: Vec<_> = text.chars().collect();
65
66        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
67    }
68
69    /// Lexes and parses text to produce a document using a provided language
70    /// parser and dictionary.
71    pub fn new_from_vec(
72        source: Lrc<Vec<char>>,
73        parser: &impl Parser,
74        dictionary: &impl Dictionary,
75    ) -> Self {
76        let tokens = parser.parse(&source);
77
78        let mut document = Self { source, tokens };
79        document.parse(dictionary);
80
81        document
82    }
83
84    /// Parse text to produce a document using the built-in [`PlainEnglish`]
85    /// parser and curated dictionary.
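    ///
    /// A minimal usage sketch:
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// // Contractions such as "isn't" are condensed into a single token.
    /// let doc = Document::new_plain_english_curated("isn't");
    /// assert_eq!(doc.get_tokens().len(), 1);
    /// ```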
86    pub fn new_plain_english_curated(text: &str) -> Self {
87        Self::new(text, &PlainEnglish, &FstDictionary::curated())
88    }
89
90    /// Parse text to produce a document using the built-in [`PlainEnglish`]
91    /// parser and a provided dictionary.
92    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
93        Self::new(text, &PlainEnglish, dictionary)
94    }
95
96    /// Parse text to produce a document using the built-in [`Markdown`] parser
97    /// and curated dictionary.
98    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
99        Self::new(
100            text,
101            &Markdown::new(markdown_options),
102            &FstDictionary::curated(),
103        )
104    }
105
106    /// Parse text to produce a document using the built-in [`Markdown`] parser
107    /// and curated dictionary with the default Markdown configuration.
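    ///
    /// A minimal usage sketch:
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_markdown_default_curated("*Hello* world.");
    /// assert!(!doc.get_tokens().is_empty());
    /// ```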
108    pub fn new_markdown_default_curated(text: &str) -> Self {
109        Self::new_markdown_curated(text, MarkdownOptions::default())
110    }
111
112    /// Parse text to produce a document using the built-in [`Markdown`]
113    /// parser and a provided dictionary.
114    pub fn new_markdown(
115        text: &str,
116        markdown_options: MarkdownOptions,
117        dictionary: &impl Dictionary,
118    ) -> Self {
119        Self::new(text, &Markdown::new(markdown_options), dictionary)
120    }
121
122    /// Parse text to produce a document using the built-in [`Markdown`]
123    /// parser and a provided dictionary, with the default Markdown configuration.
124    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
125        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
126    }
127
128    /// Re-parse important language constructs.
129    ///
130    /// Should be run after every change to the underlying [`Self::source`].
131    fn parse(&mut self, dictionary: &impl Dictionary) {
132        self.condense_spaces();
133        self.condense_newlines();
134        self.newlines_to_breaks();
135        self.condense_contractions();
136        self.condense_dotted_initialisms();
137        self.condense_number_suffixes();
138        self.condense_ellipsis();
139        self.condense_latin();
140        self.condense_filename_extensions();
141        self.condense_tldr();
142        self.condense_ampersand_pairs();
143        self.condense_slash_pairs();
144        self.match_quotes();
145
146        let chunker = burn_chunker();
147        let tagger = brill_tagger();
148
149        for sent in self.tokens.iter_sentences_mut() {
150            let token_strings: Vec<_> = sent
151                .iter()
152                .filter(|t| !t.kind.is_whitespace())
153                .map(|t| t.span.get_content_string(&self.source))
154                .collect();
155
156            let token_tags = tagger.tag_sentence(&token_strings);
157            let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);
158
159            let mut i = 0;
160
161            // Annotate DictWord metadata
162            for token in sent.iter_mut() {
163                if let TokenKind::Word(meta) = &mut token.kind {
164                    let word_source = token.span.get_content(&self.source);
165                    let mut found_meta = dictionary
166                        .get_lexeme_metadata(word_source)
167                        .map(|c| c.into_owned());
168
169                    if let Some(inner) = &mut found_meta {
170                        inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
171                        inner.np_member = Some(np_flags[i]);
172                    }
173
174                    *meta = found_meta;
175                    i += 1;
176                } else if !token.kind.is_whitespace() {
177                    i += 1;
178                }
179            }
180        }
181    }
182
183    /// Convert every run of two or more newlines into a paragraph break.
184    fn newlines_to_breaks(&mut self) {
185        for token in &mut self.tokens {
186            if let TokenKind::Newline(n) = token.kind
187                && n >= 2
188            {
189                token.kind = TokenKind::ParagraphBreak;
190            }
191        }
192    }
193
194    /// Given a list of indices, this function removes the `stretch_len - 1`
195    /// elements that follow each index.
196    ///
197    /// Will extend token spans to include removed elements.
198    /// Assumes condensed tokens are contiguous in source text.
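    ///
    /// For example, with `indices = [1, 4]` and `stretch_len = 2`, tokens `2`
    /// and `5` are removed, and the spans of tokens `1` and `4` are widened to
    /// cover the removed neighbors.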
199    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
200        // Update spans
201        for idx in indices {
202            let end_tok = self.tokens[idx + stretch_len - 1].clone();
203            let start_tok = &mut self.tokens[*idx];
204
205            start_tok.span.end = end_tok.span.end;
206        }
207
208        // Trim
209        let old = self.tokens.clone();
210        self.tokens.clear();
211
212        // Keep first chunk.
213        self.tokens
214            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);
215
216        let mut iter = indices.iter().peekable();
217
218        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
219            self.tokens.push(old[*a_idx].clone());
220
221            if let Some(b_idx) = b {
222                self.tokens
223                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
224            }
225        }
226
227        // Keep last chunk.
228        self.tokens.extend_from_slice(
229            &old[indices
230                .last()
231                .map(|v| v + stretch_len)
232                .unwrap_or(indices.len())..],
233        );
234    }
235
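    /// Get the token, if any, whose span contains the provided char index.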
236    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
237        let index = self
238            .tokens
239            .binary_search_by(|t| {
240                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
241                    Ordering::Equal
242                } else {
243                    t.span.start.cmp(&char_index)
244                }
245            })
246            .ok()?;
247
248        Some(&self.tokens[index])
249    }
250
251    /// Defensively attempt to grab a specific token.
252    pub fn get_token(&self, index: usize) -> Option<&Token> {
253        self.tokens.get(index)
254    }
255
256    /// Get a token at a signed offset from a base index, or None if out of bounds.
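    ///
    /// A minimal usage sketch:
    ///
    /// ```
    /// use harper_core::{Document, Span};
    ///
    /// let doc = Document::new_plain_english_curated("Foo bar baz");
    /// // One token to the left of the first space is the word "Foo".
    /// assert_eq!(doc.get_token_offset(1, -1).unwrap().span, Span::new(0, 3));
    /// // Offsets that land before the start of the document yield `None`.
    /// assert!(doc.get_token_offset(0, -1).is_none());
    /// ```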
257    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
258        match base.checked_add_signed(offset) {
259            None => None,
260            Some(idx) => self.get_token(idx),
261        }
262    }
263
264    /// Get an iterator over all the tokens contained in the document.
265    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
266        self.tokens.iter()
267    }
268
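    /// Iterate over runs of tokens whose words the chunker marked as members
    /// of a nominal phrase, with leading and trailing whitespace trimmed from
    /// each run.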
269    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
270        fn is_np_member(t: &Token) -> bool {
271            t.kind
272                .as_word()
273                .and_then(|x| x.as_ref())
274                .and_then(|w| w.np_member)
275                .unwrap_or(false)
276        }
277
278        fn trim(slice: &[Token]) -> &[Token] {
279            let mut start = 0;
280            let mut end = slice.len();
281            while start < end && slice[start].kind.is_whitespace() {
282                start += 1;
283            }
284            while end > start && slice[end - 1].kind.is_whitespace() {
285                end -= 1;
286            }
287            &slice[start..end]
288        }
289
290        self.tokens
291            .as_slice()
292            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
293            .filter_map(|s| {
294                let s = trim(s);
295                if s.iter().any(is_np_member) {
296                    Some(s)
297                } else {
298                    None
299                }
300            })
301    }
302
303    /// Get an iterator over all the tokens in the document, converted to [`FatToken`]s.
304    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
305        self.tokens().map(|token| token.to_fat(&self.source))
306    }
307
308    /// Get the next or previous word token relative to a base index, if separated by whitespace.
309    /// Returns None if the next/previous token is not a word or does not exist.
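    ///
    /// A minimal usage sketch:
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Foo bar baz");
    /// // From "Foo", the token at offset 1 is whitespace, so the word beyond
    /// // it ("bar") is returned.
    /// let bar = doc.get_next_word_from_offset(0, 1).unwrap();
    /// assert_eq!(doc.get_span_content(&bar.span), ['b', 'a', 'r']);
    /// ```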
310    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
311        // Look for whitespace at the expected offset
312        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
313            return None;
314        }
315        // Now look beyond the whitespace for a word token
316        let word_token = self.get_token_offset(base, offset + offset.signum());
317        let word_token = word_token?;
318        word_token.kind.is_word().then_some(word_token)
319    }
320
321    /// Get an iterator over all the tokens in the document, converted to [`FatStringToken`]s.
322    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
323        self.fat_tokens().map(|t| t.into())
324    }
325
326    pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
327        span.get_content(&self.source)
328    }
329
330    pub fn get_span_content_str(&self, span: &Span<char>) -> String {
331        String::from_iter(self.get_span_content(span))
332    }
333
334    pub fn get_full_string(&self) -> String {
335        self.get_span_content_str(&Span::new(0, self.source.len()))
336    }
337
338    pub fn get_full_content(&self) -> &[char] {
339        &self.source
340    }
341
342    pub fn get_source(&self) -> &[char] {
343        &self.source
344    }
345
346    pub fn get_tokens(&self) -> &[Token] {
347        &self.tokens
348    }
349
350    /// Searches for quotation marks and fills the
351    /// [`Punctuation::Quote::twin_loc`] field. This is on a best-effort
352    /// basis.
353    ///
354    /// Current algorithm is basic and could use some work.
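    ///
    /// For example, in `"Foo" bar "baz"` the first and second quote tokens are
    /// marked as twins, as are the third and fourth.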
355    fn match_quotes(&mut self) {
356        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();
357
358        for i in 0..quote_indices.len() / 2 {
359            let a_i = quote_indices[i * 2];
360            let b_i = quote_indices[i * 2 + 1];
361
362            {
363                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
364                a.twin_loc = Some(b_i);
365            }
366
367            {
368                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
369                b.twin_loc = Some(a_i);
370            }
371        }
372    }
373
374    /// Searches for number suffixes and condenses them down into single tokens.
375    fn condense_number_suffixes(&mut self) {
376        if self.tokens.len() < 2 {
377            return;
378        }
379
380        let mut replace_starts = Vec::new();
381
382        for idx in 0..self.tokens.len() - 1 {
383            let b = &self.tokens[idx + 1];
384            let a = &self.tokens[idx];
385
386            // TODO: Allow spaces between `a` and `b`
387
388            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind)
389                && let Some(found_suffix) =
390                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
391            {
392                self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
393                replace_starts.push(idx);
394            }
395        }
396
397        self.condense_indices(&replace_starts, 2);
398    }
399
400    /// Searches for multiple sequential space tokens and condenses them down
401    /// into one.
402    fn condense_spaces(&mut self) {
403        let mut cursor = 0;
404        let copy = self.tokens.clone();
405
406        let mut remove_these = VecDeque::new();
407
408        while cursor < self.tokens.len() {
409            // Locate a stretch of one or more space tokens.
410            let start_tok = &mut self.tokens[cursor];
411
412            if let TokenKind::Space(start_count) = &mut start_tok.kind {
413                loop {
414                    cursor += 1;
415
416                    if cursor >= copy.len() {
417                        break;
418                    }
419
420                    let child_tok = &copy[cursor];
421
422                    // Only condense adjacent spans
423                    if start_tok.span.end != child_tok.span.start {
424                        break;
425                    }
426
427                    if let TokenKind::Space(n) = child_tok.kind {
428                        *start_count += n;
429                        start_tok.span.end = child_tok.span.end;
430                        remove_these.push_back(cursor);
431                        cursor += 1;
432                    } else {
433                        break;
434                    };
435                }
436            }
437
438            cursor += 1;
439        }
440
441        self.tokens.remove_indices(remove_these);
442    }
443
444    thread_local! {
445        static LATIN_EXPR: Lrc<FirstMatchOf> = Document::uncached_latin_expr();
446    }
447
448    fn uncached_latin_expr() -> Lrc<FirstMatchOf> {
449        Lrc::new(FirstMatchOf::new(vec![
450            Box::new(
451                SequenceExpr::default()
452                    .then(WordSet::new(&["etc", "vs"]))
453                    .then_period(),
454            ),
455            Box::new(
456                SequenceExpr::aco("et")
457                    .then_whitespace()
458                    .t_aco("al")
459                    .then_period(),
460            ),
461        ]))
462    }
463
464    /// Assumes that the first matched token is the canonical one to be condensed into.
465    /// Takes a callback that can be used to retroactively edit the canonical token afterwards.
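    ///
    /// For example, [`Self::condense_ellipsis`] condenses runs of period
    /// tokens and uses the `edit` callback to rewrite the surviving token's
    /// kind to an ellipsis.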
466    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
467    where
468        F: Fn(&mut Token),
469    {
470        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();
471
472        let mut remove_indices = VecDeque::with_capacity(matches.len());
473
474        for m in matches {
475            remove_indices.extend(m.start + 1..m.end);
476            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
477            edit(&mut self.tokens[m.start]);
478        }
479
480        self.tokens.remove_indices(remove_indices);
481    }
482
483    fn condense_latin(&mut self) {
484        self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
485    }
486
487    /// Searches for multiple sequential newline tokens and condenses them down
488    /// into one.
489    fn condense_newlines(&mut self) {
490        let mut cursor = 0;
491        let copy = self.tokens.clone();
492
493        let mut remove_these = VecDeque::new();
494
495        while cursor < self.tokens.len() {
496            // Locate a stretch of one or more newline tokens.
497            let start_tok = &mut self.tokens[cursor];
498
499            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
500                loop {
501                    cursor += 1;
502
503                    if cursor >= copy.len() {
504                        break;
505                    }
506
507                    let child_tok = &copy[cursor];
508                    if let TokenKind::Newline(n) = child_tok.kind {
509                        *start_count += n;
510                        start_tok.span.end = child_tok.span.end;
511                        remove_these.push_back(cursor);
512                        cursor += 1;
513                    } else {
514                        break;
515                    };
516                }
517            }
518
519            cursor += 1;
520        }
521
522        self.tokens.remove_indices(remove_these);
523    }
524
525    /// Condenses words like "i.e.", "e.g." and "N.S.A." down to single words
526    /// using a state machine.
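    ///
    /// For example, `"e.g."` lexes as four tokens (`e`, `.`, `g`, `.`) and is
    /// condensed into a single word token spanning all four characters.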
527    fn condense_dotted_initialisms(&mut self) {
528        if self.tokens.len() < 2 {
529            return;
530        }
531
532        let mut to_remove = VecDeque::new();
533
534        let mut cursor = 1;
535
536        let mut initialism_start = None;
537
538        loop {
539            let a = &self.tokens[cursor - 1];
540            let b = &self.tokens[cursor];
541
542            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();
543
544            if is_initialism_chunk {
545                if initialism_start.is_none() {
546                    initialism_start = Some(cursor - 1);
547                } else {
548                    to_remove.push_back(cursor - 1);
549                }
550
551                to_remove.push_back(cursor);
552                cursor += 1;
553            } else {
554                if let Some(start) = initialism_start {
555                    let end = self.tokens[cursor - 2].span.end;
556                    let start_tok: &mut Token = &mut self.tokens[start];
557                    start_tok.span.end = end;
558                }
559
560                initialism_start = None;
561            }
562
563            cursor += 1;
564
565            if cursor >= self.tokens.len() - 1 {
566                break;
567            }
568        }
569
570        self.tokens.remove_indices(to_remove);
571    }
572
573    /// Condenses likely filename extensions down to single tokens.
574    fn condense_filename_extensions(&mut self) {
575        if self.tokens.len() < 2 {
576            return;
577        }
578
579        let mut to_remove = VecDeque::new();
580
581        let mut cursor = 1;
582
583        let mut ext_start = None;
584
585        loop {
586            // left context, dot, extension, right context
587            let l = self.get_token_offset(cursor, -2);
588            let d = &self.tokens[cursor - 1];
589            let x = &self.tokens[cursor];
590            let r = self.get_token_offset(cursor, 1);
591
592            let is_ext_chunk = d.kind.is_period()
593                && x.kind.is_word()
594                && x.span.len() <= 3
595                && ((l.is_none_or(|t| t.kind.is_whitespace())
596                    && r.is_none_or(|t| t.kind.is_whitespace()))
597                    || (l.is_some_and(|t| t.kind.is_open_round())
598                        && r.is_some_and(|t| t.kind.is_close_round())))
599                && {
600                    let ext_chars = x.span.get_content(&self.source);
601                    ext_chars.iter().all(|c| c.is_ascii_lowercase())
602                        || ext_chars.iter().all(|c| c.is_ascii_uppercase())
603                };
604
605            if is_ext_chunk {
606                if ext_start.is_none() {
607                    ext_start = Some(cursor - 1);
608                    self.tokens[cursor - 1].kind = TokenKind::Unlintable;
609                } else {
610                    to_remove.push_back(cursor - 1);
611                }
612
613                to_remove.push_back(cursor);
614                cursor += 1;
615            } else {
616                if let Some(start) = ext_start {
617                    let end = self.tokens[cursor - 2].span.end;
618                    let start_tok: &mut Token = &mut self.tokens[start];
619                    start_tok.span.end = end;
620                }
621
622                ext_start = None;
623            }
624
625            cursor += 1;
626
627            if cursor >= self.tokens.len() {
628                break;
629            }
630        }
631
632        self.tokens.remove_indices(to_remove);
633    }
634
635    /// Condenses "tl;dr" down to a single word token.
636    fn condense_tldr(&mut self) {
637        if self.tokens.len() < 3 {
638            return;
639        }
640
641        let mut to_remove = VecDeque::new();
642        let mut cursor = 2;
643
644        loop {
645            let tl = &self.tokens[cursor - 2];
646        let semicolon = &self.tokens[cursor - 1];
647            let dr = &self.tokens[cursor];
648
649            let is_tldr_chunk = tl.kind.is_word()
650                && tl.span.len() == 2
651                && tl
652                    .span
653                    .get_content(&self.source)
654                    .eq_ignore_ascii_case_chars(&['t', 'l'])
655                && semicolon.kind.is_semicolon()
656                && dr.kind.is_word()
657                && dr.span.len() >= 2
658                && dr.span.len() <= 3
659                && dr
660                    .span
661                    .get_content(&self.source)
662                    .eq_any_ignore_ascii_case_chars(&[&['d', 'r'], &['d', 'r', 's']]);
663
664            if is_tldr_chunk {
665                // Update the first token to be the full "tl;dr" as a word
666                self.tokens[cursor - 2].span = Span::new(
667                    self.tokens[cursor - 2].span.start,
668                    self.tokens[cursor].span.end,
669                );
670
671                // Mark the semicolon and "dr" tokens for removal
672                to_remove.push_back(cursor - 1);
673                to_remove.push_back(cursor);
674            }
675
676            // Skip ahead since we've processed these tokens
677            cursor += 1;
678
679            if cursor >= self.tokens.len() {
680                break;
681            }
682        }
683
684        // Remove the marked tokens in reverse order to maintain correct indices
685        self.tokens.remove_indices(to_remove);
686    }
687
688    /// Condenses delimited pairs of single-letter word tokens into a single token.
689    ///
690    /// # Arguments
691    ///
692    /// * `is_delimiter` - A function that returns `true` if the token is a delimiter.
693    /// * `valid_pairs` - A slice of tuples representing the valid pairs of tokens to condense.
694    ///
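    /// For example, the ampersand variant below condenses `"R&D"` (three
    /// tokens) into a single word token, as exercised by the
    /// `condense_r_and_d_*` tests.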
695    fn condense_delimited_pairs<F>(&mut self, is_delimiter: F, valid_pairs: &[(char, char)])
696    where
697        F: Fn(&TokenKind) -> bool,
698    {
699        if self.tokens.len() < 3 {
700            return;
701        }
702
703        let mut to_remove = VecDeque::new();
704        let mut cursor = 2;
705
706        loop {
707            let l1 = &self.tokens[cursor - 2];
708            let delim = &self.tokens[cursor - 1];
709            let l2 = &self.tokens[cursor];
710
711            let is_delimited_chunk = l1.kind.is_word()
712                && l1.span.len() == 1
713                && is_delimiter(&delim.kind)
714                && l2.kind.is_word()
715                && l2.span.len() == 1;
716
717            if is_delimited_chunk {
718                let (l1, l2) = (
719                    l1.span.get_content(&self.source).first(),
720                    l2.span.get_content(&self.source).first(),
721                );
722
723                let is_valid_pair = match (l1, l2) {
724                    (Some(l1), Some(l2)) => {
725                        let pair = (l1.to_ascii_lowercase(), l2.to_ascii_lowercase());
726                        valid_pairs.contains(&pair)
727                    }
728                    _ => false,
729                };
730
731                if is_valid_pair {
732                    self.tokens[cursor - 2].span = Span::new(
733                        self.tokens[cursor - 2].span.start,
734                        self.tokens[cursor].span.end,
735                    );
736                    to_remove.push_back(cursor - 1);
737                    to_remove.push_back(cursor);
738                }
739            }
740
741            cursor += 1;
742            if cursor >= self.tokens.len() {
743                break;
744            }
745        }
746
747        self.tokens.remove_indices(to_remove);
748    }
749
750    /// Condenses "ampersand pairs" such as "R&D" or "Q&A" into single tokens.
751    fn condense_ampersand_pairs(&mut self) {
752        self.condense_delimited_pairs(
753            |kind| kind.is_ampersand(),
754            &[
755                ('b', 'b'), // bed & breakfast
756                ('b', 'w'), // black & white
757                ('g', 't'), // gin & tonic
758                ('k', 'r'), // Kernighan & Ritchie
759                ('q', 'a'), // question & answer
760                ('r', 'b'), // rhythm & blues
761                ('r', 'd'), // research & development
762                ('r', 'r'), // rest & relaxation
763                ('s', 'p'), // Standard & Poor's
764            ],
765        );
766    }
767
768    /// Condenses "slash pairs" such as "I/O" into single tokens.
769    fn condense_slash_pairs(&mut self) {
770        self.condense_delimited_pairs(
771            |kind| kind.is_slash(),
772            &[
773                ('a', 'c'), // aircon; alternating current
774                ('b', 'w'), // black and white
775                ('c', 'o'), // care of
776                ('d', 'c'), // direct current
777                ('d', 'l'), // download
778                ('i', 'o'), // input/output
779                ('j', 'k'), // just kidding
780                ('n', 'a'), // not applicable
781                ('r', 'c'), // radio control
782                ('s', 'n'), // serial number
783                ('y', 'n'), // yes/no
784                ('y', 'o'), // years old
785            ],
786        );
787    }
788
789    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
790        let period = SequenceExpr::default().then_period();
791        Lrc::new(Repeating::new(Box::new(period), 2))
792    }
793
794    thread_local! {
795        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
796    }
797
798    fn condense_ellipsis(&mut self) {
799        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
800        self.condense_expr(&expr, |tok| {
801            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
802        });
803    }
804
805    fn uncached_contraction_expr() -> Lrc<SequenceExpr> {
806        Lrc::new(
807            SequenceExpr::default()
808                .then_any_word()
809                .then_apostrophe()
810                .then_any_word(),
811        )
812    }
813
814    thread_local! {
815        static CONTRACTION_EXPR: Lrc<SequenceExpr> = Document::uncached_contraction_expr();
816    }
817
818    /// Searches for contractions and condenses them down into single
819    /// tokens.
820    fn condense_contractions(&mut self) {
821        let expr = Self::CONTRACTION_EXPR.with(|v| v.clone());
822
823        self.condense_expr(&expr, |_| {})
824    }
825}
826
827/// Creates functions necessary to implement [`TokenStringExt`] on a document.
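/// For example, `create_fns_on_doc!(word)` forwards `first_word`, `last_word`,
/// `last_word_index`, `iter_word_indices`, and `iter_words` to the underlying
/// token buffer.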
828macro_rules! create_fns_on_doc {
829    ($thing:ident) => {
830        paste! {
831            fn [< first_ $thing >](&self) -> Option<&Token> {
832                self.tokens.[< first_ $thing >]()
833            }
834
835            fn [< last_ $thing >](&self) -> Option<&Token> {
836                self.tokens.[< last_ $thing >]()
837            }
838
839            fn [< last_ $thing _index>](&self) -> Option<usize> {
840                self.tokens.[< last_ $thing _index >]()
841            }
842
843            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
844                self.tokens.[< iter_ $thing _indices >]()
845            }
846
847            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
848                self.tokens.[< iter_ $thing s >]()
849            }
850        }
851    };
852}
853
854impl TokenStringExt for Document {
855    create_fns_on_doc!(adjective);
856    create_fns_on_doc!(apostrophe);
857    create_fns_on_doc!(at);
858    create_fns_on_doc!(chunk_terminator);
859    create_fns_on_doc!(comma);
860    create_fns_on_doc!(conjunction);
861    create_fns_on_doc!(currency);
862    create_fns_on_doc!(ellipsis);
863    create_fns_on_doc!(hostname);
864    create_fns_on_doc!(likely_homograph);
865    create_fns_on_doc!(noun);
866    create_fns_on_doc!(number);
867    create_fns_on_doc!(paragraph_break);
868    create_fns_on_doc!(pipe);
869    create_fns_on_doc!(preposition);
870    create_fns_on_doc!(punctuation);
871    create_fns_on_doc!(quote);
872    create_fns_on_doc!(sentence_terminator);
873    create_fns_on_doc!(space);
874    create_fns_on_doc!(unlintable);
875    create_fns_on_doc!(verb);
876    create_fns_on_doc!(word);
877    create_fns_on_doc!(word_like);
878
879    fn first_sentence_word(&self) -> Option<&Token> {
880        self.tokens.first_sentence_word()
881    }
882
883    fn first_non_whitespace(&self) -> Option<&Token> {
884        self.tokens.first_non_whitespace()
885    }
886
887    fn span(&self) -> Option<Span<char>> {
888        self.tokens.span()
889    }
890
891    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
892        self.tokens.iter_linking_verb_indices()
893    }
894
895    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
896        self.tokens.iter_linking_verbs()
897    }
898
899    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
900        self.tokens.iter_chunks()
901    }
902
903    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
904        self.tokens.iter_paragraphs()
905    }
906
907    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
908        self.tokens.iter_sentences()
909    }
910
911    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
912        self.tokens.iter_sentences_mut()
913    }
914}
915
916impl Display for Document {
917    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
918        for token in &self.tokens {
919            write!(f, "{}", self.get_span_content_str(&token.span))?;
920        }
921
922        Ok(())
923    }
924}
925
926#[cfg(test)]
927mod tests {
928    use itertools::Itertools;
929
930    use super::Document;
931    use crate::{Span, parsers::MarkdownOptions};
932
933    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
934        let document = Document::new_plain_english_curated(text);
935
936        assert_eq!(document.tokens.len(), final_tok_count);
937
938        let document = Document::new_markdown_curated(text, MarkdownOptions::default());
939
940        assert_eq!(document.tokens.len(), final_tok_count);
941    }
942
943    #[test]
944    fn simple_contraction() {
945        assert_condensed_contractions("isn't", 1);
946    }
947
948    #[test]
949    fn simple_contraction2() {
950        assert_condensed_contractions("wasn't", 1);
951    }
952
953    #[test]
954    fn simple_contraction3() {
955        assert_condensed_contractions("There's", 1);
956    }
957
958    #[test]
959    fn medium_contraction() {
960        assert_condensed_contractions("isn't wasn't", 3);
961    }
962
963    #[test]
964    fn medium_contraction2() {
965        assert_condensed_contractions("There's no way", 5);
966    }
967
968    #[test]
969    fn selects_token_at_char_index() {
970        let text = "There were three little pigs. They built three little homes.";
971        let document = Document::new_plain_english_curated(text);
972
973        let got = document.get_token_at_char_index(19).unwrap();
974
975        assert!(got.kind.is_word());
976        assert_eq!(got.span, Span::new(17, 23));
977    }
978
979    fn assert_token_count(source: &str, count: usize) {
980        let document = Document::new_plain_english_curated(source);
981
982        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
983        assert_eq!(document.tokens.len(), count);
984    }
985
986    #[test]
987    fn condenses_number_suffixes() {
988        assert_token_count("1st", 1);
989        assert_token_count("This is the 2nd test", 9);
990        assert_token_count("This is the 3rd test", 9);
991        assert_token_count(
992            "It works even with weird capitalization like this: 600nD",
993            18,
994        );
995    }
996
997    #[test]
998    fn condenses_ie() {
999        assert_token_count("There is a thing (i.e. that one)", 15);
1000        assert_token_count("We are trying to condense \"i.e.\"", 13);
1001        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
1002    }
1003
1004    #[test]
1005    fn condenses_eg() {
1006        assert_token_count("We are trying to condense \"e.g.\"", 13);
1007        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
1008    }
1009
1010    #[test]
1011    fn condenses_nsa() {
1012        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
1013    }
1014
1015    #[test]
1016    fn parses_ellipsis() {
1017        assert_token_count("...", 1);
1018    }
1019
1020    #[test]
1021    fn parses_long_ellipsis() {
1022        assert_token_count(".....", 1);
1023    }
1024
1025    #[test]
1026    fn parses_short_ellipsis() {
1027        assert_token_count("..", 1);
1028    }
1029
1030    #[test]
1031    fn selects_token_at_offset() {
1032        let doc = Document::new_plain_english_curated("Foo bar baz");
1033
1034        let tok = doc.get_token_offset(1, -1).unwrap();
1035
1036        assert_eq!(tok.span, Span::new(0, 3));
1037    }
1038
1039    #[test]
1040    fn cant_select_token_before_start() {
1041        let doc = Document::new_plain_english_curated("Foo bar baz");
1042
1043        let tok = doc.get_token_offset(0, -1);
1044
1045        assert!(tok.is_none());
1046    }
1047
1048    #[test]
1049    fn select_next_word_pos_offset() {
1050        let doc = Document::new_plain_english_curated("Foo bar baz");
1051
1052        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
1053        let bar = doc.get_span_content(&bar.span);
1054        assert_eq!(bar, ['b', 'a', 'r']);
1055    }
1056
1057    #[test]
1058    fn select_next_word_neg_offset() {
1059        let doc = Document::new_plain_english_curated("Foo bar baz");
1060
1061        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
1062        let bar = doc.get_span_content(&bar.span);
1063        assert_eq!(bar, ['F', 'o', 'o']);
1064    }
1065
1066    #[test]
1067    fn cant_select_next_word_not_from_whitespace() {
1068        let doc = Document::new_plain_english_curated("Foo bar baz");
1069
1070        let tok = doc.get_next_word_from_offset(0, 2);
1071
1072        assert!(tok.is_none());
1073    }
1074
1075    #[test]
1076    fn cant_select_next_word_before_start() {
1077        let doc = Document::new_plain_english_curated("Foo bar baz");
1078
1079        let tok = doc.get_next_word_from_offset(0, -1);
1080
1081        assert!(tok.is_none());
1082    }
1083
1084    #[test]
1085    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
1086        let doc = Document::new_plain_english_curated("Foo, bar, baz");
1087
1088        let tok = doc.get_next_word_from_offset(0, 1);
1089
1090        assert!(tok.is_none());
1091    }
1092
1093    #[test]
1094    fn cant_select_next_word_with_punctuation_after_whitespace() {
1095        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");
1096
1097        let tok = doc.get_next_word_from_offset(0, 1);
1098
1099        assert!(tok.is_none());
1100    }
1101
1102    #[test]
1103    fn condenses_filename_extensions() {
1104        let doc = Document::new_plain_english_curated(".c and .exe and .js");
1105        assert!(doc.tokens[0].kind.is_unlintable());
1106        assert!(doc.tokens[4].kind.is_unlintable());
1107        assert!(doc.tokens[8].kind.is_unlintable());
1108    }
1109
1110    #[test]
1111    fn condense_filename_extension_ok_at_start_and_end() {
1112        let doc = Document::new_plain_english_curated(".c and .EXE");
1113        assert!(doc.tokens.len() == 5);
1114        assert!(doc.tokens[0].kind.is_unlintable());
1115        assert!(doc.tokens[4].kind.is_unlintable());
1116    }
1117
1118    #[test]
1119    fn doesnt_condense_filename_extensions_with_mixed_case() {
1120        let doc = Document::new_plain_english_curated(".c and .Exe");
1121        assert!(doc.tokens.len() == 6);
1122        assert!(doc.tokens[0].kind.is_unlintable());
1123        assert!(doc.tokens[4].kind.is_punctuation());
1124        assert!(doc.tokens[5].kind.is_word());
1125    }
1126
1127    #[test]
1128    fn doesnt_condense_filename_extensions_with_non_letters() {
1129        let doc = Document::new_plain_english_curated(".COM and .C0M");
1130        assert!(doc.tokens.len() == 6);
1131        assert!(doc.tokens[0].kind.is_unlintable());
1132        assert!(doc.tokens[4].kind.is_punctuation());
1133        assert!(doc.tokens[5].kind.is_word());
1134    }
1135
1136    #[test]
1137    fn doesnt_condense_filename_extensions_longer_than_three() {
1138        let doc = Document::new_plain_english_curated(".dll and .dlls");
1139        assert!(doc.tokens.len() == 6);
1140        assert!(doc.tokens[0].kind.is_unlintable());
1141        assert!(doc.tokens[4].kind.is_punctuation());
1142        assert!(doc.tokens[5].kind.is_word());
1143    }
1144
1145    #[test]
1146    fn condense_filename_extension_in_parens() {
1147        let doc = Document::new_plain_english_curated(
1148            "true for the manual installation when trying to run the executable(.exe) after a manual download",
1149        );
1150        assert!(doc.tokens.len() > 23);
1151        assert!(doc.tokens[21].kind.is_open_round());
1152        assert!(doc.tokens[22].kind.is_unlintable());
1153        assert!(doc.tokens[23].kind.is_close_round());
1154    }
1155
1156    #[test]
1157    fn condense_tldr_uppercase() {
1158        let doc = Document::new_plain_english_curated("TL;DR");
1159        assert!(doc.tokens.len() == 1);
1160        assert!(doc.tokens[0].kind.is_word());
1161        assert!(doc.tokens[0].span.len() == 5);
1162    }
1163
1164    #[test]
1165    fn condense_tldr_lowercase() {
1166        let doc = Document::new_plain_english_curated("tl;dr");
1167        assert!(doc.tokens.len() == 1);
1168        assert!(doc.tokens[0].kind.is_word());
1169    }
1170
1171    #[test]
1172    fn condense_tldr_mixed_case_1() {
1173        let doc = Document::new_plain_english_curated("tl;DR");
1174        assert!(doc.tokens.len() == 1);
1175        assert!(doc.tokens[0].kind.is_word());
1176    }
1177
1178    #[test]
1179    fn condense_tldr_mixed_case_2() {
1180        let doc = Document::new_plain_english_curated("TL;Dr");
1181        assert!(doc.tokens.len() == 1);
1182        assert!(doc.tokens[0].kind.is_word());
1183    }
1184
1185    #[test]
1186    fn condense_tldr_plural() {
1187        let doc = Document::new_plain_english_curated(
1188            "managing the flow between components to produce relevant TL;DRs of current news articles",
1189        );
1190        // No token is punctuation; there are only word tokens with whitespace between them.
1191        assert!(
1192            doc.tokens
1193                .iter()
1194                .all(|t| t.kind.is_word() || t.kind.is_whitespace())
1195        );
1196        // one of the word tokens contains a ';' character
1197        let tldrs = doc
1198            .tokens
1199            .iter()
1200            .filter(|t| t.span.get_content(&doc.source).contains(&';'))
1201            .collect_vec();
1202        assert!(tldrs.len() == 1);
1203        assert!(tldrs[0].span.get_content_string(&doc.source) == "TL;DRs");
1204    }
1205
1206    #[test]
1207    fn condense_r_and_d_caps() {
1208        let doc = Document::new_plain_english_curated("R&D");
1209        assert!(doc.tokens.len() == 1);
1210        assert!(doc.tokens[0].kind.is_word());
1211    }
1212
1213    #[test]
1214    fn condense_r_and_d_mixed_case() {
1215        let doc = Document::new_plain_english_curated("R&d");
1216        assert!(doc.tokens.len() == 1);
1217        assert!(doc.tokens[0].kind.is_word());
1218    }
1219
1220    #[test]
1221    fn condense_r_and_d_lowercase() {
1222        let doc = Document::new_plain_english_curated("r&d");
1223        assert!(doc.tokens.len() == 1);
1224        assert!(doc.tokens[0].kind.is_word());
1225    }
1226
1227    #[test]
1228    fn dont_condense_r_and_d_with_spaces() {
1229        let doc = Document::new_plain_english_curated("R & D");
1230        assert!(doc.tokens.len() == 5);
1231        assert!(doc.tokens[0].kind.is_word());
1232        assert!(doc.tokens[1].kind.is_whitespace());
1233        assert!(doc.tokens[2].kind.is_ampersand());
1234        assert!(doc.tokens[3].kind.is_whitespace());
1235        assert!(doc.tokens[4].kind.is_word());
1236    }
1237
1238    #[test]
1239    fn condense_q_and_a() {
1240        let doc =
1241            Document::new_plain_english_curated("A Q&A platform software for teams at any scales.");
1242        assert!(doc.tokens.len() >= 3);
1243        assert!(doc.tokens[2].kind.is_word());
1244        assert!(doc.tokens[2].span.get_content_string(&doc.source) == "Q&A");
1245    }
1246
1247    #[test]
1248    fn dont_allow_mixed_r_and_d_with_q_and_a() {
1249        let doc = Document::new_plain_english_curated("R&A or Q&D");
1250        assert!(doc.tokens.len() == 9);
1251        assert!(doc.tokens[1].kind.is_ampersand() || doc.tokens[7].kind.is_ampersand());
1252    }
1253
1254    #[test]
1255    fn condense_io() {
1256        let doc = Document::new_plain_english_curated("I/O");
1257        assert!(doc.tokens.len() == 1);
1258        assert!(doc.tokens[0].kind.is_word());
1259    }
1260}