harper_core/document.rs

use std::cmp::Ordering;
use std::collections::VecDeque;
use std::fmt::Display;

use harper_brill::{Chunker, Tagger, brill_tagger, burn_chunker};
use paste::paste;

use crate::expr::{Expr, ExprExt, FirstMatchOf, Repeating, SequenceExpr};
use crate::parsers::{Markdown, MarkdownOptions, Parser, PlainEnglish};
use crate::patterns::WordSet;
use crate::punctuation::Punctuation;
use crate::spell::{Dictionary, FstDictionary};
use crate::vec_ext::VecExt;
use crate::{CharStringExt, FatStringToken, FatToken, Lrc, Token, TokenKind, TokenStringExt};
use crate::{OrdinalSuffix, Span};

/// A document containing some amount of lexed and parsed English text.
#[derive(Debug, Clone)]
pub struct Document {
    source: Lrc<Vec<char>>,
    tokens: Vec<Token>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new("", &PlainEnglish, &FstDictionary::curated())
    }
}

impl Document {
    /// Locate all the tokens that intersect a provided span.
    ///
    /// Desperately needs optimization.
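    ///
    /// An illustrative sketch (assumes `Document` and `Span` are re-exported at the
    /// crate root):
    ///
    /// ```
    /// use harper_core::{Document, Span};
    ///
    /// let doc = Document::new_plain_english_curated("Hello world.");
    /// // Indices of the tokens overlapping the first five characters ("Hello").
    /// let indices = doc.token_indices_intersecting(Span::new(0, 5));
    /// assert!(!indices.is_empty());
    /// ```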
    pub fn token_indices_intersecting(&self, span: Span<char>) -> Vec<usize> {
        self.tokens()
            .enumerate()
            .filter_map(|(idx, tok)| tok.span.overlaps_with(span).then_some(idx))
            .collect()
    }

    /// Locate all the tokens that intersect a provided span and convert them to [`FatToken`]s.
    ///
    /// Desperately needs optimization.
    pub fn fat_tokens_intersecting(&self, span: Span<char>) -> Vec<FatToken> {
        let indices = self.token_indices_intersecting(span);

        indices
            .into_iter()
            .map(|i| self.tokens[i].to_fat(&self.source))
            .collect()
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
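    ///
    /// A minimal usage sketch. The `use` paths below are assumptions about the crate's
    /// public re-exports; adjust them to wherever [`PlainEnglish`] and [`FstDictionary`]
    /// are actually exposed:
    ///
    /// ```
    /// use harper_core::Document;
    /// use harper_core::parsers::PlainEnglish;
    /// use harper_core::spell::FstDictionary;
    ///
    /// let doc = Document::new("Hello world.", &PlainEnglish, &FstDictionary::curated());
    /// assert!(!doc.get_tokens().is_empty());
    /// ```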
    pub fn new(text: &str, parser: &impl Parser, dictionary: &impl Dictionary) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, dictionary)
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and the included curated dictionary.
    pub fn new_curated(text: &str, parser: &impl Parser) -> Self {
        let source: Vec<_> = text.chars().collect();

        Self::new_from_vec(Lrc::new(source), parser, &FstDictionary::curated())
    }

    /// Lexes and parses text to produce a document using a provided language
    /// parser and dictionary.
    pub fn new_from_vec(
        source: Lrc<Vec<char>>,
        parser: &impl Parser,
        dictionary: &impl Dictionary,
    ) -> Self {
        let tokens = parser.parse(&source);

        let mut document = Self { source, tokens };
        document.parse(dictionary);

        document
    }

    /// Parse text to produce a document using the built-in [`PlainEnglish`]
    /// parser and curated dictionary.
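    ///
    /// An illustrative sketch (mirroring the contraction tests at the bottom of this
    /// file; assumes `Document` is re-exported at the crate root):
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// // Contractions such as "isn't" are condensed into a single token.
    /// let doc = Document::new_plain_english_curated("isn't");
    /// assert_eq!(doc.get_tokens().len(), 1);
    /// ```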
    pub fn new_plain_english_curated(text: &str) -> Self {
        Self::new(text, &PlainEnglish, &FstDictionary::curated())
    }

    /// Parse text to produce a document using the built-in [`PlainEnglish`]
    /// parser and a provided dictionary.
    pub fn new_plain_english(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new(text, &PlainEnglish, dictionary)
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and curated dictionary.
    pub fn new_markdown_curated(text: &str, markdown_options: MarkdownOptions) -> Self {
        Self::new(
            text,
            &Markdown::new(markdown_options),
            &FstDictionary::curated(),
        )
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and curated dictionary with the default Markdown configuration.
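    ///
    /// A brief sketch (assumes `Document` is re-exported at the crate root):
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_markdown_default_curated("*Hello* world.");
    /// assert!(!doc.get_tokens().is_empty());
    /// ```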
    pub fn new_markdown_default_curated(text: &str) -> Self {
        Self::new_markdown_curated(text, MarkdownOptions::default())
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and a provided dictionary.
    pub fn new_markdown(
        text: &str,
        markdown_options: MarkdownOptions,
        dictionary: &impl Dictionary,
    ) -> Self {
        Self::new(text, &Markdown::new(markdown_options), dictionary)
    }

    /// Parse text to produce a document using the built-in [`Markdown`] parser
    /// and a provided dictionary, with the default Markdown configuration.
    pub fn new_markdown_default(text: &str, dictionary: &impl Dictionary) -> Self {
        Self::new_markdown(text, MarkdownOptions::default(), dictionary)
    }

    /// Re-parse important language constructs.
    ///
    /// Should be run after every change to the underlying [`Self::source`].
    fn parse(&mut self, dictionary: &impl Dictionary) {
        self.condense_spaces();
        self.condense_newlines();
        self.newlines_to_breaks();
        self.condense_contractions();
        self.condense_dotted_initialisms();
        self.condense_number_suffixes();
        self.condense_ellipsis();
        self.condense_latin();
        self.condense_filename_extensions();
        self.condense_tldr();
        self.match_quotes();

        let chunker = burn_chunker();
        let tagger = brill_tagger();

        for sent in self.tokens.iter_sentences_mut() {
            let token_strings: Vec<_> = sent
                .iter()
                .filter(|t| !t.kind.is_whitespace())
                .map(|t| t.span.get_content_string(&self.source))
                .collect();

            let token_tags = tagger.tag_sentence(&token_strings);
            let np_flags = chunker.chunk_sentence(&token_strings, &token_tags);

            let mut i = 0;

            // Annotate word metadata
            for token in sent.iter_mut() {
                if let TokenKind::Word(meta) = &mut token.kind {
                    let word_source = token.span.get_content(&self.source);
                    let mut found_meta = dictionary.get_word_metadata(word_source).cloned();

                    if let Some(inner) = &mut found_meta {
                        inner.pos_tag = token_tags[i].or_else(|| inner.infer_pos_tag());
                        inner.np_member = Some(np_flags[i]);
                    }

                    *meta = found_meta;
                    i += 1;
                } else if !token.kind.is_whitespace() {
                    i += 1;
                }
            }
        }
    }

    /// Convert all runs of two or more newlines into paragraph breaks.
    fn newlines_to_breaks(&mut self) {
        for token in &mut self.tokens {
            if let TokenKind::Newline(n) = token.kind
                && n >= 2
            {
                token.kind = TokenKind::ParagraphBreak;
            }
        }
    }

    /// Given a list of indices, this function removes the subsequent
    /// `stretch_len - 1` elements after each index.
    ///
    /// Will extend token spans to include removed elements.
    /// Assumes condensed tokens are contiguous in source text.
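    ///
    /// For example, with `indices == [1]` and `stretch_len == 2`, the tokens
    /// `[a, b, c, d]` condense to `[a, b, d]`, with `b`'s span extended to cover `c`.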
    fn condense_indices(&mut self, indices: &[usize], stretch_len: usize) {
        // Update spans
        for idx in indices {
            let end_tok = self.tokens[idx + stretch_len - 1].clone();
            let start_tok = &mut self.tokens[*idx];

            start_tok.span.end = end_tok.span.end;
        }

        // Trim
        let old = self.tokens.clone();
        self.tokens.clear();

        // Keep first chunk.
        self.tokens
            .extend_from_slice(&old[0..indices.first().copied().unwrap_or(indices.len())]);

        let mut iter = indices.iter().peekable();

        while let (Some(a_idx), b) = (iter.next(), iter.peek()) {
            self.tokens.push(old[*a_idx].clone());

            if let Some(b_idx) = b {
                self.tokens
                    .extend_from_slice(&old[a_idx + stretch_len..**b_idx]);
            }
        }

        // Keep last chunk.
        self.tokens.extend_from_slice(
            &old[indices
                .last()
                .map(|v| v + stretch_len)
                .unwrap_or(indices.len())..],
        );
    }

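    /// Find the token whose span contains the provided character index, if any.
    ///
    /// An illustrative sketch (mirroring `selects_token_at_char_index` below; assumes
    /// `Document` is re-exported at the crate root):
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("There were three little pigs.");
    /// let tok = doc.get_token_at_char_index(19).unwrap();
    /// assert!(tok.kind.is_word());
    /// ```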
    pub fn get_token_at_char_index(&self, char_index: usize) -> Option<&Token> {
        let index = self
            .tokens
            .binary_search_by(|t| {
                if t.span.overlaps_with(Span::new_with_len(char_index, 1)) {
                    Ordering::Equal
                } else {
                    t.span.start.cmp(&char_index)
                }
            })
            .ok()?;

        Some(&self.tokens[index])
    }

    /// Defensively attempt to grab a specific token.
    pub fn get_token(&self, index: usize) -> Option<&Token> {
        self.tokens.get(index)
    }

    /// Get a token at a signed offset from a base index, or None if out of bounds.
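    ///
    /// A short sketch (mirroring `selects_token_at_offset` below; assumes `Document` and
    /// `Span` are re-exported at the crate root):
    ///
    /// ```
    /// use harper_core::{Document, Span};
    ///
    /// let doc = Document::new_plain_english_curated("Foo bar baz");
    /// // One token before the space token at index 1 is the word "Foo".
    /// let tok = doc.get_token_offset(1, -1).unwrap();
    /// assert_eq!(tok.span, Span::new(0, 3));
    /// ```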
    pub fn get_token_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        match base.checked_add_signed(offset) {
            None => None,
            Some(idx) => self.get_token(idx),
        }
    }

    /// Get an iterator over all the tokens contained in the document.
    pub fn tokens(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter()
    }

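    /// Iterate over the nominal (noun) phrases identified by the chunker, yielding each
    /// phrase as a slice of tokens with leading and trailing whitespace trimmed.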
    pub fn iter_nominal_phrases(&self) -> impl Iterator<Item = &[Token]> {
        fn is_np_member(t: &Token) -> bool {
            t.kind
                .as_word()
                .and_then(|x| x.as_ref())
                .and_then(|w| w.np_member)
                .unwrap_or(false)
        }

        fn trim(slice: &[Token]) -> &[Token] {
            let mut start = 0;
            let mut end = slice.len();
            while start < end && slice[start].kind.is_whitespace() {
                start += 1;
            }
            while end > start && slice[end - 1].kind.is_whitespace() {
                end -= 1;
            }
            &slice[start..end]
        }

        self.tokens
            .as_slice()
            .split(|t| !(is_np_member(t) || t.kind.is_whitespace()))
            .filter_map(|s| {
                let s = trim(s);
                if s.iter().any(is_np_member) {
                    Some(s)
                } else {
                    None
                }
            })
    }

    /// Get an iterator over all the tokens in the document, converted to [`FatToken`]s.
    pub fn fat_tokens(&self) -> impl Iterator<Item = FatToken> + '_ {
        self.tokens().map(|token| token.to_fat(&self.source))
    }

    /// Get the next or previous word token relative to a base index, if separated by whitespace.
    /// Returns None if the next/previous token is not a word or does not exist.
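    ///
    /// A short sketch (mirroring `select_next_word_pos_offset` below; assumes `Document`
    /// is re-exported at the crate root):
    ///
    /// ```
    /// use harper_core::Document;
    ///
    /// let doc = Document::new_plain_english_curated("Foo bar baz");
    /// // From token 0 ("Foo"), skip the space and land on "bar".
    /// let bar = doc.get_next_word_from_offset(0, 1).unwrap();
    /// assert_eq!(doc.get_span_content(&bar.span), ['b', 'a', 'r']);
    /// ```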
    pub fn get_next_word_from_offset(&self, base: usize, offset: isize) -> Option<&Token> {
        // Look for whitespace at the expected offset
        if !self.get_token_offset(base, offset)?.kind.is_whitespace() {
            return None;
        }
        // Now look beyond the whitespace for a word token
        let word_token = self.get_token_offset(base, offset + offset.signum());
        let word_token = word_token?;
        word_token.kind.is_word().then_some(word_token)
    }

    /// Get an iterator over all the tokens in the document, converted to [`FatStringToken`]s.
    pub fn fat_string_tokens(&self) -> impl Iterator<Item = FatStringToken> + '_ {
        self.fat_tokens().map(|t| t.into())
    }

    pub fn get_span_content(&self, span: &Span<char>) -> &[char] {
        span.get_content(&self.source)
    }

    pub fn get_span_content_str(&self, span: &Span<char>) -> String {
        String::from_iter(self.get_span_content(span))
    }

    pub fn get_full_string(&self) -> String {
        self.get_span_content_str(&Span::new(0, self.source.len()))
    }

    pub fn get_full_content(&self) -> &[char] {
        &self.source
    }

    pub fn get_source(&self) -> &[char] {
        &self.source
    }

    pub fn get_tokens(&self) -> &[Token] {
        &self.tokens
    }

    /// Searches for quotation marks and fills the
    /// [`Punctuation::Quote::twin_loc`] field. This is on a best-effort
    /// basis.
    ///
    /// Current algorithm is basic and could use some work.
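    ///
    /// Quotes are paired greedily in document order: the first with the second, the
    /// third with the fourth, and so on. A trailing unpaired quote is left without a
    /// twin.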
    fn match_quotes(&mut self) {
        let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();

        for i in 0..quote_indices.len() / 2 {
            let a_i = quote_indices[i * 2];
            let b_i = quote_indices[i * 2 + 1];

            {
                let a = self.tokens[a_i].kind.as_mut_quote().unwrap();
                a.twin_loc = Some(b_i);
            }

            {
                let b = self.tokens[b_i].kind.as_mut_quote().unwrap();
                b.twin_loc = Some(a_i);
            }
        }
    }

    /// Searches for number suffixes and condenses them down into single tokens
    fn condense_number_suffixes(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut replace_starts = Vec::new();

        for idx in 0..self.tokens.len() - 1 {
            let b = &self.tokens[idx + 1];
            let a = &self.tokens[idx];

            // TODO: Allow spaces between `a` and `b`

            if let (TokenKind::Number(..), TokenKind::Word(..)) = (&a.kind, &b.kind)
                && let Some(found_suffix) =
                    OrdinalSuffix::from_chars(self.get_span_content(&b.span))
            {
                self.tokens[idx].kind.as_mut_number().unwrap().suffix = Some(found_suffix);
                replace_starts.push(idx);
            }
        }

        self.condense_indices(&replace_starts, 2);
    }

    /// Searches for multiple sequential space tokens and condenses them down
    /// into one.
    fn condense_spaces(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more space tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Space(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];

                    // Only condense adjacent spans
                    if start_tok.span.end != child_tok.span.start {
                        break;
                    }

                    if let TokenKind::Space(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    thread_local! {
        static LATIN_EXPR: Lrc<FirstMatchOf> = Document::uncached_latin_expr();
    }

    fn uncached_latin_expr() -> Lrc<FirstMatchOf> {
        Lrc::new(FirstMatchOf::new(vec![
            Box::new(
                SequenceExpr::default()
                    .then(WordSet::new(&["etc", "vs"]))
                    .then_period(),
            ),
            Box::new(
                SequenceExpr::aco("et")
                    .then_whitespace()
                    .t_aco("al")
                    .then_period(),
            ),
        ]))
    }

    /// Assumes that the first matched token is the canonical one to be condensed into.
    /// Takes a callback that can be used to retroactively edit the canonical token afterwards.
    fn condense_expr<F>(&mut self, expr: &impl Expr, edit: F)
    where
        F: Fn(&mut Token),
    {
        let matches = expr.iter_matches_in_doc(self).collect::<Vec<_>>();

        let mut remove_indices = VecDeque::with_capacity(matches.len());

        for m in matches {
            remove_indices.extend(m.start + 1..m.end);
            self.tokens[m.start].span = self.tokens[m.into_iter()].span().unwrap();
            edit(&mut self.tokens[m.start]);
        }

        self.tokens.remove_indices(remove_indices);
    }

    fn condense_latin(&mut self) {
        self.condense_expr(&Self::LATIN_EXPR.with(|v| v.clone()), |_| {})
    }

    /// Searches for multiple sequential newline tokens and condenses them down
    /// into one.
    fn condense_newlines(&mut self) {
        let mut cursor = 0;
        let copy = self.tokens.clone();

        let mut remove_these = VecDeque::new();

        while cursor < self.tokens.len() {
            // Locate a stretch of one or more newline tokens.
            let start_tok = &mut self.tokens[cursor];

            if let TokenKind::Newline(start_count) = &mut start_tok.kind {
                loop {
                    cursor += 1;

                    if cursor >= copy.len() {
                        break;
                    }

                    let child_tok = &copy[cursor];
                    if let TokenKind::Newline(n) = child_tok.kind {
                        *start_count += n;
                        start_tok.span.end = child_tok.span.end;
                        remove_these.push_back(cursor);
                        cursor += 1;
                    } else {
                        break;
                    };
                }
            }

            cursor += 1;
        }

        self.tokens.remove_indices(remove_these);
    }

    /// Condenses words like "i.e.", "e.g." and "N.S.A." down to single words
    /// using a state machine.
    fn condense_dotted_initialisms(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut initialism_start = None;

        loop {
            let a = &self.tokens[cursor - 1];
            let b = &self.tokens[cursor];

            let is_initialism_chunk = a.kind.is_word() && a.span.len() == 1 && b.kind.is_period();

            if is_initialism_chunk {
                if initialism_start.is_none() {
                    initialism_start = Some(cursor - 1);
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = initialism_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                initialism_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() - 1 {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    /// Condenses likely filename extensions down to single tokens.
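    ///
    /// The heuristic: a period followed by a one-to-three-letter word in a single case
    /// (all lowercase or all uppercase), either bounded by whitespace/document edges or
    /// wrapped in round brackets. The condensed token is marked [`TokenKind::Unlintable`].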
    fn condense_filename_extensions(&mut self) {
        if self.tokens.len() < 2 {
            return;
        }

        let mut to_remove = VecDeque::new();

        let mut cursor = 1;

        let mut ext_start = None;

        loop {
            // left context, dot, extension, right context
            let l = self.get_token_offset(cursor, -2);
            let d = &self.tokens[cursor - 1];
            let x = &self.tokens[cursor];
            let r = self.get_token_offset(cursor, 1);

            let is_ext_chunk = d.kind.is_period()
                && x.kind.is_word()
                && x.span.len() <= 3
                && ((l.is_none_or(|t| t.kind.is_whitespace())
                    && r.is_none_or(|t| t.kind.is_whitespace()))
                    || (l.is_some_and(|t| t.kind.is_open_round())
                        && r.is_some_and(|t| t.kind.is_close_round())))
                && {
                    let ext_chars = x.span.get_content(&self.source);
                    ext_chars.iter().all(|c| c.is_ascii_lowercase())
                        || ext_chars.iter().all(|c| c.is_ascii_uppercase())
                };

            if is_ext_chunk {
                if ext_start.is_none() {
                    ext_start = Some(cursor - 1);
                    self.tokens[cursor - 1].kind = TokenKind::Unlintable;
                } else {
                    to_remove.push_back(cursor - 1);
                }

                to_remove.push_back(cursor);
                cursor += 1;
            } else {
                if let Some(start) = ext_start {
                    let end = self.tokens[cursor - 2].span.end;
                    let start_tok: &mut Token = &mut self.tokens[start];
                    start_tok.span.end = end;
                }

                ext_start = None;
            }

            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        self.tokens.remove_indices(to_remove);
    }

    /// Condenses "tl;dr" down to a single word token.
    fn condense_tldr(&mut self) {
        if self.tokens.len() < 3 {
            return;
        }

        let mut to_remove = VecDeque::new();
        let mut cursor = 2;

        loop {
            let tl = &self.tokens[cursor - 2];
            let semicolon = &self.tokens[cursor - 1];
            let dr = &self.tokens[cursor];

            let is_tldr_chunk = tl.kind.is_word()
                && tl.span.len() == 2
                && tl
                    .span
                    .get_content(&self.source)
                    .eq_ignore_ascii_case_chars(&['t', 'l'])
                && semicolon.kind.is_semicolon()
                && dr.kind.is_word()
                && dr.span.len() >= 2
                && dr.span.len() <= 3
                && dr
                    .span
                    .get_content(&self.source)
                    .eq_any_ignore_ascii_case_chars(&[&['d', 'r'], &['d', 'r', 's']]);

            if is_tldr_chunk {
                // Update the first token to be the full "tl;dr" as a word
                self.tokens[cursor - 2].span = Span::new(
                    self.tokens[cursor - 2].span.start,
                    self.tokens[cursor].span.end,
                );

                // Mark the semicolon and "dr" tokens for removal
                to_remove.push_back(cursor - 1);
                to_remove.push_back(cursor);
            }

            // Skip ahead since we've processed these tokens
            cursor += 1;

            if cursor >= self.tokens.len() {
                break;
            }
        }

        // Remove the marked tokens in reverse order to maintain correct indices
        self.tokens.remove_indices(to_remove);
    }

    fn uncached_ellipsis_pattern() -> Lrc<Repeating> {
        let period = SequenceExpr::default().then_period();
        Lrc::new(Repeating::new(Box::new(period), 2))
    }

    thread_local! {
        static ELLIPSIS_EXPR: Lrc<Repeating> = Document::uncached_ellipsis_pattern();
    }

    fn condense_ellipsis(&mut self) {
        let expr = Self::ELLIPSIS_EXPR.with(|v| v.clone());
        self.condense_expr(&expr, |tok| {
            tok.kind = TokenKind::Punctuation(Punctuation::Ellipsis)
        });
    }

    fn uncached_contraction_expr() -> Lrc<SequenceExpr> {
        Lrc::new(
            SequenceExpr::default()
                .then_any_word()
                .then_apostrophe()
                .then_any_word(),
        )
    }

    thread_local! {
        static CONTRACTION_EXPR: Lrc<SequenceExpr> = Document::uncached_contraction_expr();
    }

    /// Searches for contractions and condenses them down into single
    /// tokens.
    fn condense_contractions(&mut self) {
        let expr = Self::CONTRACTION_EXPR.with(|v| v.clone());

        self.condense_expr(&expr, |_| {})
    }
}

/// Creates the functions necessary to implement [`TokenStringExt`] on a document.
macro_rules! create_fns_on_doc {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< first_ $thing >]()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens.[< last_ $thing >]()
            }

            fn [< last_ $thing _index>](&self) -> Option<usize> {
                self.tokens.[< last_ $thing _index >]()
            }

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens.[< iter_ $thing _indices >]()
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens.[< iter_ $thing s >]()
            }
        }
    };
}

impl TokenStringExt for Document {
    create_fns_on_doc!(adjective);
    create_fns_on_doc!(apostrophe);
    create_fns_on_doc!(at);
    create_fns_on_doc!(chunk_terminator);
    create_fns_on_doc!(comma);
    create_fns_on_doc!(conjunction);
    create_fns_on_doc!(currency);
    create_fns_on_doc!(ellipsis);
    create_fns_on_doc!(hostname);
    create_fns_on_doc!(likely_homograph);
    create_fns_on_doc!(noun);
    create_fns_on_doc!(number);
    create_fns_on_doc!(paragraph_break);
    create_fns_on_doc!(pipe);
    create_fns_on_doc!(preposition);
    create_fns_on_doc!(punctuation);
    create_fns_on_doc!(quote);
    create_fns_on_doc!(sentence_terminator);
    create_fns_on_doc!(space);
    create_fns_on_doc!(unlintable);
    create_fns_on_doc!(verb);
    create_fns_on_doc!(word);
    create_fns_on_doc!(word_like);

    fn first_sentence_word(&self) -> Option<&Token> {
        self.tokens.first_sentence_word()
    }

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.tokens.first_non_whitespace()
    }

    fn span(&self) -> Option<Span<char>> {
        self.tokens.span()
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.tokens.iter_linking_verb_indices()
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.tokens.iter_linking_verbs()
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_chunks()
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_paragraphs()
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.tokens.iter_sentences()
    }

    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
        self.tokens.iter_sentences_mut()
    }
}


impl Display for Document {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for token in &self.tokens {
            write!(f, "{}", self.get_span_content_str(&token.span))?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;

    use super::Document;
    use crate::{Span, parsers::MarkdownOptions};

    fn assert_condensed_contractions(text: &str, final_tok_count: usize) {
        let document = Document::new_plain_english_curated(text);

        assert_eq!(document.tokens.len(), final_tok_count);

        let document = Document::new_markdown_curated(text, MarkdownOptions::default());

        assert_eq!(document.tokens.len(), final_tok_count);
    }

    #[test]
    fn simple_contraction() {
        assert_condensed_contractions("isn't", 1);
    }

    #[test]
    fn simple_contraction2() {
        assert_condensed_contractions("wasn't", 1);
    }

    #[test]
    fn simple_contraction3() {
        assert_condensed_contractions("There's", 1);
    }

    #[test]
    fn medium_contraction() {
        assert_condensed_contractions("isn't wasn't", 3);
    }

    #[test]
    fn medium_contraction2() {
        assert_condensed_contractions("There's no way", 5);
    }

    #[test]
    fn selects_token_at_char_index() {
        let text = "There were three little pigs. They built three little homes.";
        let document = Document::new_plain_english_curated(text);

        let got = document.get_token_at_char_index(19).unwrap();

        assert!(got.kind.is_word());
        assert_eq!(got.span, Span::new(17, 23));
    }

    fn assert_token_count(source: &str, count: usize) {
        let document = Document::new_plain_english_curated(source);

        dbg!(document.tokens().map(|t| t.kind.clone()).collect_vec());
        assert_eq!(document.tokens.len(), count);
    }

    #[test]
    fn condenses_number_suffixes() {
        assert_token_count("1st", 1);
        assert_token_count("This is the 2nd test", 9);
        assert_token_count("This is the 3rd test", 9);
        assert_token_count(
            "It works even with weird capitalization like this: 600nD",
            18,
        );
    }

    #[test]
    fn condenses_ie() {
        assert_token_count("There is a thing (i.e. that one)", 15);
        assert_token_count("We are trying to condense \"i.e.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_eg() {
        assert_token_count("We are trying to condense \"e.g.\"", 13);
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn condenses_nsa() {
        assert_token_count(r#"Condenses words like "i.e.", "e.g." and "N.S.A.""#, 20);
    }

    #[test]
    fn parses_ellipsis() {
        assert_token_count("...", 1);
    }

    #[test]
    fn parses_long_ellipsis() {
        assert_token_count(".....", 1);
    }

    #[test]
    fn parses_short_ellipsis() {
        assert_token_count("..", 1);
    }

    #[test]
    fn selects_token_at_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(1, -1).unwrap();

        assert_eq!(tok.span, Span::new(0, 3));
    }

    #[test]
    fn cant_select_token_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_token_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn select_next_word_pos_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(0, 1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['b', 'a', 'r']);
    }

    #[test]
    fn select_next_word_neg_offset() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let bar = doc.get_next_word_from_offset(2, -1).unwrap();
        let bar = doc.get_span_content(&bar.span);
        assert_eq!(bar, ['F', 'o', 'o']);
    }

    #[test]
    fn cant_select_next_word_not_from_whitespace() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, 2);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_before_start() {
        let doc = Document::new_plain_english_curated("Foo bar baz");

        let tok = doc.get_next_word_from_offset(0, -1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_instead_of_whitespace() {
        let doc = Document::new_plain_english_curated("Foo, bar, baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn cant_select_next_word_with_punctuation_after_whitespace() {
        let doc = Document::new_plain_english_curated("Foo \"bar\", baz");

        let tok = doc.get_next_word_from_offset(0, 1);

        assert!(tok.is_none());
    }

    #[test]
    fn condenses_filename_extensions() {
        let doc = Document::new_plain_english_curated(".c and .exe and .js");
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
        assert!(doc.tokens[8].kind.is_unlintable());
    }

    #[test]
    fn condense_filename_extension_ok_at_start_and_end() {
        let doc = Document::new_plain_english_curated(".c and .EXE");
        assert!(doc.tokens.len() == 5);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_unlintable());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_mixed_case() {
        let doc = Document::new_plain_english_curated(".c and .Exe");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_with_non_letters() {
        let doc = Document::new_plain_english_curated(".COM and .C0M");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn doesnt_condense_filename_extensions_longer_than_three() {
        let doc = Document::new_plain_english_curated(".dll and .dlls");
        assert!(doc.tokens.len() == 6);
        assert!(doc.tokens[0].kind.is_unlintable());
        assert!(doc.tokens[4].kind.is_punctuation());
        assert!(doc.tokens[5].kind.is_word());
    }

    #[test]
    fn condense_filename_extension_in_parens() {
        let doc = Document::new_plain_english_curated(
            "true for the manual installation when trying to run the executable(.exe) after a manual download",
        );
        assert!(doc.tokens.len() > 23);
        assert!(doc.tokens[21].kind.is_open_round());
        assert!(doc.tokens[22].kind.is_unlintable());
        assert!(doc.tokens[23].kind.is_close_round());
    }

    #[test]
    fn condense_tldr_uppercase() {
        let doc = Document::new_plain_english_curated("TL;DR");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
        assert!(doc.tokens[0].span.len() == 5);
    }

    #[test]
    fn condense_tldr_lowercase() {
        let doc = Document::new_plain_english_curated("tl;dr");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_1() {
        let doc = Document::new_plain_english_curated("tl;DR");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_mixed_case_2() {
        let doc = Document::new_plain_english_curated("TL;Dr");
        assert!(doc.tokens.len() == 1);
        assert!(doc.tokens[0].kind.is_word());
    }

    #[test]
    fn condense_tldr_plural() {
        let doc = Document::new_plain_english_curated(
            "managing the flow between components to produce relevant TL;DRs of current news articles",
        );
        // no token is a punctuation token - only words with whitespace between
        assert!(
            doc.tokens
                .iter()
                .all(|t| t.kind.is_word() || t.kind.is_whitespace())
        );
        // one of the word tokens contains a ';' character
        let tldrs = doc
            .tokens
            .iter()
            .filter(|t| t.span.get_content(&doc.source).contains(&';'))
            .collect_vec();
        assert!(tldrs.len() == 1);
        assert!(tldrs[0].span.get_content_string(&doc.source) == "TL;DRs");
    }
}