harper_core/patterns/
similar_to_phrase.rs

1use crate::{Document, Token, TokenKind};
2
3use super::{Pattern, SequencePattern, Word, within_edit_distance::WithinEditDistance};
4
5pub struct SimilarToPhrase {
6    phrase: SequencePattern,
7    fuzzy_phrase: SequencePattern,
8}
9
10impl SimilarToPhrase {
11    /// Create an error-tolerant SequencePattern that looks for phrases similar to (but not the same as) that contained
12    /// in the provided text.
13    ///
14    /// This is an expensive operation, so try to only do it at startup and in tests.
15    ///
16    /// It will panic if your document is too complex, so only run this with curated phrases.
17    pub fn from_phrase(text: &str, max_edit_dist: u8) -> Self {
18        let document = Document::new_plain_english_curated(text);
19
20        Self::from_doc(&document, max_edit_dist)
21    }
22
23    /// Create an error-tolerant SequencePattern that looks for phrases similar to (but not the same as) that contained
24    /// in the provided document.
25    ///
26    /// This is an expensive operation, so try to only do it at startup and in tests.
27    ///
28    /// It will panic if your document contains certain token types, so only run this with curated phrases.
29    pub fn from_doc(document: &Document, max_edit_dist: u8) -> Self {
30        let mut phrase = SequencePattern::default();
31        let mut fuzzy_phrase = SequencePattern::default();
32
33        for token in document.fat_tokens() {
34            match token.kind {
35                TokenKind::Word(_word_metadata) => {
36                    phrase = phrase.then(Word::from_chars(token.content.as_slice()));
37                    fuzzy_phrase = fuzzy_phrase
38                        .then(WithinEditDistance::new(token.content.into(), max_edit_dist));
39                }
40                TokenKind::Space(_) => {
41                    fuzzy_phrase = fuzzy_phrase.then_whitespace();
42                    phrase = phrase.then_whitespace();
43                }
44                TokenKind::ParagraphBreak => {
45                    fuzzy_phrase = fuzzy_phrase.then_whitespace();
46                    phrase = phrase.then_whitespace();
47                }
48                _ => panic!("Fell out of expected document formats."),
49            }
50        }
51
52        Self {
53            phrase,
54            fuzzy_phrase,
55        }
56    }
57}
58
59impl Pattern for SimilarToPhrase {
60    fn matches(&self, tokens: &[Token], source: &[char]) -> Option<usize> {
61        if self.phrase.matches(tokens, source).is_some() {
62            return None;
63        }
64        self.fuzzy_phrase.matches(tokens, source)
65    }
66}