harper_core/patterns/fixed_phrase.rs

use crate::{Document, Token, TokenKind};

use super::{Pattern, SequencePattern, Word};

/// Matches a fixed sequence of tokens as they appear in the input.
/// Matching is case-insensitive for words and exact for all other token kinds.
///
/// # Example
///
/// ```rust
/// use harper_core::patterns::{FixedPhrase, Pattern};
/// use harper_core::Document;
///
/// let doc = Document::new_plain_english_curated("Hello, world!");
/// let phrase = FixedPhrase::from_phrase("Hello, world!");
/// assert!(phrase.matches(doc.get_tokens(), doc.get_source()).is_some());
/// ```
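///
/// Word matching is case-insensitive, so a phrase built from one casing still
/// matches another. A small sketch reusing the same calls as above:
///
/// ```rust
/// use harper_core::patterns::{FixedPhrase, Pattern};
/// use harper_core::Document;
///
/// let phrase = FixedPhrase::from_phrase("hello world");
/// let doc = Document::new_plain_english_curated("HELLO WORLD");
/// assert!(phrase.matches(doc.get_tokens(), doc.get_source()).is_some());
/// ```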
pub struct FixedPhrase {
    inner: SequencePattern,
}

impl FixedPhrase {
    /// Creates a [FixedPhrase] from a plain text string.
    /// Uses plain English tokenization rules.
    pub fn from_phrase(text: &str) -> Self {
        let document = Document::new_plain_english_curated(text);
        Self::from_document(&document)
    }

    /// Creates a [FixedPhrase] from a pre-tokenized document.
    /// Allows custom tokenization by creating a `Document` first.
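    ///
    /// # Example
    ///
    /// A minimal sketch, assuming the same curated plain-English constructor
    /// used by [`Self::from_phrase`]; any other way of building a `Document`
    /// works the same way.
    ///
    /// ```rust
    /// use harper_core::patterns::{FixedPhrase, Pattern};
    /// use harper_core::Document;
    ///
    /// let needle = Document::new_plain_english_curated("for example");
    /// let phrase = FixedPhrase::from_document(&needle);
    ///
    /// let haystack = Document::new_plain_english_curated("For example, consider this.");
    /// assert!(phrase.matches(haystack.get_tokens(), haystack.get_source()).is_some());
    /// ```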
    pub fn from_document(doc: &Document) -> Self {
        let mut phrase = SequencePattern::default();

        // Translate each token of the phrase into the corresponding step of
        // the sequence pattern.
        for token in doc.fat_tokens() {
            match token.kind {
                TokenKind::Word(_word_metadata) => {
                    // Words match case-insensitively.
                    phrase = phrase.then(Word::from_chars(token.content.as_slice()));
                }
                TokenKind::Space(_) => {
                    phrase = phrase.then_whitespace();
                }
                TokenKind::Punctuation(p) => {
                    // Punctuation must match the same punctuation kind exactly.
                    phrase = phrase.then(move |t: &Token, _source: &[char]| {
                        t.kind.as_punctuation().cloned() == Some(p)
                    })
                }
                TokenKind::ParagraphBreak => {
                    // A paragraph break in the phrase is treated as ordinary whitespace.
                    phrase = phrase.then_whitespace();
                }
                TokenKind::Number(n) => {
                    // Numbers must be identical.
                    phrase = phrase
                        .then(move |tok: &Token, _source: &[char]| tok.kind == TokenKind::Number(n))
                }
                _ => panic!("Fell out of expected document formats."),
            }
        }

        Self { inner: phrase }
    }
}

impl Pattern for FixedPhrase {
    /// Matches this phrase against the start of the token slice.
    /// Returns `Some(len)` on match (number of tokens consumed), `None` otherwise.
    fn matches(&self, tokens: &[Token], source: &[char]) -> Option<usize> {
        self.inner.matches(tokens, source)
    }
}

#[cfg(test)]
mod tests {
    use crate::{
        Document,
        patterns::{FixedPhrase, Pattern},
    };

    #[test]
    fn test_not_case_sensitive() {
        // All three inputs tokenize to the same spans (word, whitespace, word),
        // so the tokens of one document can be checked against the source text
        // of another. Every casing combination should match all three tokens.
        let doc_lower = Document::new_plain_english_curated("hello world");
        let doc_upper = Document::new_plain_english_curated("HELLO WORLD");
        let doc_title = Document::new_plain_english_curated("Hello World");
        let phrase = FixedPhrase::from_document(&doc_lower);
        assert_eq!(
            phrase.matches(doc_lower.get_tokens(), doc_title.get_source()),
            Some(3)
        );
        assert_eq!(
            phrase.matches(doc_lower.get_tokens(), doc_upper.get_source()),
            Some(3)
        );
        assert_eq!(
            phrase.matches(doc_title.get_tokens(), doc_lower.get_source()),
            Some(3)
        );
        assert_eq!(
            phrase.matches(doc_title.get_tokens(), doc_upper.get_source()),
            Some(3)
        );
        assert_eq!(
            phrase.matches(doc_upper.get_tokens(), doc_lower.get_source()),
            Some(3)
        );
        assert_eq!(
            phrase.matches(doc_upper.get_tokens(), doc_title.get_source()),
            Some(3)
        );
    }
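
    #[test]
    fn matches_at_start_of_longer_document() {
        // A small sketch of the prefix-matching behavior documented on
        // `Pattern::matches`: the phrase consumes its own three tokens
        // (word, whitespace, word) and ignores whatever follows.
        let phrase = FixedPhrase::from_phrase("hello world");
        let doc = Document::new_plain_english_curated("hello world again");
        assert_eq!(phrase.matches(doc.get_tokens(), doc.get_source()), Some(3));
    }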
}