harper_core/expr/
mergeable_words.rs

1use std::sync::Arc;
2
3use super::{Expr, SequenceExpr, SpaceOrHyphen};
4use crate::spell::{Dictionary, FstDictionary};
5use crate::{CharString, DictWordMetadata, Span, Token};
6
/// Signature of the decision callback stored by [`MergeableWords`].
///
/// The first argument is the dictionary metadata of the closed (joined) compound and the
/// second is the metadata of the open (space-separated) compound. `None` is passed for a
/// form whose dictionary lookup returned nothing. Returning `true` approves the merge.
type PredicateFn =
    dyn Fn(Option<&DictWordMetadata>, Option<&DictWordMetadata>) -> bool + Send + Sync;
9
/// An [`Expr`] that identifies adjacent words that could potentially be merged into a single word.
///
/// It matches a word, a space or hyphen, and another word. Both the open form (the two words
/// joined by a space, like "straight away") and the closed form (the two words joined
/// directly) are looked up in the curated dictionary, and the supplied predicate decides from
/// the two lookup results whether the pair counts as mergeable.
pub struct MergeableWords {
    /// Token pattern: a word, then a space or hyphen, then a word.
    inner: SequenceExpr,
    /// Dictionary consulted for both the open and the closed compound.
    dict: Arc<FstDictionary>,
    /// Called with `(closed metadata, open metadata)`; returns `true` to approve the merge.
    predicate: Box<PredicateFn>,
}
19
20impl MergeableWords {
21    pub fn new(
22        predicate: impl Fn(Option<&DictWordMetadata>, Option<&DictWordMetadata>) -> bool
23        + Send
24        + Sync
25        + 'static,
26    ) -> Self {
27        Self {
28            inner: SequenceExpr::default()
29                .then_any_word()
30                .then(SpaceOrHyphen)
31                .then_any_word(),
32            dict: FstDictionary::curated(),
33            predicate: Box::new(predicate),
34        }
35    }
36
37    /// Get the merged word from the dictionary if these words can be merged.
38    /// Returns None if the words should remain separate (according to the predicate).
39    pub fn get_merged_word(
40        &self,
41        word_a: &Token,
42        word_b: &Token,
43        source: &[char],
44    ) -> Option<CharString> {
45        let a_chars: CharString = word_a.span.get_content(source).into();
46        let b_chars: CharString = word_b.span.get_content(source).into();
47
48        // First check if the open compound exists in the dictionary
49        let mut compound = a_chars.clone();
50        compound.push(' ');
51        compound.extend_from_slice(&b_chars);
52        let meta_open = self.dict.get_lexeme_metadata(&compound);
53
54        // Then check if the closed compound exists in the dictionary
55        compound.remove(a_chars.len());
56        let meta_closed = self.dict.get_lexeme_metadata(&compound);
57
58        if (self.predicate)(meta_closed.as_deref(), meta_open.as_deref()) {
59            return Some(compound);
60        }
61
62        None
63    }
64}
65
66impl Expr for MergeableWords {
67    fn run(&self, cursor: usize, tokens: &[Token], source: &[char]) -> Option<Span<Token>> {
68        let inner_match = self.inner.run(cursor, tokens, source)?;
69
70        if inner_match.len() != 3 {
71            return None;
72        }
73
74        if self
75            .get_merged_word(&tokens[cursor], &tokens[cursor + 2], source)
76            .is_some()
77        {
78            return Some(inner_match);
79        }
80
81        None
82    }
83}
84
#[cfg(test)]
mod tests {
    use super::MergeableWords;
    use crate::{DictWordMetadata, Document};

    /// Approves a merge only when the open form has no dictionary entry and the closed form
    /// is a noun that is not a proper noun.
    fn predicate(
        meta_closed: Option<&DictWordMetadata>,
        meta_open: Option<&DictWordMetadata>,
    ) -> bool {
        let Some(closed) = meta_closed else {
            return false;
        };

        meta_open.is_none() && closed.is_noun() && !closed.is_proper_noun()
    }

    /// Tokenizes `text`, takes tokens 0 and 2 as the two words, and asserts the result of
    /// `get_merged_word` for them.
    #[track_caller]
    fn assert_merge(text: &str, expected: Option<&str>) {
        let doc = Document::new_plain_english_curated(text);
        let mut tokens = doc.tokens();
        let first = tokens.next().unwrap();
        // One token was consumed above, so `nth(1)` skips the separator and yields token 2.
        let second = tokens.nth(1).unwrap();

        let merger = MergeableWords::new(predicate);
        let merged = merger.get_merged_word(first, second, doc.get_source());

        assert_eq!(merged, expected.map(|word| word.chars().collect()));
    }

    #[test]
    fn merges_open_compound_not_in_dict() {
        // "note book" is not in the dictionary, but "notebook" is.
        assert_merge("note book", Some("notebook"));
    }

    #[test]
    fn does_not_merge_open_compound_in_dict() {
        // "straight away" is in the dictionary (as is "straightaway"), so the open form
        // must be left alone.
        assert_merge("straight away", None);
    }

    #[test]
    fn does_not_merge_invalid_compound() {
        // Neither "quick fox" nor "quickfox" is in the dictionary.
        assert_merge("quick fox", None);
    }

    #[test]
    fn merges_open_compound() {
        // The dictionary has "frontline" but not "front line".
        assert_merge("front line", Some("frontline"));
    }

    #[test]
    fn merges_hyphenated_compound() {
        // "front-line" itself is never looked up, but it matches the pattern and
        // "frontline" is in the dictionary.
        assert_merge("front-line", Some("frontline"));
    }
}