harper_core/expr/
mergeable_words.rs

1use std::sync::Arc;
2
3use super::{Expr, SequenceExpr, SpaceOrHyphen};
4use crate::spell::{Dictionary, FstDictionary};
5use crate::{CharString, Span, Token, WordMetadata};
6
7type PredicateFn = dyn Fn(Option<&WordMetadata>, Option<&WordMetadata>) -> bool + Send + Sync;
8
9/// A [`Expr`] that identifies adjacent words that could potentially be merged into a single word.
10///
11/// This checks if two adjacent words could form a valid compound word, but first verifies
12/// that the two words aren't already a valid lexeme in the dictionary (like "straight away").
13pub struct MergeableWords {
14    inner: SequenceExpr,
15    dict: Arc<FstDictionary>,
16    predicate: Box<PredicateFn>,
17}
18
19impl MergeableWords {
20    pub fn new(
21        predicate: impl Fn(Option<&WordMetadata>, Option<&WordMetadata>) -> bool + Send + Sync + 'static,
22    ) -> Self {
23        Self {
24            inner: SequenceExpr::default()
25                .then_any_word()
26                .then(SpaceOrHyphen)
27                .then_any_word(),
28            dict: FstDictionary::curated(),
29            predicate: Box::new(predicate),
30        }
31    }
32
33    /// Get the merged word from the dictionary if these words can be merged.
34    /// Returns None if the words should remain separate (according to the predicate).
35    pub fn get_merged_word(
36        &self,
37        word_a: &Token,
38        word_b: &Token,
39        source: &[char],
40    ) -> Option<CharString> {
41        let a_chars: CharString = word_a.span.get_content(source).into();
42        let b_chars: CharString = word_b.span.get_content(source).into();
43
44        // First check if the open compound exists in the dictionary
45        let mut compound = a_chars.clone();
46        compound.push(' ');
47        compound.extend_from_slice(&b_chars);
48        let meta_open = self.dict.get_word_metadata(&compound);
49
50        // Then check if the closed compound exists in the dictionary
51        compound.remove(a_chars.len());
52        let meta_closed = self.dict.get_word_metadata(&compound);
53
54        if (self.predicate)(meta_closed, meta_open) {
55            return Some(compound);
56        }
57
58        None
59    }
60}
61
62impl Expr for MergeableWords {
63    fn run(&self, cursor: usize, tokens: &[Token], source: &[char]) -> Option<Span<Token>> {
64        let inner_match = self.inner.run(cursor, tokens, source)?;
65
66        if inner_match.len() != 3 {
67            return None;
68        }
69
70        if self
71            .get_merged_word(&tokens[cursor], &tokens[cursor + 2], source)
72            .is_some()
73        {
74            return Some(inner_match);
75        }
76
77        None
78    }
79}
80
#[cfg(test)]
mod tests {
    use super::MergeableWords;
    use crate::{Document, WordMetadata};

    /// Merge only when the open compound is missing from the dictionary and the closed
    /// compound is present as a noun that is not a proper noun.
    fn predicate(meta_closed: Option<&WordMetadata>, meta_open: Option<&WordMetadata>) -> bool {
        if meta_open.is_some() {
            return false;
        }

        match meta_closed {
            Some(meta) => meta.is_noun() && !meta.is_proper_noun(),
            None => false,
        }
    }

    /// Tokenizes `text`, passes its first and third tokens (the words on either side of the
    /// separator) to `get_merged_word`, and asserts that the result equals `expected`.
    #[track_caller]
    fn assert_merged(text: &str, expected: Option<&str>) {
        let doc = Document::new_plain_english_curated(text);
        let mut tokens = doc.tokens();
        let first = tokens.next().unwrap();
        // One token (the separator) is skipped to reach the second word.
        let second = tokens.nth(1).unwrap();

        let merged =
            MergeableWords::new(predicate).get_merged_word(first, second, doc.get_source());

        assert_eq!(merged, expected.map(|word| word.chars().collect()));
    }

    #[test]
    fn merges_open_compound_not_in_dict() {
        // "note book" is not in the dictionary, but "notebook" is.
        assert_merged("note book", Some("notebook"));
    }

    #[test]
    fn does_not_merge_open_compound_in_dict() {
        // "straight away" is in the dictionary, and so is "straightaway".
        assert_merged("straight away", None);
    }

    #[test]
    fn does_not_merge_invalid_compound() {
        // Neither "quick fox" nor "quickfox" is in the dictionary.
        assert_merged("quick fox", None);
    }

    #[test]
    fn merges_open_compound() {
        // The dictionary has "frontline" but not "front line".
        assert_merged("front line", Some("frontline"));
    }

    #[test]
    fn merges_hyphenated_compound() {
        // "front-line" itself is never looked up; the hyphenated pair still matches, and
        // "frontline" is in the dictionary.
        assert_merged("front-line", Some("frontline"));
    }
}