Skip to main content

harper_core/expr/
mergeable_words.rs

1use std::sync::Arc;
2
3use super::{Expr, SequenceExpr};
4use crate::spell::{Dictionary, FstDictionary};
5use crate::{CharString, DictWordMetadata, Span, Token};
6
7type PredicateFn =
8    dyn Fn(Option<&DictWordMetadata>, Option<&DictWordMetadata>) -> bool + Send + Sync;
9
10/// An [`Expr`] that identifies adjacent words that could potentially be merged into a single word.
11///
12/// This checks if two adjacent words could form a valid compound word, but first verifies
13/// that the two words aren't already a valid entry in the dictionary (like "straight away").
14pub struct MergeableWords {
15    inner: SequenceExpr,
16    dict: Arc<FstDictionary>,
17    predicate: Box<PredicateFn>,
18}
19
20impl MergeableWords {
21    pub fn new(
22        predicate: impl Fn(Option<&DictWordMetadata>, Option<&DictWordMetadata>) -> bool
23        + Send
24        + Sync
25        + 'static,
26    ) -> Self {
27        Self {
28            inner: SequenceExpr::any_word().t_ws_h().then_any_word(),
29            dict: FstDictionary::curated(),
30            predicate: Box::new(predicate),
31        }
32    }
33
34    /// Get the merged word from the dictionary if these words can be merged.
35    /// Returns None if the words should remain separate (according to the predicate).
36    pub fn get_merged_word(
37        &self,
38        word_a: &Token,
39        word_b: &Token,
40        source: &[char],
41    ) -> Option<CharString> {
42        let a_chars: CharString = word_a.span.get_content(source).into();
43        let b_chars: CharString = word_b.span.get_content(source).into();
44
45        // First check if the open compound exists in the dictionary
46        let mut compound = a_chars.clone();
47        compound.push(' ');
48        compound.extend_from_slice(&b_chars);
49        let meta_open = self.dict.get_word_metadata(&compound);
50
51        // Then check if the closed compound exists in the dictionary
52        compound.remove(a_chars.len());
53        let meta_closed = self.dict.get_word_metadata(&compound);
54
55        if (self.predicate)(meta_closed.as_deref(), meta_open.as_deref()) {
56            return Some(compound);
57        }
58
59        None
60    }
61}
62
63impl Expr for MergeableWords {
64    fn run(&self, cursor: usize, tokens: &[Token], source: &[char]) -> Option<Span<Token>> {
65        let inner_match = self.inner.run(cursor, tokens, source)?;
66
67        if inner_match.len() != 3 {
68            return None;
69        }
70
71        if self
72            .get_merged_word(&tokens[cursor], &tokens[cursor + 2], source)
73            .is_some()
74        {
75            return Some(inner_match);
76        }
77
78        None
79    }
80}
81
#[cfg(test)]
mod tests {
    use super::MergeableWords;
    use crate::{DictWordMetadata, Document};

    /// Accepts a pair only when the open form is absent from the dictionary and the
    /// closed form is present as a noun that is not a proper noun.
    fn predicate(
        meta_closed: Option<&DictWordMetadata>,
        meta_open: Option<&DictWordMetadata>,
    ) -> bool {
        if meta_open.is_some() {
            return false;
        }

        meta_closed.is_some_and(|m| m.is_noun() && !m.is_proper_noun())
    }

    /// Parses `text`, takes its first and third tokens as the word pair, and checks
    /// that merging them yields `expected`.
    fn assert_merge(text: &str, expected: Option<&str>) {
        let doc = Document::new_plain_english_curated(text);
        let a = doc.tokens().next().unwrap();
        let b = doc.tokens().nth(2).unwrap();

        let merged = MergeableWords::new(predicate).get_merged_word(a, b, doc.get_source());

        assert_eq!(merged, expected.map(|word| word.chars().collect()));
    }

    #[test]
    fn merges_open_compound_not_in_dict() {
        // "note book" is not in the dictionary, but "notebook" is.
        assert_merge("note book", Some("notebook"));
    }

    #[test]
    fn does_not_merge_open_compound_in_dict() {
        // Both "straight away" and "straightaway" are in the dictionary.
        assert_merge("straight away", None);
    }

    #[test]
    fn does_not_merge_invalid_compound() {
        // Neither "quick fox" nor "quickfox" is in the dictionary.
        assert_merge("quick fox", None);
    }

    #[test]
    fn merges_open_compound() {
        // The dictionary has "frontline" but not "front line".
        assert_merge("front line", Some("frontline"));
    }

    #[test]
    fn merges_hyphenated_compound() {
        // "front-line" itself is never looked up, but the pair still matches and
        // "frontline" is in the dictionary.
        assert_merge("front-line", Some("frontline"));
    }
}