harper_core/expr/
mergeable_words.rs1use std::sync::Arc;
2
3use super::{Expr, SequenceExpr, SpaceOrHyphen};
4use crate::spell::{Dictionary, FstDictionary};
5use crate::{CharString, Span, Token, WordMetadata};
6
/// Signature of the decision callback stored by `MergeableWords`.
///
/// It is invoked with the dictionary metadata of the closed compound (the two
/// words joined with nothing in between) as the first argument and the metadata
/// of the open compound (the two words joined by a single space) as the second.
/// Either argument is `None` when the dictionary lookup returned nothing.
/// Returning `true` means the pair should be treated as mergeable.
type PredicateFn = dyn Fn(Option<&WordMetadata>, Option<&WordMetadata>) -> bool + Send + Sync;
8
/// An expression that matches two words separated by a space or hyphen when a
/// caller-supplied predicate, given the dictionary metadata of their closed and
/// open compound forms, says they can be merged.
pub struct MergeableWords {
    /// Token pattern built in `new`: any word, a `SpaceOrHyphen`, any word.
    inner: SequenceExpr,
    /// Dictionary used to look up both compound forms; `new` sets it to
    /// `FstDictionary::curated()`.
    dict: Arc<FstDictionary>,
    /// Callback that decides, from the two metadata lookups, whether to merge.
    predicate: Box<PredicateFn>,
}
18
19impl MergeableWords {
20 pub fn new(
21 predicate: impl Fn(Option<&WordMetadata>, Option<&WordMetadata>) -> bool + Send + Sync + 'static,
22 ) -> Self {
23 Self {
24 inner: SequenceExpr::default()
25 .then_any_word()
26 .then(SpaceOrHyphen)
27 .then_any_word(),
28 dict: FstDictionary::curated(),
29 predicate: Box::new(predicate),
30 }
31 }
32
33 pub fn get_merged_word(
36 &self,
37 word_a: &Token,
38 word_b: &Token,
39 source: &[char],
40 ) -> Option<CharString> {
41 let a_chars: CharString = word_a.span.get_content(source).into();
42 let b_chars: CharString = word_b.span.get_content(source).into();
43
44 let mut compound = a_chars.clone();
46 compound.push(' ');
47 compound.extend_from_slice(&b_chars);
48 let meta_open = self.dict.get_word_metadata(&compound);
49
50 compound.remove(a_chars.len());
52 let meta_closed = self.dict.get_word_metadata(&compound);
53
54 if (self.predicate)(meta_closed, meta_open) {
55 return Some(compound);
56 }
57
58 None
59 }
60}
61
62impl Expr for MergeableWords {
63 fn run(&self, cursor: usize, tokens: &[Token], source: &[char]) -> Option<Span<Token>> {
64 let inner_match = self.inner.run(cursor, tokens, source)?;
65
66 if inner_match.len() != 3 {
67 return None;
68 }
69
70 if self
71 .get_merged_word(&tokens[cursor], &tokens[cursor + 2], source)
72 .is_some()
73 {
74 return Some(inner_match);
75 }
76
77 None
78 }
79}
80
#[cfg(test)]
mod tests {
    use super::MergeableWords;
    use crate::{CharString, Document, WordMetadata};

    /// Accepts a merge only when the open form has no dictionary entry and the
    /// closed form is a noun that is not a proper noun.
    fn predicate(meta_closed: Option<&WordMetadata>, meta_open: Option<&WordMetadata>) -> bool {
        let closed_is_common_noun = meta_closed.is_some_and(|m| m.is_noun() && !m.is_proper_noun());
        meta_open.is_none() && closed_is_common_noun
    }

    /// Tokenizes `text` and tries to merge its first and third tokens (the two
    /// words on either side of the separator) using `predicate`.
    fn merge(text: &str) -> Option<CharString> {
        let doc = Document::new_plain_english_curated(text);
        let first = doc.tokens().next().unwrap();
        let third = doc.tokens().nth(2).unwrap();

        MergeableWords::new(predicate).get_merged_word(first, third, doc.get_source())
    }

    #[test]
    fn merges_open_compound_not_in_dict() {
        assert_eq!(merge("note book"), Some("notebook".chars().collect()));
    }

    #[test]
    fn does_not_merge_open_compound_in_dict() {
        assert_eq!(merge("straight away"), None);
    }

    #[test]
    fn does_not_merge_invalid_compound() {
        assert_eq!(merge("quick fox"), None);
    }

    #[test]
    fn merges_open_compound() {
        assert_eq!(merge("front line"), Some("frontline".chars().collect()));
    }

    #[test]
    fn merges_hyphenated_compound() {
        assert_eq!(merge("front-line"), Some("frontline".chars().collect()));
    }
}