harper_core/expr/
mergeable_words.rs1use std::sync::Arc;
2
3use super::{Expr, SequenceExpr, SpaceOrHyphen};
4use crate::spell::{Dictionary, FstDictionary};
5use crate::{CharString, DictWordMetadata, Span, Token};
6
/// Signature of the decision callback stored by `MergeableWords`.
///
/// It is invoked with the dictionary metadata of the closed compound (the two
/// words joined with nothing in between) as the first argument and the metadata
/// of the open compound (the two words joined by a single space) as the second.
/// Either argument is `None` when the dictionary lookup for that form yields
/// nothing. Returning `true` means the closed compound should be produced.
7type PredicateFn =
8 dyn Fn(Option<&DictWordMetadata>, Option<&DictWordMetadata>) -> bool + Send + Sync;
9
/// An expression that matches a word, a space-or-hyphen separator, and a second
/// word, but only when a caller-supplied predicate accepts the pair.
///
/// The predicate is shown the dictionary metadata for the closed form of the
/// pair (both words concatenated) and for the open form (both words joined by a
/// space), and decides whether the pair counts as mergeable.
10pub struct MergeableWords {
    /// Structural pattern: any word, then `SpaceOrHyphen`, then any word.
15 inner: SequenceExpr,
    /// Dictionary consulted for the metadata of the open and closed forms.
16 dict: Arc<FstDictionary>,
    /// Caller-supplied decision function; called as `(closed, open)`.
17 predicate: Box<PredicateFn>,
18}
19
20impl MergeableWords {
21 pub fn new(
22 predicate: impl Fn(Option<&DictWordMetadata>, Option<&DictWordMetadata>) -> bool
23 + Send
24 + Sync
25 + 'static,
26 ) -> Self {
27 Self {
28 inner: SequenceExpr::default()
29 .then_any_word()
30 .then(SpaceOrHyphen)
31 .then_any_word(),
32 dict: FstDictionary::curated(),
33 predicate: Box::new(predicate),
34 }
35 }
36
37 pub fn get_merged_word(
40 &self,
41 word_a: &Token,
42 word_b: &Token,
43 source: &[char],
44 ) -> Option<CharString> {
45 let a_chars: CharString = word_a.span.get_content(source).into();
46 let b_chars: CharString = word_b.span.get_content(source).into();
47
48 let mut compound = a_chars.clone();
50 compound.push(' ');
51 compound.extend_from_slice(&b_chars);
52 let meta_open = self.dict.get_lexeme_metadata(&compound);
53
54 compound.remove(a_chars.len());
56 let meta_closed = self.dict.get_lexeme_metadata(&compound);
57
58 if (self.predicate)(meta_closed.as_deref(), meta_open.as_deref()) {
59 return Some(compound);
60 }
61
62 None
63 }
64}
65
66impl Expr for MergeableWords {
67 fn run(&self, cursor: usize, tokens: &[Token], source: &[char]) -> Option<Span<Token>> {
68 let inner_match = self.inner.run(cursor, tokens, source)?;
69
70 if inner_match.len() != 3 {
71 return None;
72 }
73
74 if self
75 .get_merged_word(&tokens[cursor], &tokens[cursor + 2], source)
76 .is_some()
77 {
78 return Some(inner_match);
79 }
80
81 None
82 }
83}
84
#[cfg(test)]
mod tests {
    use super::MergeableWords;
    use crate::{DictWordMetadata, Document};

    /// Accepts a pair only when the open form is absent from the dictionary and
    /// the closed form is a noun that is not a proper noun.
    fn closed_common_noun_only(
        closed: Option<&DictWordMetadata>,
        open: Option<&DictWordMetadata>,
    ) -> bool {
        match (closed, open) {
            (Some(closed), None) => closed.is_noun() && !closed.is_proper_noun(),
            _ => false,
        }
    }

    /// Parses `text`, takes its first and third tokens (the two words around the
    /// separator) and returns the merged word as a `String`, if there is one.
    fn merge(text: &str) -> Option<String> {
        let doc = Document::new_plain_english_curated(text);

        let mut tokens = doc.tokens();
        let first = tokens.next().unwrap();
        // One token (the separator) is skipped to reach the third token overall.
        let second = tokens.nth(1).unwrap();

        let merged = MergeableWords::new(closed_common_noun_only)
            .get_merged_word(first, second, doc.get_source())
            .map(|chars| chars.iter().collect::<String>());
        merged
    }

    #[test]
    fn merges_open_compound_not_in_dict() {
        assert_eq!(merge("note book").as_deref(), Some("notebook"));
    }

    #[test]
    fn does_not_merge_open_compound_in_dict() {
        assert_eq!(merge("straight away"), None);
    }

    #[test]
    fn does_not_merge_invalid_compound() {
        assert_eq!(merge("quick fox"), None);
    }

    #[test]
    fn merges_open_compound() {
        assert_eq!(merge("front line").as_deref(), Some("frontline"));
    }

    #[test]
    fn merges_hyphenated_compound() {
        assert_eq!(merge("front-line").as_deref(), Some("frontline"));
    }
}