harper_core/expr/
mergeable_words.rs1use std::sync::Arc;
2
3use super::{Expr, SequenceExpr};
4use crate::spell::{Dictionary, FstDictionary};
5use crate::{CharString, DictWordMetadata, Span, Token};
6
7type PredicateFn =
8 dyn Fn(Option<&DictWordMetadata>, Option<&DictWordMetadata>) -> bool + Send + Sync;
9
10pub struct MergeableWords {
15 inner: SequenceExpr,
16 dict: Arc<FstDictionary>,
17 predicate: Box<PredicateFn>,
18}
19
20impl MergeableWords {
21 pub fn new(
22 predicate: impl Fn(Option<&DictWordMetadata>, Option<&DictWordMetadata>) -> bool
23 + Send
24 + Sync
25 + 'static,
26 ) -> Self {
27 Self {
28 inner: SequenceExpr::any_word().t_ws_h().then_any_word(),
29 dict: FstDictionary::curated(),
30 predicate: Box::new(predicate),
31 }
32 }
33
34 pub fn get_merged_word(
37 &self,
38 word_a: &Token,
39 word_b: &Token,
40 source: &[char],
41 ) -> Option<CharString> {
42 let a_chars: CharString = word_a.span.get_content(source).into();
43 let b_chars: CharString = word_b.span.get_content(source).into();
44
45 let mut compound = a_chars.clone();
47 compound.push(' ');
48 compound.extend_from_slice(&b_chars);
49 let meta_open = self.dict.get_word_metadata(&compound);
50
51 compound.remove(a_chars.len());
53 let meta_closed = self.dict.get_word_metadata(&compound);
54
55 if (self.predicate)(meta_closed.as_deref(), meta_open.as_deref()) {
56 return Some(compound);
57 }
58
59 None
60 }
61}
62
63impl Expr for MergeableWords {
64 fn run(&self, cursor: usize, tokens: &[Token], source: &[char]) -> Option<Span<Token>> {
65 let inner_match = self.inner.run(cursor, tokens, source)?;
66
67 if inner_match.len() != 3 {
68 return None;
69 }
70
71 if self
72 .get_merged_word(&tokens[cursor], &tokens[cursor + 2], source)
73 .is_some()
74 {
75 return Some(inner_match);
76 }
77
78 None
79 }
80}
81
#[cfg(test)]
mod tests {
    use super::MergeableWords;
    use crate::{CharString, DictWordMetadata, Document};

    /// Accepts a pair only when the open spelling has no dictionary metadata
    /// and the closed spelling is a noun that is not a proper noun.
    fn predicate(
        meta_closed: Option<&DictWordMetadata>,
        meta_open: Option<&DictWordMetadata>,
    ) -> bool {
        if meta_open.is_some() {
            return false;
        }

        match meta_closed {
            Some(meta) => meta.is_noun() && !meta.is_proper_noun(),
            None => false,
        }
    }

    /// Parses `text` and runs `get_merged_word` on its first and third
    /// tokens (the two words on either side of the separator).
    fn merge_outer_tokens(text: &str) -> Option<CharString> {
        let doc = Document::new_plain_english_curated(text);
        let first = doc.tokens().next().unwrap();
        let third = doc.tokens().nth(2).unwrap();

        MergeableWords::new(predicate).get_merged_word(first, third, doc.get_source())
    }

    #[test]
    fn merges_open_compound_not_in_dict() {
        assert_eq!(
            merge_outer_tokens("note book"),
            Some("notebook".chars().collect())
        );
    }

    #[test]
    fn does_not_merge_open_compound_in_dict() {
        assert_eq!(merge_outer_tokens("straight away"), None);
    }

    #[test]
    fn does_not_merge_invalid_compound() {
        assert_eq!(merge_outer_tokens("quick fox"), None);
    }

    #[test]
    fn merges_open_compound() {
        assert_eq!(
            merge_outer_tokens("front line"),
            Some("frontline".chars().collect())
        );
    }

    #[test]
    fn merges_hyphenated_compound() {
        assert_eq!(
            merge_outer_tokens("front-line"),
            Some("frontline".chars().collect())
        );
    }
}