1use std::collections::{HashMap, HashSet};
36use std::fs::File;
37use std::io::{BufRead, BufReader};
38use std::path::{Path, PathBuf};
39
40use std::borrow::Cow;
41
42use anyhow::{Context, Result};
43use wordnet_types::Pos;
44
/// Records which lookup stage produced a lemma candidate.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CandidateSource {
    /// The normalized surface form was itself accepted as a lemma.
    Surface,
    /// The lemma was listed for the surface form in an exception table.
    Exception,
    /// The lemma was derived by detaching `suffix` and appending `replacement`.
    Rule {
        /// Ending that was stripped from the surface form.
        suffix: &'static str,
        /// Text appended in place of the stripped ending (may be empty).
        replacement: &'static str,
    },
}
55
56#[derive(Clone, Debug, Eq, PartialEq)]
58pub struct LemmaCandidate<'a> {
59 pub pos: Pos,
60 pub lemma: Cow<'a, str>,
61 pub source: CandidateSource,
62}
63
/// Morphological analyzer that maps inflected surface forms to lemma
/// candidates using per-part-of-speech exception tables and suffix rules.
pub struct Morphy {
    /// For each part of speech: normalized surface form -> lemmas listed for
    /// it in the corresponding exception file.
    exceptions: HashMap<Pos, HashMap<String, Vec<String>>>,
}
68
69impl Morphy {
70 pub fn load(dict_dir: impl AsRef<Path>) -> Result<Self> {
74 let dir = dict_dir.as_ref();
75 Ok(Self {
76 exceptions: HashMap::from([
77 (Pos::Noun, load_exc(dir.join("noun.exc"))?),
78 (Pos::Verb, load_exc(dir.join("verb.exc"))?),
79 (Pos::Adj, load_exc(dir.join("adj.exc"))?),
80 (Pos::Adv, load_exc(dir.join("adv.exc"))?),
81 ]),
82 })
83 }
84
85 pub fn lemmas_for<'a, F>(
90 &'a self,
91 pos: Pos,
92 surface: &str,
93 lemma_exists: F,
94 ) -> Vec<LemmaCandidate<'a>>
95 where
96 F: Fn(Pos, &str) -> bool,
97 {
98 let mut seen: HashSet<Cow<'a, str>> = HashSet::new();
99 let mut out: Vec<LemmaCandidate<'a>> = Vec::new();
100 let norm_surface = normalize(surface);
101
102 if lemma_exists(pos, &norm_surface) {
104 push_unique(
105 &mut out,
106 &mut seen,
107 LemmaCandidate {
108 pos,
109 lemma: Cow::Owned(norm_surface.clone()),
110 source: CandidateSource::Surface,
111 },
112 );
113 }
114
115 if let Some(exc_map) = self.exceptions.get(&pos)
117 && let Some(entries) = exc_map.get(&norm_surface)
118 {
119 for lemma in entries {
120 if lemma_exists(pos, lemma) {
121 push_unique(
122 &mut out,
123 &mut seen,
124 LemmaCandidate {
125 pos,
126 lemma: Cow::Borrowed(lemma.as_str()),
127 source: CandidateSource::Exception,
128 },
129 );
130 }
131 }
132 }
133
134 for (suffix, replacement) in rules_for(pos) {
136 if let Some(candidate) = apply_rule(&norm_surface, suffix, replacement)
137 && lemma_exists(pos, &candidate)
138 {
139 push_unique(
140 &mut out,
141 &mut seen,
142 LemmaCandidate {
143 pos,
144 lemma: Cow::Owned(candidate),
145 source: CandidateSource::Rule {
146 suffix,
147 replacement,
148 },
149 },
150 );
151 }
152 }
153
154 out
155 }
156}
157
158fn load_exc(path: PathBuf) -> Result<HashMap<String, Vec<String>>> {
159 if !path.exists() {
160 return Ok(HashMap::new());
161 }
162 let file =
163 File::open(&path).with_context(|| format!("open exception file {}", path.display()))?;
164 let reader = BufReader::new(file);
165 let mut map = HashMap::new();
166 for (lineno, line) in reader.lines().enumerate() {
167 let line =
168 line.with_context(|| format!("read line {} in {}", lineno + 1, path.display()))?;
169 let mut parts = line.split_whitespace();
170 let surface = match parts.next() {
171 Some(s) => normalize(s),
172 None => continue,
173 };
174 let lemmas: Vec<String> = parts.map(normalize).collect();
175 if !lemmas.is_empty() {
176 map.insert(surface, lemmas);
177 }
178 }
179 Ok(map)
180}
181
/// Canonicalizes a word for lookup: surrounding whitespace is trimmed, the
/// text is lowercased, and every remaining space becomes an underscore.
fn normalize(text: &str) -> String {
    let lowered = text.trim().to_lowercase();
    lowered
        .chars()
        .map(|ch| if ch == ' ' { '_' } else { ch })
        .collect()
}
185
186fn push_unique<'a>(
187 out: &mut Vec<LemmaCandidate<'a>>,
188 seen: &mut HashSet<Cow<'a, str>>,
189 candidate: LemmaCandidate<'a>,
190) {
191 if seen.insert(candidate.lemma.clone()) {
192 out.push(candidate);
193 }
194}
195
/// Applies one suffix-detachment rule to `surface`.
///
/// Returns `None` when `surface` does not end with `suffix`. Otherwise the
/// suffix is removed and `replacement` appended. When `replacement` is empty
/// and the remaining stem ends in two identical characters, one of them is
/// dropped (`running` - `ing` -> `runn` -> `run`). This collapse is
/// unconditional, so `called` - `ed` yields `cal`.
fn apply_rule(surface: &str, suffix: &str, replacement: &str) -> Option<String> {
    let stem = surface.strip_suffix(suffix)?;

    if !replacement.is_empty() {
        return Some([stem, replacement].concat());
    }

    let mut result = stem.to_owned();
    // Inspect the last two characters of the bare stem.
    let mut tail = stem.chars().rev();
    if let (Some(last), Some(before_last)) = (tail.next(), tail.next()) {
        if last == before_last {
            result.pop();
        }
    }
    Some(result)
}
219
220fn rules_for(pos: Pos) -> &'static [(&'static str, &'static str)] {
221 match pos {
222 Pos::Noun => &[
223 ("s", ""),
224 ("ses", "s"),
225 ("xes", "x"),
226 ("zes", "z"),
227 ("ches", "ch"),
228 ("shes", "sh"),
229 ("men", "man"),
230 ("ies", "y"),
231 ],
232 Pos::Verb => &[
233 ("s", ""),
234 ("ies", "y"),
235 ("es", "e"),
236 ("es", ""),
237 ("ed", "e"),
238 ("ed", ""),
239 ("ing", "e"),
240 ("ing", ""),
241 ],
242 Pos::Adj | Pos::Adv => &[("er", ""), ("er", "e"), ("est", ""), ("est", "e")],
243 }
244}
245
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `lemma_exists` predicate that accepts exactly the given
    /// `(lemma, pos)` pairs, compared after normalization.
    fn fake_exists(targets: &[(&str, Pos)]) -> impl Fn(Pos, &str) -> bool {
        let mut known: HashSet<(Pos, String)> = HashSet::with_capacity(targets.len());
        for &(lemma, pos) in targets {
            known.insert((pos, normalize(lemma)));
        }
        move |pos, lemma| known.contains(&(pos, normalize(lemma)))
    }

    /// An irregular plural is resolved through the exception table.
    #[test]
    fn uses_exceptions_and_rules() {
        let noun_exceptions =
            HashMap::from([("children".to_string(), vec!["child".to_string()])]);
        let morph = Morphy {
            exceptions: HashMap::from([(Pos::Noun, noun_exceptions)]),
        };

        let candidates =
            morph.lemmas_for(Pos::Noun, "children", fake_exists(&[("child", Pos::Noun)]));

        assert_eq!(candidates.len(), 1);
        let only = &candidates[0];
        assert!(matches!(only.source, CandidateSource::Exception));
        assert_eq!(only.lemma, "child");
    }

    /// With no exceptions loaded, both the surface form and a rule-derived
    /// lemma are reported, surface first.
    #[test]
    fn includes_surface_and_rule_hits() {
        let morph = Morphy {
            exceptions: HashMap::new(),
        };

        let candidates = morph.lemmas_for(
            Pos::Verb,
            "running",
            fake_exists(&[("running", Pos::Verb), ("run", Pos::Verb)]),
        );

        assert_eq!(candidates.len(), 2);
        let (first, second) = (&candidates[0], &candidates[1]);
        assert!(matches!(first.source, CandidateSource::Surface));
        assert!(matches!(second.source, CandidateSource::Rule { .. }));
    }
}