wordnet_morphy/
lib.rs

1//! WordNet-style morphological processing (morphy).
2//!
3//! Faithful to the classic morphy algorithm: check exceptions, apply suffix
4//! rules, and verify candidates via a caller-provided lemma existence
5//! predicate. The crate is intentionally decoupled from any particular loader;
6//! it only depends on `Pos` and the callback you supply.
7//!
8//! # How it works
9//! 1. Emit the surface form if it exists.
10//! 2. Check exceptions (`*.exc` files).
11//! 3. Apply POS-specific suffix rules.
12//! 4. Deduplicate while preserving provenance (`Surface`, `Exception`, `Rule`).
13//!
14//! # Example
15//! ```no_run
16//! use wordnet_db::{LoadMode, WordNet};
17//! use wordnet_morphy::Morphy;
18//! use wordnet_types::Pos;
19//!
20//! # fn main() -> anyhow::Result<()> {
21//! let dict = "/path/to/wordnet";
22//! let wn = WordNet::load_with_mode(dict, LoadMode::Mmap)?;
23//! let morph = Morphy::load(dict)?;
24//! let exists = |pos, lemma: &str| wn.lemma_exists(pos, lemma);
25//!
26//! let cands = morph.lemmas_for(Pos::Verb, "running", exists);
27//! for cand in cands {
28//!     println!("{:?}: {}", cand.source, cand.lemma);
29//! }
30//! # Ok(()) }
31//! ```
32//!
33//! For a runnable demo, see `cargo run -p wordnet-morphy --example lookup -- <dict> [--demo|<word>]`.
34
35use std::collections::{HashMap, HashSet};
36use std::fs::File;
37use std::io::{BufRead, BufReader};
38use std::path::{Path, PathBuf};
39
40use std::borrow::Cow;
41
42use anyhow::{Context, Result};
43use wordnet_types::Pos;
44
/// Provenance of a lemma candidate: which lookup stage produced it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CandidateSource {
    /// The normalized surface form itself was accepted as a lemma.
    Surface,
    /// The lemma was listed for the surface form in an exception table.
    Exception,
    /// The lemma was derived by rewriting a suffix of the surface form.
    Rule {
        /// Suffix that was stripped from the surface form.
        suffix: &'static str,
        /// Text appended in place of the stripped suffix (may be empty).
        replacement: &'static str,
    },
}
55
56/// A lemma candidate paired with its POS and provenance.
57#[derive(Clone, Debug, Eq, PartialEq)]
58pub struct LemmaCandidate<'a> {
59    pub pos: Pos,
60    pub lemma: Cow<'a, str>,
61    pub source: CandidateSource,
62}
63
64/// Minimal morphy implementation parameterised by caller-provided existence checks.
65pub struct Morphy {
66    exceptions: HashMap<Pos, HashMap<String, Vec<String>>>,
67}
68
69impl Morphy {
70    /// Load morphy exception lists (`*.exc`) from a WordNet dict directory.
71    ///
72    /// Files are optional; missing ones are treated as empty.
73    pub fn load(dict_dir: impl AsRef<Path>) -> Result<Self> {
74        let dir = dict_dir.as_ref();
75        Ok(Self {
76            exceptions: HashMap::from([
77                (Pos::Noun, load_exc(dir.join("noun.exc"))?),
78                (Pos::Verb, load_exc(dir.join("verb.exc"))?),
79                (Pos::Adj, load_exc(dir.join("adj.exc"))?),
80                (Pos::Adv, load_exc(dir.join("adv.exc"))?),
81            ]),
82        })
83    }
84
85    /// Generate lemmas for a surface form, returning enriched provenance.
86    ///
87    /// The callback `lemma_exists` typically delegates to `WordNet::lemma_exists`
88    /// so this crate stays ignorant of any concrete database layout.
89    pub fn lemmas_for<'a, F>(
90        &'a self,
91        pos: Pos,
92        surface: &str,
93        lemma_exists: F,
94    ) -> Vec<LemmaCandidate<'a>>
95    where
96        F: Fn(Pos, &str) -> bool,
97    {
98        let mut seen: HashSet<Cow<'a, str>> = HashSet::new();
99        let mut out: Vec<LemmaCandidate<'a>> = Vec::new();
100        let norm_surface = normalize(surface);
101
102        // Surface form first if it exists.
103        if lemma_exists(pos, &norm_surface) {
104            push_unique(
105                &mut out,
106                &mut seen,
107                LemmaCandidate {
108                    pos,
109                    lemma: Cow::Owned(norm_surface.clone()),
110                    source: CandidateSource::Surface,
111                },
112            );
113        }
114
115        // Exceptions: may include multiple lemmas per surface form.
116        if let Some(exc_map) = self.exceptions.get(&pos)
117            && let Some(entries) = exc_map.get(&norm_surface)
118        {
119            for lemma in entries {
120                if lemma_exists(pos, lemma) {
121                    push_unique(
122                        &mut out,
123                        &mut seen,
124                        LemmaCandidate {
125                            pos,
126                            lemma: Cow::Borrowed(lemma.as_str()),
127                            source: CandidateSource::Exception,
128                        },
129                    );
130                }
131            }
132        }
133
134        // Rule-based guesses.
135        for (suffix, replacement) in rules_for(pos) {
136            if let Some(candidate) = apply_rule(&norm_surface, suffix, replacement)
137                && lemma_exists(pos, &candidate)
138            {
139                push_unique(
140                    &mut out,
141                    &mut seen,
142                    LemmaCandidate {
143                        pos,
144                        lemma: Cow::Owned(candidate),
145                        source: CandidateSource::Rule {
146                            suffix,
147                            replacement,
148                        },
149                    },
150                );
151            }
152        }
153
154        out
155    }
156}
157
158fn load_exc(path: PathBuf) -> Result<HashMap<String, Vec<String>>> {
159    if !path.exists() {
160        return Ok(HashMap::new());
161    }
162    let file =
163        File::open(&path).with_context(|| format!("open exception file {}", path.display()))?;
164    let reader = BufReader::new(file);
165    let mut map = HashMap::new();
166    for (lineno, line) in reader.lines().enumerate() {
167        let line =
168            line.with_context(|| format!("read line {} in {}", lineno + 1, path.display()))?;
169        let mut parts = line.split_whitespace();
170        let surface = match parts.next() {
171            Some(s) => normalize(s),
172            None => continue,
173        };
174        let lemmas: Vec<String> = parts.map(normalize).collect();
175        if !lemmas.is_empty() {
176            map.insert(surface, lemmas);
177        }
178    }
179    Ok(map)
180}
181
/// Canonicalize a word for lookup: trim surrounding whitespace, lowercase,
/// and turn every interior space into an underscore.
fn normalize(text: &str) -> String {
    let lowered = text.trim().to_lowercase();
    lowered
        .chars()
        .map(|ch| if ch == ' ' { '_' } else { ch })
        .collect()
}
185
186fn push_unique<'a>(
187    out: &mut Vec<LemmaCandidate<'a>>,
188    seen: &mut HashSet<Cow<'a, str>>,
189    candidate: LemmaCandidate<'a>,
190) {
191    if seen.insert(candidate.lemma.clone()) {
192        out.push(candidate);
193    }
194}
195
/// Apply a single suffix rule to `surface`.
///
/// Returns `None` when `surface` does not end with `suffix`. Otherwise the
/// suffix is removed and `replacement` appended.
///
/// When the replacement is empty and the suffix starts with a vowel ("ing",
/// "ed", "er", "est", "es"), a doubled final *consonant* left on the stem is
/// collapsed, undoing inflectional doubling ("running" -> "runn" -> "run").
/// Doubled vowels ("seeing" -> "see") and stems exposed by a bare "s" suffix
/// ("calls" -> "call") are left untouched, since those doubles belong to the
/// word itself.
///
/// Because only one string is returned, a stem that genuinely ends in a
/// doubled consonant before a vowel-initial suffix ("falling") still comes
/// back collapsed; callers that need the untouched stem must derive it
/// themselves.
fn apply_rule(surface: &str, suffix: &str, replacement: &str) -> Option<String> {
    const VOWELS: [char; 5] = ['a', 'e', 'i', 'o', 'u'];

    let stem = surface.strip_suffix(suffix)?;
    if !replacement.is_empty() {
        return Some(format!("{stem}{replacement}"));
    }

    let mut candidate = stem.to_string();

    // Consonant doubling only happens in front of a vowel-initial suffix.
    if suffix.starts_with(&VOWELS[..]) {
        let mut tail = candidate.chars().rev();
        if let (Some(last), Some(before_last)) = (tail.next(), tail.next()) {
            let is_consonant = last.is_alphabetic() && !VOWELS.contains(&last);
            if last == before_last && is_consonant {
                candidate.pop();
            }
        }
    }

    Some(candidate)
}
219
220fn rules_for(pos: Pos) -> &'static [(&'static str, &'static str)] {
221    match pos {
222        Pos::Noun => &[
223            ("s", ""),
224            ("ses", "s"),
225            ("xes", "x"),
226            ("zes", "z"),
227            ("ches", "ch"),
228            ("shes", "sh"),
229            ("men", "man"),
230            ("ies", "y"),
231        ],
232        Pos::Verb => &[
233            ("s", ""),
234            ("ies", "y"),
235            ("es", "e"),
236            ("es", ""),
237            ("ed", "e"),
238            ("ed", ""),
239            ("ing", "e"),
240            ("ing", ""),
241        ],
242        Pos::Adj | Pos::Adv => &[("er", ""), ("er", "e"), ("est", ""), ("est", "e")],
243    }
244}
245
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an existence predicate that accepts exactly the given
    /// `(lemma, pos)` pairs, compared after normalization.
    fn fake_exists(targets: &[(&str, Pos)]) -> impl Fn(Pos, &str) -> bool {
        let mut known: HashSet<(Pos, String)> = HashSet::new();
        for (lemma, pos) in targets {
            known.insert((*pos, normalize(lemma)));
        }
        move |pos, lemma| known.contains(&(pos, normalize(lemma)))
    }

    #[test]
    fn uses_exceptions_and_rules() {
        let noun_exceptions =
            HashMap::from([("children".to_string(), vec!["child".to_string()])]);
        let morph = Morphy {
            exceptions: HashMap::from([(Pos::Noun, noun_exceptions)]),
        };

        let candidates =
            morph.lemmas_for(Pos::Noun, "children", fake_exists(&[("child", Pos::Noun)]));

        assert_eq!(candidates.len(), 1);
        let only = &candidates[0];
        assert!(matches!(only.source, CandidateSource::Exception));
        assert_eq!(only.lemma, "child");
    }

    #[test]
    fn includes_surface_and_rule_hits() {
        let morph = Morphy {
            exceptions: HashMap::new(),
        };

        let candidates = morph.lemmas_for(
            Pos::Verb,
            "running",
            fake_exists(&[("running", Pos::Verb), ("run", Pos::Verb)]),
        );

        assert_eq!(candidates.len(), 2);
        assert!(matches!(candidates[0].source, CandidateSource::Surface));
        assert!(matches!(candidates[1].source, CandidateSource::Rule { .. }));
    }
}