ohnomore/transform/
lemmatization.rs

1//! Lemmatization transformations.
2//!
//! This module provides transformations that convert lemmas to TüBa-D/Z-style
//! lemmas.
5
6use std::collections::HashMap;
7use std::io::{BufRead, Cursor};
8
9use fst::{Set, SetBuilder};
10
11use crate::constants::*;
12use crate::transform::named_entity::restore_named_entity_case;
13use crate::transform::svp::longest_prefixes;
14use crate::transform::{DependencyGraph, Transform};
15use crate::LemmatizationError;
16
17/// Set the lemma of reflexive personal pronouns (PRF) to `#refl`.
18pub struct AddReflexiveTag;
19
20impl Transform for AddReflexiveTag {
21    fn transform(&self, graph: &dyn DependencyGraph, node: usize) -> String {
22        let token = graph.token(node);
23        let lemma = token.lemma();
24
25        if token.xpos() == REFLEXIVE_PERSONAL_PRONOUN_TAG {
26            REFLEXIVE_PERSONAL_PRONOUN_LEMMA.to_owned()
27        } else {
28            lemma.to_owned()
29        }
30    }
31}
32
/// Attach separated verb prefixes to the lemma of their verb.
///
/// In TüBa-D/Z, a separable prefix is marked inside the verb lemma, as in
/// *ab#zeichnen*, where *ab* is the separable prefix. This transformation
/// covers the case in which the prefix occurs apart from the verb, e.g.:
///
/// *Diese Änderungen zeichnen sich bereits ab .*
///
/// Here, *zeichnen* is lemmatized as *ab#zeichnen*. The separated particle
/// is located through the dependency structure.
///
/// To a limited extent, verbs with several `competing' separable prefixes
/// are supported as well. For instance, *nimmt* in
///
/// *[...] nimmt eher zu als ab*
///
/// is lemmatized as *zu#nehmen|ab#nehmen*.
pub struct AddSeparatedVerbPrefix {
    /// When `true`, every attached particle yields its own alternative and
    /// the alternatives are joined by `|`; otherwise only the first
    /// particle is used.
    multiple_prefixes: bool,
}

impl AddSeparatedVerbPrefix {
    /// Construct the transformation.
    ///
    /// `multiple_prefixes` selects whether all separated particles of a verb
    /// are used (`true`) or only the first one that is found (`false`).
    pub fn new(multiple_prefixes: bool) -> Self {
        Self { multiple_prefixes }
    }
}
58
59impl Transform for AddSeparatedVerbPrefix {
60    fn transform(&self, graph: &dyn DependencyGraph, node: usize) -> String {
61        let token = graph.token(node);
62        let lemma = token.lemma();
63
64        if !is_separable_verb(token.xpos()) {
65            return lemma.to_owned();
66        }
67
68        let mut lemma = lemma.to_owned();
69
70        // Find all nodes that are attached with the separable verb dependency
71        // relation.
72        //
73        // Fixme: check AVZ/KON relation as well?
74        // Fixme: what about particles linked KON?
75        let mut prefix_iter = graph
76            .dependents(node)
77            .filter(|(dependent, _)| graph.token(*dependent).xpos() == SEPARABLE_PARTICLE_POS);
78
79        if self.multiple_prefixes {
80            let mut lemmas = Vec::new();
81
82            // Fixme: prefixes are not returned in sentence order?
83            for (dependant, _) in prefix_iter {
84                let prefix = graph.token(dependant);
85                lemmas.push(format!("{}#{}", prefix.form().to_lowercase(), lemma));
86            }
87
88            if lemmas.is_empty() {
89                lemma
90            } else {
91                lemmas.join("|")
92            }
93        } else {
94            if let Some((dependant, _)) = prefix_iter.next() {
95                let prefix = graph.token(dependant);
96                lemma.insert_str(0, &format!("{}#", prefix.form().to_lowercase()));
97            }
98
99            lemma
100        }
101    }
102}
103
104/// Lemmatize tokens where the form is the lemma.
105pub struct FormAsLemma;
106
107impl Transform for FormAsLemma {
108    fn transform(&self, graph: &dyn DependencyGraph, node: usize) -> String {
109        let token = graph.token(node);
110
111        // Handle tags for which the lemma is the lowercased form.
112        if LEMMA_IS_FORM_TAGS.contains(token.xpos()) {
113            token.form().to_lowercase()
114        } else if LEMMA_IS_FORM_PRESERVE_CASE_TAGS.contains(token.xpos()) {
115            token.form().to_owned()
116        } else {
117            token.lemma().to_owned()
118        }
119    }
120}
121
122/// Mark separable verb prefixes in verbs.
123///
124/// TüBa-D/Z marks separable verb prefixes in the verb lemma. E.g. *ab#zeichnen*,
125/// where *ab* is the separable prefix. This transformation handles cases where
126/// the prefix is **not** separated from the verb. For example, it makes the
127/// following transformations:
128///
129/// 1. *abhing/hängen* -> *abhängen*
130/// 2. *dazugefügt/fügen* -> *dazu#fügen*
131/// 3. *wiedergutgemacht/machen* -> *wieder#gut#machen*
132/// 4. *hinzubewegen/bewegen* -> *hin#bewegen*
133///
134/// The transformation rule prefers analysis with longer prefixes over shorter
135/// prefixes. This leads to the analysis (2) rather than *da#zu#fügen*.
136///
137/// When a verb contains multiple separable prefixes, this transformation rule
138/// attempts to find them, as in (3).
139///
140/// In 'zu'-infinitives *zu* is removed and not analyzed as being (part of) a
141/// separable prefix.
142pub struct MarkVerbPrefix {
143    prefix_verbs: HashMap<String, String>,
144    prefixes: Set<Vec<u8>>,
145}
146
147impl MarkVerbPrefix {
148    /// Create this transformation. A simple lookup for prefix verbs can be
149    /// provided. More crucially, a set of prefixes must be provided to find
150    /// prefixes.
151    pub fn new() -> Self {
152        MarkVerbPrefix::read_verb_prefixes(Cursor::new(include_str!(
153            "../../data/tdz11-separable-prefixes.txt"
154        )))
155        .expect("Invalid separable verb prefix data")
156    }
157
158    pub fn set_prefix_verbs(&mut self, prefix_verbs: HashMap<String, String>) {
159        self.prefix_verbs = prefix_verbs;
160    }
161}
162
163impl Default for MarkVerbPrefix {
164    fn default() -> Self {
165        Self::new()
166    }
167}
168
169impl Transform for MarkVerbPrefix {
170    fn transform(&self, graph: &dyn DependencyGraph, node: usize) -> String {
171        let token = graph.token(node);
172        let lemma = token.lemma();
173        let lemma_lc = lemma.to_lowercase();
174
175        if !is_verb(token.xpos()) {
176            return lemma.to_owned();
177        }
178
179        // There are two cases that we have to handle separately:
180        //
181        // 1. The lemmatizer did not strip the prefix. In this case, we
182        //    perform a lemma lookup. For now, removing prefixes from the
183        //    lemma itself seems to be too tricky.
184        //
185        // 2. The lemmatizer stripped the prefix. The prefix needs to be
186        //    inferred from the token's form.
187
188        // Case 1: try a simple lookup for the lemma
189        if let Some(sep_lemma) = self.prefix_verbs.get(&lemma_lc) {
190            return sep_lemma.clone();
191        }
192
193        // Case 2: there are no prefixes in the lemma, try to find prefixes
194        // in the form.
195        let form_lc = token.form().to_lowercase();
196        let mut lemma_parts = longest_prefixes(&self.prefixes, &form_lc, &lemma_lc, token.xpos());
197        if !lemma_parts.is_empty() {
198            lemma_parts.push(lemma_lc);
199            return lemma_parts.join("#");
200        }
201
202        lemma.to_owned()
203    }
204}
205
206trait ReadVerbPrefixes {
207    fn read_verb_prefixes<R>(r: R) -> Result<MarkVerbPrefix, LemmatizationError>
208    where
209        R: BufRead;
210}
211
212impl ReadVerbPrefixes for MarkVerbPrefix {
213    fn read_verb_prefixes<R>(r: R) -> Result<MarkVerbPrefix, LemmatizationError>
214    where
215        R: BufRead,
216    {
217        let mut builder = SetBuilder::memory();
218
219        for line in r.lines() {
220            let line = line?;
221
222            builder.insert(&line)?;
223        }
224
225        let bytes = builder.into_inner()?;
226        let prefixes = Set::new(bytes)?;
227
228        Ok(MarkVerbPrefix {
229            prefix_verbs: HashMap::new(),
230            prefixes,
231        })
232    }
233}
234
235pub struct RestoreCase;
236
237impl Transform for RestoreCase {
238    fn transform(&self, graph: &dyn DependencyGraph, node: usize) -> String {
239        let token = graph.token(node);
240
241        if token.xpos() == NOUN_TAG {
242            uppercase_first_char(token.lemma())
243        } else if token.xpos() == NAMED_ENTITY_TAG {
244            restore_named_entity_case(token.form(), token.lemma())
245        } else {
246            token.lemma().to_owned()
247        }
248    }
249}
250
/// Return `s` with its first character uppercased.
///
/// All characters after the first are copied unchanged. An empty input
/// yields an empty string.
fn uppercase_first_char<S>(s: S) -> String
where
    S: AsRef<str>,
{
    let s = s.as_ref();
    let mut chars = s.chars();

    match chars.next() {
        Some(first) => {
            // Uppercasing a single Unicode code point can result in multiple
            // code points (e.g. 'ß' becomes "SS"), so the complete uppercase
            // mapping is appended rather than a single character.
            let mut uppercased = String::with_capacity(s.len());
            uppercased.extend(first.to_uppercase());
            uppercased.push_str(chars.as_str());
            uppercased
        }
        None => String::new(),
    }
}
264
#[cfg(test)]
mod tests {
    use std::collections::HashMap;

    use crate::transform::test_helpers::run_test_cases;

    use super::{
        uppercase_first_char, AddSeparatedVerbPrefix, FormAsLemma, MarkVerbPrefix, RestoreCase,
    };

    /// Only the first character is changed; empty input stays empty.
    #[test]
    pub fn first_char_is_uppercased() {
        let cases = [("test", "Test"), ("Test", "Test"), ("", "")];

        for &(input, expected) in &cases {
            assert_eq!(uppercase_first_char(input), expected);
        }
    }

    /// Runs the separated-prefix test cases with multiple prefixes enabled.
    #[test]
    pub fn add_separated_verb_prefix() {
        run_test_cases(
            "testdata/add-separated-verb-prefix.test",
            AddSeparatedVerbPrefix::new(true),
        );
    }

    #[test]
    pub fn form_as_lemma() {
        run_test_cases("testdata/form-as-lemma.test", FormAsLemma);
    }

    /// Runs the attached-prefix test cases with a one-entry lemma lookup.
    #[test]
    pub fn mark_verb_prefix() {
        let prefix_verbs = vec![(String::from("abbestellen"), String::from("ab#bestellen"))]
            .into_iter()
            .collect::<HashMap<_, _>>();

        let mut transform = MarkVerbPrefix::new();
        transform.set_prefix_verbs(prefix_verbs);

        run_test_cases("testdata/mark-verb-prefix.test", transform);
    }

    #[test]
    pub fn restore_case() {
        run_test_cases("testdata/restore-case.test", RestoreCase);
    }
}