ohnomore/transform/
lemmatization.rs1use std::collections::HashMap;
7use std::io::{BufRead, Cursor};
8
9use fst::{Set, SetBuilder};
10
11use crate::constants::*;
12use crate::transform::named_entity::restore_named_entity_case;
13use crate::transform::svp::longest_prefixes;
14use crate::transform::{DependencyGraph, Transform};
15use crate::LemmatizationError;
16
17pub struct AddReflexiveTag;
19
20impl Transform for AddReflexiveTag {
21 fn transform(&self, graph: &dyn DependencyGraph, node: usize) -> String {
22 let token = graph.token(node);
23 let lemma = token.lemma();
24
25 if token.xpos() == REFLEXIVE_PERSONAL_PRONOUN_TAG {
26 REFLEXIVE_PERSONAL_PRONOUN_LEMMA.to_owned()
27 } else {
28 lemma.to_owned()
29 }
30 }
31}
32
/// Transformation that attaches separated verb prefixes to verb lemmas.
///
/// The `multiple_prefixes` flag selects what happens when a verb has
/// separable-particle dependents: when `true`, one `prefix#lemma` candidate
/// is produced per particle and the candidates are joined with `|`; when
/// `false`, only the first particle yielded by the graph is used.
pub struct AddSeparatedVerbPrefix {
    multiple_prefixes: bool,
}

impl AddSeparatedVerbPrefix {
    /// Creates the transformation with the given `multiple_prefixes` setting.
    pub fn new(multiple_prefixes: bool) -> Self {
        Self { multiple_prefixes }
    }
}
58
59impl Transform for AddSeparatedVerbPrefix {
60 fn transform(&self, graph: &dyn DependencyGraph, node: usize) -> String {
61 let token = graph.token(node);
62 let lemma = token.lemma();
63
64 if !is_separable_verb(token.xpos()) {
65 return lemma.to_owned();
66 }
67
68 let mut lemma = lemma.to_owned();
69
70 let mut prefix_iter = graph
76 .dependents(node)
77 .filter(|(dependent, _)| graph.token(*dependent).xpos() == SEPARABLE_PARTICLE_POS);
78
79 if self.multiple_prefixes {
80 let mut lemmas = Vec::new();
81
82 for (dependant, _) in prefix_iter {
84 let prefix = graph.token(dependant);
85 lemmas.push(format!("{}#{}", prefix.form().to_lowercase(), lemma));
86 }
87
88 if lemmas.is_empty() {
89 lemma
90 } else {
91 lemmas.join("|")
92 }
93 } else {
94 if let Some((dependant, _)) = prefix_iter.next() {
95 let prefix = graph.token(dependant);
96 lemma.insert_str(0, &format!("{}#", prefix.form().to_lowercase()));
97 }
98
99 lemma
100 }
101 }
102}
103
104pub struct FormAsLemma;
106
107impl Transform for FormAsLemma {
108 fn transform(&self, graph: &dyn DependencyGraph, node: usize) -> String {
109 let token = graph.token(node);
110
111 if LEMMA_IS_FORM_TAGS.contains(token.xpos()) {
113 token.form().to_lowercase()
114 } else if LEMMA_IS_FORM_PRESERVE_CASE_TAGS.contains(token.xpos()) {
115 token.form().to_owned()
116 } else {
117 token.lemma().to_owned()
118 }
119 }
120}
121
122pub struct MarkVerbPrefix {
143 prefix_verbs: HashMap<String, String>,
144 prefixes: Set<Vec<u8>>,
145}
146
147impl MarkVerbPrefix {
148 pub fn new() -> Self {
152 MarkVerbPrefix::read_verb_prefixes(Cursor::new(include_str!(
153 "../../data/tdz11-separable-prefixes.txt"
154 )))
155 .expect("Invalid separable verb prefix data")
156 }
157
158 pub fn set_prefix_verbs(&mut self, prefix_verbs: HashMap<String, String>) {
159 self.prefix_verbs = prefix_verbs;
160 }
161}
162
163impl Default for MarkVerbPrefix {
164 fn default() -> Self {
165 Self::new()
166 }
167}
168
169impl Transform for MarkVerbPrefix {
170 fn transform(&self, graph: &dyn DependencyGraph, node: usize) -> String {
171 let token = graph.token(node);
172 let lemma = token.lemma();
173 let lemma_lc = lemma.to_lowercase();
174
175 if !is_verb(token.xpos()) {
176 return lemma.to_owned();
177 }
178
179 if let Some(sep_lemma) = self.prefix_verbs.get(&lemma_lc) {
190 return sep_lemma.clone();
191 }
192
193 let form_lc = token.form().to_lowercase();
196 let mut lemma_parts = longest_prefixes(&self.prefixes, &form_lc, &lemma_lc, token.xpos());
197 if !lemma_parts.is_empty() {
198 lemma_parts.push(lemma_lc);
199 return lemma_parts.join("#");
200 }
201
202 lemma.to_owned()
203 }
204}
205
206trait ReadVerbPrefixes {
207 fn read_verb_prefixes<R>(r: R) -> Result<MarkVerbPrefix, LemmatizationError>
208 where
209 R: BufRead;
210}
211
212impl ReadVerbPrefixes for MarkVerbPrefix {
213 fn read_verb_prefixes<R>(r: R) -> Result<MarkVerbPrefix, LemmatizationError>
214 where
215 R: BufRead,
216 {
217 let mut builder = SetBuilder::memory();
218
219 for line in r.lines() {
220 let line = line?;
221
222 builder.insert(&line)?;
223 }
224
225 let bytes = builder.into_inner()?;
226 let prefixes = Set::new(bytes)?;
227
228 Ok(MarkVerbPrefix {
229 prefix_verbs: HashMap::new(),
230 prefixes,
231 })
232 }
233}
234
235pub struct RestoreCase;
236
237impl Transform for RestoreCase {
238 fn transform(&self, graph: &dyn DependencyGraph, node: usize) -> String {
239 let token = graph.token(node);
240
241 if token.xpos() == NOUN_TAG {
242 uppercase_first_char(token.lemma())
243 } else if token.xpos() == NAMED_ENTITY_TAG {
244 restore_named_entity_case(token.form(), token.lemma())
245 } else {
246 token.lemma().to_owned()
247 }
248 }
249}
250
/// Returns a copy of `s` with its first character uppercased.
///
/// The remaining characters are copied unchanged. An empty input yields an
/// empty string. Uppercasing follows `char::to_uppercase`, so a single
/// character may expand to several (for example `ß` becomes `SS`).
fn uppercase_first_char<S>(s: S) -> String
where
    S: AsRef<str>,
{
    let mut chars = s.as_ref().chars();

    match chars.next() {
        // `to_uppercase` is itself an iterator over chars, so it can be
        // chained directly with the untouched remainder.
        Some(first) => first.to_uppercase().chain(chars).collect(),
        None => String::new(),
    }
}
264
#[cfg(test)]
mod tests {
    use std::collections::HashMap;

    use crate::transform::test_helpers::run_test_cases;

    use super::{
        uppercase_first_char, AddSeparatedVerbPrefix, FormAsLemma, MarkVerbPrefix, RestoreCase,
    };

    /// Only the first character changes; empty input stays empty.
    #[test]
    pub fn first_char_is_uppercased() {
        let cases = [("test", "Test"), ("Test", "Test"), ("", "")];

        for &(input, expected) in cases.iter() {
            assert_eq!(uppercase_first_char(input), expected);
        }
    }

    /// Runs the separated-prefix test file with multiple prefixes enabled.
    #[test]
    pub fn add_separated_verb_prefix() {
        let transform = AddSeparatedVerbPrefix::new(true);

        run_test_cases("testdata/add-separated-verb-prefix.test", transform);
    }

    #[test]
    pub fn form_as_lemma() {
        run_test_cases("testdata/form-as-lemma.test", FormAsLemma);
    }

    /// Runs the prefix-marking test file with one explicit lemma override.
    #[test]
    pub fn mark_verb_prefix() {
        let mut prefix_verbs = HashMap::new();
        prefix_verbs.insert(String::from("abbestellen"), String::from("ab#bestellen"));

        let mut transform = MarkVerbPrefix::new();
        transform.set_prefix_verbs(prefix_verbs);

        run_test_cases("testdata/mark-verb-prefix.test", transform);
    }

    #[test]
    pub fn restore_case() {
        run_test_cases("testdata/restore-case.test", RestoreCase);
    }
}