syntaxdot_encoders/lang/de/tdz/lemma/
mod.rs1mod automaton;
2
3mod constants;
4
5mod error;
6
7mod transform;
8
9use lazy_static::lazy_static;
10use serde::{Deserialize, Serialize};
11use transform::delemmatization::{
12 RemoveAlternatives, RemoveReflexiveTag, RemoveSepVerbPrefix, RemoveTruncMarker,
13};
14use transform::lemmatization::{
15 AddReflexiveTag, AddSeparatedVerbPrefix, FormAsLemma, MarkVerbPrefix, RestoreCase,
16};
17use transform::misc::{
18 SimplifyArticleLemma, SimplifyPIAT, SimplifyPIDAT, SimplifyPIS, SimplifyPossesivePronounLemma,
19};
20use transform::Transforms;
21use udgraph::graph::Sentence;
22
23use crate::lemma::{BackoffStrategy, EditTreeEncoder};
24use crate::{EncodingProb, SentenceDecoder, SentenceEncoder};
25
26lazy_static! {
27 static ref DECODE_TRANSFORMS: Transforms = {
28 Transforms(vec![
29 Box::new(FormAsLemma),
30 Box::new(RestoreCase),
31 Box::new(AddReflexiveTag),
32 Box::new(AddSeparatedVerbPrefix::new(true)),
33 Box::new(MarkVerbPrefix::new()),
34 Box::new(SimplifyArticleLemma),
35 Box::new(SimplifyPossesivePronounLemma),
36 Box::new(SimplifyPIS),
37 Box::new(SimplifyPIDAT),
38 Box::new(SimplifyPIAT),
39 ])
40 };
41 static ref ENCODE_TRANSFORMS: Transforms = {
42 Transforms(vec![
43 Box::new(RemoveAlternatives),
44 Box::new(RemoveReflexiveTag),
45 Box::new(RemoveSepVerbPrefix),
46 Box::new(RemoveTruncMarker),
47 Box::new(SimplifyArticleLemma),
48 Box::new(SimplifyPossesivePronounLemma),
49 Box::new(FormAsLemma),
50 ])
51 };
52}
53
54#[derive(Deserialize, Serialize)]
67#[serde(transparent)]
68pub struct TdzLemmaEncoder {
69 inner: EditTreeEncoder,
70}
71
72impl TdzLemmaEncoder {
73 pub fn new(backoff_strategy: BackoffStrategy) -> Self {
78 TdzLemmaEncoder {
79 inner: EditTreeEncoder::new(backoff_strategy),
80 }
81 }
82}
83
84impl SentenceDecoder for TdzLemmaEncoder {
85 type Encoding = <EditTreeEncoder as SentenceDecoder>::Encoding;
86
87 type Error = <EditTreeEncoder as SentenceDecoder>::Error;
88
89 fn decode<S>(&self, labels: &[S], sentence: &mut Sentence) -> Result<(), Self::Error>
90 where
91 S: AsRef<[EncodingProb<Self::Encoding>]>,
92 {
93 self.inner.decode(labels, sentence)?;
95
96 DECODE_TRANSFORMS.transform(sentence);
98
99 Ok(())
100 }
101}
102
103impl SentenceEncoder for TdzLemmaEncoder {
104 type Encoding = <EditTreeEncoder as SentenceEncoder>::Encoding;
105
106 type Error = <EditTreeEncoder as SentenceEncoder>::Error;
107
108 fn encode(&self, sentence: &Sentence) -> Result<Vec<Self::Encoding>, Self::Error> {
109 let mut sentence = sentence.clone();
111
112 ENCODE_TRANSFORMS.transform(&mut sentence);
114
115 self.inner.encode(&sentence)
116 }
117}
118
#[cfg(test)]
mod tests {
    use std::iter::FromIterator;

    use udgraph::graph::{DepTriple, Sentence};
    use udgraph::token::TokenBuilder;

    use super::TdzLemmaEncoder;
    use crate::lemma::edit_tree::EditTree as EditTreeInner;
    use crate::lemma::BackoffStrategy;
    use crate::{EncodingProb, SentenceDecoder, SentenceEncoder};

    /// Builds the fixture sentence "Ich reise ab", in which "reise" carries
    /// the lemma "ab#reisen" and is linked to "ab" by a `compound:prt`
    /// relation.
    fn example_sentence() -> Sentence {
        // (form, upos, xpos, lemma)
        let tokens = [
            ("Ich", "PRON", "PPER", "ich"),
            ("reise", "VERB", "VVFIN", "ab#reisen"),
            ("ab", "ADP", "PTKVZ", "ab"),
        ]
        .iter()
        .map(|&(form, upos, xpos, lemma)| {
            TokenBuilder::new(form)
                .upos(upos)
                .xpos(xpos)
                .lemma(lemma)
                .into()
        });

        let mut sent = Sentence::from_iter(tokens);

        // NOTE(review): positions 2 and 3 are presumably "reise" and "ab"
        // (position 0 being the root node) -- confirm against udgraph.
        sent.dep_graph_mut()
            .add_deprel(DepTriple::new(2, Some("compound:prt"), 3))
            .unwrap();

        sent
    }

    /// Edit trees expected for the fixture sentence. The tree for "reise"
    /// targets "reisen" rather than the stored lemma "ab#reisen".
    fn sentence_edit_trees() -> Vec<EditTreeInner> {
        [("Ich", "ich"), ("reise", "reisen"), ("ab", "ab")]
            .iter()
            .map(|&(form, lemma)| {
                let form: Vec<char> = form.chars().collect();
                let lemma: Vec<char> = lemma.chars().collect();
                EditTreeInner::create_tree(form.as_slice(), lemma.as_slice()).unwrap()
            })
            .collect()
    }

    /// Encodes `sent` and wraps every encoding in a single-element list with
    /// probability 1.0, the shape that `decode` takes as its labels.
    fn encode_and_wrap(
        encoder: &TdzLemmaEncoder,
        sent: &Sentence,
    ) -> Vec<Vec<EncodingProb<EditTreeInner>>> {
        let encodings = encoder.encode(sent).unwrap();

        encodings
            .into_iter()
            .map(|encoding| vec![EncodingProb::new(encoding, 1.0)])
            .collect()
    }

    #[test]
    fn encodes_with_transformations() {
        let sent = example_sentence();

        let encoder = TdzLemmaEncoder::new(BackoffStrategy::Nothing);

        // The encodings must match the trees built from the expected lemmas.
        let edit_trees = encoder.encode(&sent).unwrap();
        assert_eq!(edit_trees, sentence_edit_trees());

        let labels = encode_and_wrap(&encoder, &sent);

        // Decoding the encodings must give back a sentence equal to the input.
        let mut sent_decoded = sent.clone();
        encoder.decode(&labels, &mut sent_decoded).unwrap();

        assert_eq!(sent, sent_decoded);
    }
}