syntaxdot_encoders/lang/de/tdz/lemma/
mod.rs

1mod automaton;
2
3mod constants;
4
5mod error;
6
7mod transform;
8
9use lazy_static::lazy_static;
10use serde::{Deserialize, Serialize};
11use transform::delemmatization::{
12    RemoveAlternatives, RemoveReflexiveTag, RemoveSepVerbPrefix, RemoveTruncMarker,
13};
14use transform::lemmatization::{
15    AddReflexiveTag, AddSeparatedVerbPrefix, FormAsLemma, MarkVerbPrefix, RestoreCase,
16};
17use transform::misc::{
18    SimplifyArticleLemma, SimplifyPIAT, SimplifyPIDAT, SimplifyPIS, SimplifyPossesivePronounLemma,
19};
20use transform::Transforms;
21use udgraph::graph::Sentence;
22
23use crate::lemma::{BackoffStrategy, EditTreeEncoder};
24use crate::{EncodingProb, SentenceDecoder, SentenceEncoder};
25
26lazy_static! {
27    static ref DECODE_TRANSFORMS: Transforms = {
28        Transforms(vec![
29            Box::new(FormAsLemma),
30            Box::new(RestoreCase),
31            Box::new(AddReflexiveTag),
32            Box::new(AddSeparatedVerbPrefix::new(true)),
33            Box::new(MarkVerbPrefix::new()),
34            Box::new(SimplifyArticleLemma),
35            Box::new(SimplifyPossesivePronounLemma),
36            Box::new(SimplifyPIS),
37            Box::new(SimplifyPIDAT),
38            Box::new(SimplifyPIAT),
39        ])
40    };
41    static ref ENCODE_TRANSFORMS: Transforms = {
42        Transforms(vec![
43            Box::new(RemoveAlternatives),
44            Box::new(RemoveReflexiveTag),
45            Box::new(RemoveSepVerbPrefix),
46            Box::new(RemoveTruncMarker),
47            Box::new(SimplifyArticleLemma),
48            Box::new(SimplifyPossesivePronounLemma),
49            Box::new(FormAsLemma),
50        ])
51    };
52}
53
/// Lemma encoder-decoder for TüBa-D/Z
///
/// This encoder wraps `EditTreeEncoder`. Before encoding and after
/// decoding, a list of transformation rules is applied to convert
/// the lemmas from and to TüBa-D/Z-style lemmas.
///
/// For example, the particle verb *abschließen* is encoded as
/// *ab#schließen* in TüBa-D/Z. During encoding, the lemma is
/// transformed to *schließen*. Then, during decoding, the lemma is
/// transformed back to *ab#schließen* based on the *ab* particle,
/// which is either a prefix of the form (e.g. *abgeschlossen*) or a
/// separated particle (e.g. *ich schließe es ab*).
#[derive(Deserialize, Serialize)]
// `transparent`: the wrapper is (de)serialized exactly like its single
// field, i.e. like a plain `EditTreeEncoder`.
#[serde(transparent)]
pub struct TdzLemmaEncoder {
    /// The wrapped edit tree encoder that does the actual
    /// encoding and decoding.
    inner: EditTreeEncoder,
}
71
72impl TdzLemmaEncoder {
73    /// Construct a `TdzLemmaEncoder`.
74    ///
75    /// The backoff strategy is used when the edit tree that was
76    /// predicted is not applicable to the form.
77    pub fn new(backoff_strategy: BackoffStrategy) -> Self {
78        TdzLemmaEncoder {
79            inner: EditTreeEncoder::new(backoff_strategy),
80        }
81    }
82}
83
84impl SentenceDecoder for TdzLemmaEncoder {
85    type Encoding = <EditTreeEncoder as SentenceDecoder>::Encoding;
86
87    type Error = <EditTreeEncoder as SentenceDecoder>::Error;
88
89    fn decode<S>(&self, labels: &[S], sentence: &mut Sentence) -> Result<(), Self::Error>
90    where
91        S: AsRef<[EncodingProb<Self::Encoding>]>,
92    {
93        // Decode edit trees.
94        self.inner.decode(labels, sentence)?;
95
96        // Apply TüBa-D/Z transformations
97        DECODE_TRANSFORMS.transform(sentence);
98
99        Ok(())
100    }
101}
102
103impl SentenceEncoder for TdzLemmaEncoder {
104    type Encoding = <EditTreeEncoder as SentenceEncoder>::Encoding;
105
106    type Error = <EditTreeEncoder as SentenceEncoder>::Error;
107
108    fn encode(&self, sentence: &Sentence) -> Result<Vec<Self::Encoding>, Self::Error> {
109        // Hmpf, but we need to modify the sentence in-place.
110        let mut sentence = sentence.clone();
111
112        // Apply tranformations to remove TüBa-D/Z specifics.
113        ENCODE_TRANSFORMS.transform(&mut sentence);
114
115        self.inner.encode(&sentence)
116    }
117}
118
#[cfg(test)]
mod tests {
    use std::iter::FromIterator;

    use udgraph::graph::{DepTriple, Sentence};
    use udgraph::token::TokenBuilder;

    use super::TdzLemmaEncoder;
    use crate::lemma::edit_tree::EditTree as EditTreeInner;
    use crate::lemma::BackoffStrategy;
    use crate::{EncodingProb, SentenceDecoder, SentenceEncoder};

    /// Build the sentence *Ich reise ab* with TüBa-D/Z-style lemmas and
    /// a `compound:prt` relation between the verb and its particle.
    fn example_sentence() -> Sentence {
        let tokens = vec![
            TokenBuilder::new("Ich")
                .upos("PRON")
                .xpos("PPER")
                .lemma("ich")
                .into(),
            TokenBuilder::new("reise")
                .upos("VERB")
                .xpos("VVFIN")
                .lemma("ab#reisen")
                .into(),
            TokenBuilder::new("ab")
                .upos("ADP")
                .xpos("PTKVZ")
                .lemma("ab")
                .into(),
        ];

        let mut sentence = Sentence::from_iter(tokens);

        let particle_relation = DepTriple::new(2, Some("compound:prt"), 3);
        sentence
            .dep_graph_mut()
            .add_deprel(particle_relation)
            .unwrap();

        sentence
    }

    /// The edit trees that are expected for the example sentence, one
    /// per token. Note that the verb maps to *reisen*, without the
    /// separated prefix.
    fn sentence_edit_trees() -> Vec<EditTreeInner> {
        let ich = EditTreeInner::create_tree(&['I', 'c', 'h'], &['i', 'c', 'h']).unwrap();
        let reisen =
            EditTreeInner::create_tree(&['r', 'e', 'i', 's', 'e'], &['r', 'e', 'i', 's', 'e', 'n'])
                .unwrap();
        let ab = EditTreeInner::create_tree(&['a', 'b'], &['a', 'b']).unwrap();

        vec![ich, reisen, ab]
    }

    /// Encode `sent` and wrap every encoding in a single-element list
    /// with probability 1.0, which is the shape that `decode` consumes.
    fn encode_and_wrap(
        encoder: &TdzLemmaEncoder,
        sent: &Sentence,
    ) -> Vec<Vec<EncodingProb<EditTreeInner>>> {
        let mut wrapped = Vec::new();

        for encoding in encoder.encode(sent).unwrap() {
            wrapped.push(vec![EncodingProb::new(encoding, 1.0)]);
        }

        wrapped
    }

    #[test]
    fn encodes_with_transformations() {
        let original = example_sentence();

        let encoder = TdzLemmaEncoder::new(BackoffStrategy::Nothing);

        // The encoder transformations should have been applied before
        // the edit trees were created.
        let edit_trees = encoder.encode(&original).unwrap();
        assert_eq!(edit_trees, sentence_edit_trees());

        let labels = encode_and_wrap(&encoder, &original);

        let mut round_tripped = original.clone();
        encoder.decode(&labels, &mut round_tripped).unwrap();

        // The decoder transformations should restore the original
        // TüBa-D/Z-style lemmas.
        assert_eq!(original, round_tripped);
    }
}