mod automaton;
mod constants;
mod error;
mod transform;
use lazy_static::lazy_static;
use serde::{Deserialize, Serialize};
use transform::delemmatization::{
RemoveAlternatives, RemoveReflexiveTag, RemoveSepVerbPrefix, RemoveTruncMarker,
};
use transform::lemmatization::{
AddReflexiveTag, AddSeparatedVerbPrefix, FormAsLemma, MarkVerbPrefix, RestoreCase,
};
use transform::misc::{
SimplifyArticleLemma, SimplifyPIAT, SimplifyPIDAT, SimplifyPIS, SimplifyPossesivePronounLemma,
};
use transform::Transforms;
use udgraph::graph::Sentence;
use crate::lemma::{BackoffStrategy, EditTreeEncoder};
use crate::{EncodingProb, SentenceDecoder, SentenceEncoder};
lazy_static! {
    // Transformations that `TdzLemmaEncoder::decode` applies to a sentence
    // after the wrapped edit tree encoder has decoded it.
    //
    // NOTE(review): presumably the transformations run in the order listed
    // here -- confirm against `Transforms::transform` before reordering.
    static ref DECODE_TRANSFORMS: Transforms = Transforms(vec![
        Box::new(FormAsLemma),
        Box::new(RestoreCase),
        Box::new(AddReflexiveTag),
        Box::new(AddSeparatedVerbPrefix::new(true)),
        Box::new(MarkVerbPrefix::new()),
        Box::new(SimplifyArticleLemma),
        Box::new(SimplifyPossesivePronounLemma),
        Box::new(SimplifyPIS),
        Box::new(SimplifyPIDAT),
        Box::new(SimplifyPIAT),
    ]);

    // Transformations that `TdzLemmaEncoder::encode` applies to a copy of the
    // input sentence before handing it to the wrapped edit tree encoder.
    //
    // NOTE(review): same ordering assumption as for `DECODE_TRANSFORMS`.
    static ref ENCODE_TRANSFORMS: Transforms = Transforms(vec![
        Box::new(RemoveAlternatives),
        Box::new(RemoveReflexiveTag),
        Box::new(RemoveSepVerbPrefix),
        Box::new(RemoveTruncMarker),
        Box::new(SimplifyArticleLemma),
        Box::new(SimplifyPossesivePronounLemma),
        Box::new(FormAsLemma),
    ]);
}
/// Lemma encoder that wraps an [`EditTreeEncoder`].
///
/// The `SentenceEncoder` and `SentenceDecoder` implementations of this type
/// delegate to the wrapped encoder and additionally run the module's
/// `ENCODE_TRANSFORMS` (before encoding) and `DECODE_TRANSFORMS` (after
/// decoding) over the sentence.
///
/// Because of `#[serde(transparent)]`, a value of this type is serialized
/// and deserialized exactly like the wrapped [`EditTreeEncoder`].
#[derive(Serialize, Deserialize)]
#[serde(transparent)]
pub struct TdzLemmaEncoder {
    /// The edit tree encoder that performs the actual encoding/decoding.
    inner: EditTreeEncoder,
}
impl TdzLemmaEncoder {
    /// Construct a lemma encoder.
    ///
    /// `backoff_strategy` is passed unchanged to [`EditTreeEncoder::new`]
    /// to create the wrapped encoder.
    pub fn new(backoff_strategy: BackoffStrategy) -> Self {
        let inner = EditTreeEncoder::new(backoff_strategy);
        Self { inner }
    }
}
impl SentenceDecoder for TdzLemmaEncoder {
    type Encoding = <EditTreeEncoder as SentenceDecoder>::Encoding;
    type Error = <EditTreeEncoder as SentenceDecoder>::Error;

    /// Decode `labels` into `sentence`.
    ///
    /// The wrapped edit tree encoder decodes first. Only when it succeeds
    /// are `DECODE_TRANSFORMS` applied to `sentence`.
    ///
    /// # Errors
    ///
    /// Returns the error of the wrapped encoder unchanged. In that case the
    /// decode transformations are not applied.
    fn decode<S: AsRef<[EncodingProb<Self::Encoding>]>>(
        &self,
        labels: &[S],
        sentence: &mut Sentence,
    ) -> Result<(), Self::Error> {
        self.inner.decode(labels, sentence).map(|_| {
            DECODE_TRANSFORMS.transform(sentence);
        })
    }
}
impl SentenceEncoder for TdzLemmaEncoder {
    type Encoding = <EditTreeEncoder as SentenceEncoder>::Encoding;
    type Error = <EditTreeEncoder as SentenceEncoder>::Error;

    /// Encode `sentence`.
    ///
    /// `ENCODE_TRANSFORMS` are applied to a clone of `sentence`, so the
    /// caller's sentence is left untouched. The transformed clone is then
    /// encoded by the wrapped edit tree encoder.
    ///
    /// # Errors
    ///
    /// Returns whatever error the wrapped encoder reports.
    fn encode(&self, sentence: &Sentence) -> Result<Vec<Self::Encoding>, Self::Error> {
        let mut transformed = sentence.clone();
        ENCODE_TRANSFORMS.transform(&mut transformed);
        self.inner.encode(&transformed)
    }
}
#[cfg(test)]
mod tests {
    use std::iter::FromIterator;

    use udgraph::graph::{DepTriple, Sentence};
    use udgraph::token::TokenBuilder;

    use super::TdzLemmaEncoder;
    use crate::lemma::edit_tree::EditTree as EditTreeInner;
    use crate::lemma::BackoffStrategy;
    use crate::{EncodingProb, SentenceDecoder, SentenceEncoder};

    /// Builds the fixture sentence "Ich reise ab" with UPOS, XPOS and lemma
    /// annotations and a single `compound:prt` dependency relation.
    fn example_sentence() -> Sentence {
        let mut sent = Sentence::from_iter(vec![
            TokenBuilder::new("Ich")
                .upos("PRON")
                .xpos("PPER")
                .lemma("ich")
                .into(),
            TokenBuilder::new("reise")
                .upos("VERB")
                .xpos("VVFIN")
                .lemma("ab#reisen")
                .into(),
            TokenBuilder::new("ab")
                .upos("ADP")
                .xpos("PTKVZ")
                .lemma("ab")
                .into(),
        ]);

        // NOTE(review): presumably this attaches the particle "ab" to the
        // verb "reise" -- confirm the index convention of `DepTriple::new`.
        sent.dep_graph_mut()
            .add_deprel(DepTriple::new(2, Some("compound:prt"), 3))
            .unwrap();

        sent
    }

    /// Edit trees that encoding `example_sentence()` is expected to yield.
    ///
    /// Note that the tree for "reise" targets "reisen", not the annotated
    /// lemma "ab#reisen".
    fn sentence_edit_trees() -> Vec<EditTreeInner> {
        // Builds the edit tree that rewrites `form` into `lemma`.
        let tree = |form: &str, lemma: &str| {
            let form_chars: Vec<char> = form.chars().collect();
            let lemma_chars: Vec<char> = lemma.chars().collect();
            EditTreeInner::create_tree(form_chars.as_slice(), lemma_chars.as_slice()).unwrap()
        };

        vec![
            tree("Ich", "ich"),
            tree("reise", "reisen"),
            tree("ab", "ab"),
        ]
    }

    /// Encodes `sent` and wraps each encoding in a single-element list with
    /// probability 1.0, which is the label shape that `decode` accepts.
    fn encode_and_wrap(
        encoder: &TdzLemmaEncoder,
        sent: &Sentence,
    ) -> Vec<Vec<EncodingProb<EditTreeInner>>> {
        let encodings = encoder.encode(sent).unwrap();
        encodings
            .into_iter()
            .map(|encoding| vec![EncodingProb::new(encoding, 1.0)])
            .collect()
    }

    #[test]
    fn encodes_with_transformations() {
        let encoder = TdzLemmaEncoder::new(BackoffStrategy::Nothing);
        let sent = example_sentence();

        // Encoding must yield the expected edit trees.
        let encoded = encoder.encode(&sent).unwrap();
        assert_eq!(encoded, sentence_edit_trees());

        // Decoding the encoded labels must reproduce the original sentence.
        let labels = encode_and_wrap(&encoder, &sent);
        let mut round_tripped = sent.clone();
        encoder.decode(&labels, &mut round_tripped).unwrap();
        assert_eq!(sent, round_tripped);
    }
}