use std::sync::LazyLock;
use fst::raw::Fst;
use crate::segmenter::utils::{BufferingStrategy, FstSegmenter};
use crate::segmenter::Segmenter;
/// German specialized [`Segmenter`].
///
/// Segmentation is delegated to a shared FST-backed segmenter built over a
/// German word dictionary that is embedded in the binary.
pub struct GermanSegmenter;
/// German word dictionary, embedded into the binary at compile time and parsed
/// as an FST on first access.
///
/// # Panics
///
/// The first access panics if the embedded bytes are not a valid FST. That can
/// only happen when the bundled dictionary file itself is corrupted, so it is
/// treated as a build-time bug rather than a recoverable error.
static WORDS_FST: LazyLock<Fst<&[u8]>> = LazyLock::new(|| {
    Fst::new(&include_bytes!("../../dictionaries/fst/german/words.fst")[..])
        .expect("embedded German words dictionary must be a valid FST")
});
/// Shared FST-backed segmenter, built lazily over `WORDS_FST` on first use.
static FST_SEGMENTER: LazyLock<FstSegmenter> = LazyLock::new(|| {
    // NOTE(review): `max_char_count: None` presumably leaves the buffered run
    // uncapped until the next dictionary match; confirm against `BufferingStrategy`.
    let strategy = BufferingStrategy::UntilNextMatch { max_char_count: None };
    FstSegmenter::new(&WORDS_FST, strategy)
});
impl Segmenter for GermanSegmenter {
fn segment_str<'o>(&self, to_segment: &'o str) -> Box<dyn Iterator<Item = &'o str> + 'o> {
FST_SEGMENTER.segment_str(to_segment)
}
}
#[cfg(test)]
mod test {
    use crate::segmenter::test::test_segmenter;

    // Sample sentence handed to the shared `test_segmenter!` suite.
    const TEXT: &str =
        "Der Dampfschifffahrtskapitän fährt über den Mittellandkanal zur Strombrücke Magdeburg 123 456. Feuchteschutz insgesamt";

    // Expected segments of `TEXT`; each row covers one word of the sentence
    // together with the separator that follows it.
    #[rustfmt::skip]
    const SEGMENTED: &[&str] = &[
        "Der", " ",
        "Dampf", "schifffahrts", "kapitän", " ",
        "fähr", "t", " ",
        "über", " ",
        "den", " ",
        "Mittel", "land", "kanal", " ",
        "zur", " ",
        "Strom", "brücke", " ",
        "Magdeburg", " ",
        "123", " ",
        "456", ". ",
        "Feuchte", "schutz", " ",
        "ins", "gesamt",
    ];

    // Counterpart of `SEGMENTED` passed to `test_segmenter!` as the tokenized
    // form: the same pieces, lowercased and without diacritics.
    #[rustfmt::skip]
    const TOKENIZED: &[&str] = &[
        "der", " ",
        "dampf", "schifffahrts", "kapitan", " ",
        "fahr", "t", " ",
        "uber", " ",
        "den", " ",
        "mittel", "land", "kanal", " ",
        "zur", " ",
        "strom", "brucke", " ",
        "magdeburg", " ",
        "123", " ",
        "456", ". ",
        "feuchte", "schutz", " ",
        "ins", "gesamt",
    ];

    // Runs the crate's common segmenter tests against the German segmenter.
    test_segmenter!(GermanSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Latin, Language::Deu);

    // Generates a `#[test]` named `$test_name` asserting that `FST_SEGMENTER`
    // splits `$input` into exactly the pieces listed in `$expected`.
    //
    // NOTE(review): `FST_SEGMENTER` is not imported explicitly in this module;
    // presumably the `test_segmenter!` expansion brings the parent module's
    // items into scope. Confirm before reordering or removing that invocation.
    macro_rules! test_segmentation {
        ($input:expr, $expected:expr, $test_name:ident) => {
            #[test]
            fn $test_name() {
                let actual = FST_SEGMENTER.segment_str($input).collect::<Vec<_>>();
                assert_eq!(actual, $expected);
            }
        };
    }

    // Two- and three-part compounds.
    test_segmentation!(
        "Literaturverwaltungsprogramm",
        &["Literatur", "verwaltungs", "programm"],
        word1
    );
    test_segmentation!("Schreibprozess", &["Schreib", "prozess"], word2);
    test_segmentation!("Interkulturalität", &["Inter", "kulturalität"], word3);
    test_segmentation!("Wissensorganisation", &["Wissens", "organisation"], word4);
    test_segmentation!("Aufgabenplanung", &["Aufgaben", "planung"], word5);
    test_segmentation!("Eisbrecher", &["Eis", "brecher"], word6);
    test_segmentation!("Zuckerei", &["Zucker", "ei"], word7);
    test_segmentation!("Glatteis", &["Glatt", "eis"], word8);
    test_segmentation!("Sinnfindung", &["Sinn", "findung"], word9);

    // Very long compounds.
    test_segmentation!(
        "Donaudampfschifffahrtsgesellschaftskapitän",
        &["Donau", "dampf", "schifffahrts", "gesellschafts", "kapitän"],
        word10
    );
    test_segmentation!(
        "Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz",
        &[
            "Rind",
            "fleisch",
            "etikettierungs",
            "überwachungs",
            "aufgaben",
            "übertragungs",
            "gesetz"
        ],
        word11
    );
    test_segmentation!(
        "Nahrungsmittelunverträglichkeitsdiagnoseverfahren",
        &["Nahrungs", "mittel", "un", "verträglichkeits", "diagnose", "verfahren"],
        word12
    );

    // Compounds ending in "ball".
    test_segmentation!("Volleyball", &["Volley", "ball"], word13);
    test_segmentation!("Fußball", &["Fuß", "ball"], word14);
    test_segmentation!("Beachvolleyball", &["Beach", "volley", "ball"], word15);
    test_segmentation!("Basketball", &["Basket", "ball"], word16);
    test_segmentation!("Handball", &["Hand", "ball"], word17);
    test_segmentation!("Spikeball", &["Spike", "ball"], word18);
}