//! charabia 0.9.9
//!
//! A simple library to detect the language, tokenize the text and normalize the tokens.
//! See the crate documentation for details.
use std::sync::LazyLock;

use fst::raw::Fst;

use crate::segmenter::utils::{BufferingStrategy, FstSegmenter};
use crate::segmenter::Segmenter;

/// German specialized [`Segmenter`].
///
/// This Segmenter uses a dictionary encoded as an FST to segment the provided text.
/// As the tests below demonstrate, this splits German compound words
/// (e.g. "Dampfschifffahrtskapitän" → "Dampf", "schifffahrts", "kapitän").
pub struct GermanSegmenter;

/// Lazily-parsed FST over the German word dictionary.
///
/// The dictionary bytes are embedded at compile time via `include_bytes!`;
/// the FST is only parsed on first access. The `unwrap` treats a malformed
/// embedded dictionary as a build-artifact bug rather than a runtime error.
static WORDS_FST: LazyLock<Fst<&[u8]>> = LazyLock::new(|| {
    Fst::new(&include_bytes!("../../dictionaries/fst/german/words.fst")[..]).unwrap()
});

/// Shared [`FstSegmenter`] backed by [`WORDS_FST`], used by every call to
/// [`GermanSegmenter::segment_str`].
static FST_SEGMENTER: LazyLock<FstSegmenter> = LazyLock::new(|| {
    // no max char count, so the segmenter will buffer the sequence until the next match is found
    FstSegmenter::new(&WORDS_FST, BufferingStrategy::UntilNextMatch { max_char_count: None })
});

impl Segmenter for GermanSegmenter {
    fn segment_str<'o>(&self, to_segment: &'o str) -> Box<dyn Iterator<Item = &'o str> + 'o> {
        FST_SEGMENTER.segment_str(to_segment)
    }
}

// Test the segmenter:
#[cfg(test)]
mod test {
    use crate::segmenter::test::test_segmenter;

    // Sample sentence mixing compound nouns, umlauts, digits and punctuation.
    const TEXT: &str =
        "Der Dampfschifffahrtskapitän fährt über den Mittellandkanal zur Strombrücke Magdeburg 123 456. Feuchteschutz insgesamt";

    // Expected raw segmentation of TEXT: compound words are split into their
    // dictionary components while casing and diacritics are preserved.
    const SEGMENTED: &[&str] = &[
        "Der",
        " ",
        "Dampf",
        "schifffahrts",
        "kapitän",
        " ",
        "fähr",
        "t",
        " ",
        "über",
        " ",
        "den",
        " ",
        "Mittel",
        "land",
        "kanal",
        " ",
        "zur",
        " ",
        "Strom",
        "brücke",
        " ",
        "Magdeburg",
        " ",
        "123",
        " ",
        "456",
        ". ",
        "Feuchte",
        "schutz",
        " ",
        "ins",
        "gesamt",
    ];

    // Expected tokens after normalization: lowercased, with umlauts/diacritics
    // folded to ASCII (e.g. "kapitän" → "kapitan", "über" → "uber").
    const TOKENIZED: &[&str] = &[
        "der",
        " ",
        "dampf",
        "schifffahrts",
        "kapitan",
        " ",
        "fahr",
        "t",
        " ",
        "uber",
        " ",
        "den",
        " ",
        "mittel",
        "land",
        "kanal",
        " ",
        "zur",
        " ",
        "strom",
        "brucke",
        " ",
        "magdeburg",
        " ",
        "123",
        " ",
        "456",
        ". ",
        "feuchte",
        "schutz",
        " ",
        "ins",
        "gesamt",
    ];

    // Macro that runs several tests on the Segmenter.
    // NOTE(review): this macro presumably also brings the parent module's items
    // (e.g. FST_SEGMENTER) into scope via `use super::*;` — the per-word tests
    // below rely on that; verify against the macro definition.
    test_segmenter!(GermanSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Latin, Language::Deu);

    // Helper macro: generates one #[test] that segments `$text` and asserts
    // the exact split `$segmented`, under the test name `$name`.
    macro_rules! test_segmentation {
        ($text:expr, $segmented:expr, $name:ident) => {
            #[test]
            fn $name() {
                let segmented_text: Vec<_> = FST_SEGMENTER.segment_str($text).collect::<Vec<_>>();
                assert_eq!(segmented_text, $segmented);
            }
        };
    }

    // Per-word compound-splitting cases, including very long compounds and
    // loanword compounds ("Volleyball", "Spikeball").
    test_segmentation!(
        "Literaturverwaltungsprogramm",
        &["Literatur", "verwaltungs", "programm"],
        word1
    );
    test_segmentation!("Schreibprozess", &["Schreib", "prozess"], word2);
    test_segmentation!("Interkulturalität", &["Inter", "kulturalität"], word3);
    test_segmentation!("Wissensorganisation", &["Wissens", "organisation"], word4);
    test_segmentation!("Aufgabenplanung", &["Aufgaben", "planung"], word5);
    test_segmentation!("Eisbrecher", &["Eis", "brecher"], word6);
    test_segmentation!("Zuckerei", &["Zucker", "ei"], word7);
    test_segmentation!("Glatteis", &["Glatt", "eis"], word8);
    test_segmentation!("Sinnfindung", &["Sinn", "findung"], word9);
    test_segmentation!(
        "Donaudampfschifffahrtsgesellschaftskapitän",
        &["Donau", "dampf", "schifffahrts", "gesellschafts", "kapitän"],
        word10
    );
    test_segmentation!(
        "Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz",
        &[
            "Rind",
            "fleisch",
            "etikettierungs",
            "überwachungs",
            "aufgaben",
            "übertragungs",
            "gesetz"
        ],
        word11
    );
    test_segmentation!(
        "Nahrungsmittelunverträglichkeitsdiagnoseverfahren",
        &["Nahrungs", "mittel", "un", "verträglichkeits", "diagnose", "verfahren"],
        word12
    );
    test_segmentation!("Volleyball", &["Volley", "ball"], word13);
    test_segmentation!("Fußball", &["Fuß", "ball"], word14);
    test_segmentation!("Beachvolleyball", &["Beach", "volley", "ball"], word15);
    test_segmentation!("Basketball", &["Basket", "ball"], word16);
    test_segmentation!("Handball", &["Hand", "ball"], word17);
    test_segmentation!("Spikeball", &["Spike", "ball"], word18);
}