language-tokenizer 0.2.0

Text tokenizer for linguistic purposes, such as text matching. Supports more than 40 languages, including English, French, Russian, Japanese, Thai etc.
Documentation
use language_tokenizer::*;

const FUZZY_THRESHOLD: f64 = 0.8;

macro_rules! text_match_tests {
    (
        $(
            $name:ident {
                term: $term:expr,
                term_translation: $term_translation:expr,
                source_text: $source_text:expr,
                source_translation: $source_translation:expr,
                source_algorithm: $source_algorithm:expr,
                translation_algorithm: $translation_algorithm:expr,
                source_case_sensitivity: $source_case_sensitivity:expr,
                translation_case_sensitivity: $translation_case_sensitivity:expr,
                match_mode: $match_mode:expr,
                should_match: $should_match:expr,
                permissive_match: $permissive_match:expr,
            }
        ),* $(,)?
    ) => {
        $(
            #[test]
            fn $name() {
                let (term_tokens, term_translation_tokens) = (
                    tokenize(
                        $term,
                        $source_algorithm,
                        $source_case_sensitivity,
                    ).unwrap(),
                    tokenize(
                        $term_translation,
                        $translation_algorithm,
                        $translation_case_sensitivity
                    ).unwrap()
                );

                let (source_tokens, translation_tokens) = (
                    tokenize(
                        $source_text,
                        $source_algorithm,
                        $source_case_sensitivity,
                    ).unwrap(),
                    tokenize(
                        $source_translation,
                        $translation_algorithm,
                        $translation_case_sensitivity
                    ).unwrap()
                );

                let source_match = find_match(
                    &source_tokens,
                    &term_tokens,
                    $match_mode,
                    $permissive_match
                );

                let translation_match = find_match(
                    &translation_tokens,
                    &term_translation_tokens,
                    $match_mode,
                    $permissive_match,
                );

                assert_eq!(
                    source_match.is_some(), $should_match,
                    "Matching {source_tokens:?} and {term_tokens:?} failed",
                );

                assert_eq!(
                    translation_match.is_some(), $should_match,
                    "Matching {translation_tokens:?} and {term_translation_tokens:?} failed",
                );
            }
        )*
    };
}

text_match_tests! {
    exact_english_match {
        term: "climate change",
        term_translation: "climate change",
        source_text: "The effects of climate change are accelerating.",
        source_translation: "The effects of climate change are accelerating.",
        source_algorithm: Algorithm::English,
        translation_algorithm: Algorithm::English,
        source_case_sensitivity: false,
        translation_case_sensitivity: false,
        match_mode: MatchMode::Exact,
        should_match: true,
        permissive_match: false,
    },
    stemmed_english_match {
        term: "run fast",
        term_translation: "run fast",
        source_text: "She was running fast to catch the bus.",
        source_translation: "She was running fast to catch the bus.",
        source_algorithm: Algorithm::English,
        translation_algorithm: Algorithm::English,
        source_case_sensitivity: false,
        translation_case_sensitivity: false,
        match_mode: MatchMode::Exact,
        should_match: true,
        permissive_match: false,
    },
    fuzzy_english_match {
        term: "color theory",
        term_translation: "color theory",
        source_text: "This course focuses on colour theory fundamentals.",
        source_translation: "This course focuses on colour theory fundamentals.",
        source_algorithm: Algorithm::English,
        translation_algorithm: Algorithm::English,
        source_case_sensitivity: false,
        translation_case_sensitivity: false,
        match_mode: MatchMode::Fuzzy {threshold: FUZZY_THRESHOLD},
        should_match: true,
        permissive_match: false,
    },
    japanese_to_english_match {
        term: "人工知能",
        term_translation: "artificial intelligence",
        source_text: "人工知能は多くの分野で使われています。",
        source_translation: "Artificial intelligence is used in many fields.",
        source_algorithm: Algorithm::Japanese,
        translation_algorithm: Algorithm::English,
        source_case_sensitivity: false,
        translation_case_sensitivity: false,
        match_mode: MatchMode::Exact,
        should_match: true,
        permissive_match: false,
    },
    chinese_to_english_fuzzy_translation {
        term: "机器学习",
        term_translation: "machine learning",
        source_text: "机器学习正在改变世界。",
        source_translation: "Machine-learning techniques are changing the world.",
        source_algorithm: Algorithm::Chinese,
        translation_algorithm: Algorithm::English,
        source_case_sensitivity: false,
        translation_case_sensitivity: false,
        match_mode: MatchMode::Fuzzy {threshold: FUZZY_THRESHOLD},
        should_match: true,
        permissive_match: false,
    },
    term_not_present {
        term: "quantum computing",
        term_translation: "quantum computing",
        source_text: "Classical computers are widely used.",
        source_translation: "Classical computers are widely used.",
        source_algorithm: Algorithm::English,
        translation_algorithm: Algorithm::English,
        source_case_sensitivity: false,
        translation_case_sensitivity: false,
        match_mode: MatchMode::Fuzzy {threshold: FUZZY_THRESHOLD},
        should_match: false,
        permissive_match: false,
    },
    french_to_english_match {
        term: "apprentissage automatique",
        term_translation: "machine learning",
        source_text: "L'apprentissage automatique transforme de nombreux secteurs.",
        source_translation: "Machine learning is transforming many industries.",
        source_algorithm: Algorithm::French,
        translation_algorithm: Algorithm::English,
        source_case_sensitivity: false,
        translation_case_sensitivity: false,
        match_mode: MatchMode::Exact,
        should_match: true,
        permissive_match: false,
    },
    russian_to_english_match {
        term: "искусственный интеллект",
        term_translation: "artificial intelligence",
        source_text: "Искусственный интеллект развивается быстро.",
        source_translation: "Artificial intelligence is developing rapidly.",
        source_algorithm: Algorithm::Russian,
        translation_algorithm: Algorithm::English,
        source_case_sensitivity: false,
        translation_case_sensitivity: false,
        match_mode: MatchMode::Exact,
        should_match: true,
        permissive_match: false,
    },
    norwegian_to_english_match {
        term: "maskinlæring",
        term_translation: "machine learning",
        source_text: "Maskinlæring brukes i mange bransjer.",
        source_translation: "Machine learning is used in many industries.",
        source_algorithm: Algorithm::Norwegian,
        translation_algorithm: Algorithm::English,
        source_case_sensitivity: false,
        translation_case_sensitivity: false,
        match_mode: MatchMode::Exact,
        should_match: true,
        permissive_match: false,
    },
    indonesian_to_english_match {
        term: "pembelajaran mesin",
        term_translation: "machine learning",
        source_text: "Pembelajaran mesin mengubah cara kita bekerja.",
        source_translation: "Machine learning is changing how we work.",
        source_algorithm: Algorithm::Indonesian,
        translation_algorithm: Algorithm::English,
        source_case_sensitivity: false,
        translation_case_sensitivity: false,
        match_mode: MatchMode::Exact,
        should_match: true,
        permissive_match: false,
    },
    thai_to_english_match {
        term: "การเรียนรู้ของเครื่อง",
        term_translation: "machine learning",
        source_text: "การเรียนรู้ของเครื่องกำลังเปลี่ยนแปลงโลก",
        source_translation: "Machine learning is changing the world.",
        source_algorithm: Algorithm::Thai,
        translation_algorithm: Algorithm::English,
        source_case_sensitivity: false,
        translation_case_sensitivity: false,
        match_mode: MatchMode::Exact,
        should_match: true,
        permissive_match: false,
    },
    greek_to_english_match {
        term: "τεχνητή νοημοσύνη",
        term_translation: "artificial intelligence",
        source_text: "Η τεχνητή νοημοσύνη έχει πολλές εφαρμογές.",
        source_translation: "Artificial intelligence has many applications.",
        source_algorithm: Algorithm::Greek,
        translation_algorithm: Algorithm::English,
        source_case_sensitivity: false,
        translation_case_sensitivity: false,
        match_mode: MatchMode::Exact,
        should_match: true,
        permissive_match: false,
    },
    german_to_english_match {
        term: "künstliche Intelligenz",
        term_translation: "artificial intelligence",
        source_text: "Künstliche Intelligenz verändert unsere Gesellschaft.",
        source_translation: "Artificial intelligence is transforming our society.",
        source_algorithm: Algorithm::German,
        translation_algorithm: Algorithm::English,
        source_case_sensitivity: false,
        translation_case_sensitivity: false,
        match_mode: MatchMode::Exact,
        should_match: true,
        permissive_match: false,
    },
    german_fuzzy_match {
        term: "maschinelles Lernen",
        term_translation: "machine learning",
        source_text: "Maschinen-Lernen ist eine Schlüsseltechnologie.",
        source_translation: "Machine-learning is a key technology.",
        source_algorithm: Algorithm::German,
        translation_algorithm: Algorithm::English,
        source_case_sensitivity: false,
        translation_case_sensitivity: false,
        match_mode: MatchMode::Fuzzy {threshold: FUZZY_THRESHOLD},
        should_match: true,
        permissive_match: false,
    },
    lowercase_no_match {
        term: "Downtown",
        term_translation: "Даунтаун",
        source_text: "There are strange things going in the downtown.",
        source_translation: "Странные вещи творятся в деловом районе.",
        source_algorithm: Algorithm::English,
        translation_algorithm: Algorithm::Russian,
        source_case_sensitivity: true,
        translation_case_sensitivity: true,
        match_mode: MatchMode::Exact,
        should_match: false,
        permissive_match: false,
    },
    lowercase_match {
        term: "Downtown",
        term_translation: "Даунтаун",
        source_text: "There are strange things going in The Downtown.",
        source_translation: "Странные вещи творятся в Даунтауне.",
        source_algorithm: Algorithm::English,
        translation_algorithm: Algorithm::Russian,
        source_case_sensitivity: true,
        translation_case_sensitivity: true,
        match_mode: MatchMode::Exact,
        should_match: true,
        permissive_match: false,
    },
    permissive_match {
        term: "Downtown",
        term_translation: "Даунтаун",
        source_text: "There are strange things going in THE DOWNTOWN.",
        source_translation: "Странные вещи творятся в ДАУНТАУНЕ.",
        source_algorithm: Algorithm::English,
        translation_algorithm: Algorithm::Russian,
        source_case_sensitivity: true,
        translation_case_sensitivity: true,
        match_mode: MatchMode::Both {threshold: 0.85},
        should_match: true,
        permissive_match: true,
    },
}