hyphenation 0.6.0

Standard Knuth-Liang hyphenation based on the TeX UTF-8 patterns.
Documentation
//! Languages we can hyphenate and their default parameters, as provided by
//! the TeX `hyph-utf8` package.

#![allow(non_camel_case_types)]

use hyphenation_commons::{Exceptions, Patterns};

/// Hyphenation data bundled with the parameters needed to apply it.
///
/// A `Corpus` names its working language, holds the patterns and exceptions
/// applicable to it, and records the left and right intra-word hyphenation
/// boundaries.
#[derive(Debug, Clone)]
pub struct Corpus {
    /// The working language of this corpus.
    pub language: Language,
    /// The set of applicable hyphenation patterns.
    pub patterns: Patterns,
    /// The set of applicable hyphenation exceptions.
    pub exceptions: Exceptions,
    /// The left intra-word hyphenation boundary.
    pub left_min: usize,
    /// The right intra-word hyphenation boundary.
    pub right_min: usize,
}


use self::Language::*;

/// The set of available languages.
///
/// `Language` is a plain, copyable identifier. It can be compared, ordered,
/// and hashed, so it is usable as a key in both ordered and hashed
/// collections.
///
/// The derived ordering follows the declaration order of the variants below
/// (so, for instance, `Latin_Classic` sorts before `Latin`). Reordering the
/// variants therefore changes the result of comparisons.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Language {
    Afrikaans,
    Armenian,
    Assamese,
    Basque,
    Bengali,
    Bulgarian,
    Catalan,
    Chinese,
    Coptic,
    Croatian,
    Czech,
    Danish,
    Dutch,
    English_GB,
    English_US,
    Esperanto,
    Estonian,
    Ethiopic,
    Finnish,
    French,
    Friulan,
    Galician,
    Georgian,
    German_1901,
    German_1996,
    German_Swiss,
    Greek_Ancient,
    Greek_Mono,
    Greek_Poly,
    Gujarati,
    Hindi,
    Hungarian,
    Icelandic,
    Indonesian,
    Interlingua,
    Irish,
    Italian,
    Kannada,
    Kurmanji,
    // NOTE: `Latin_Classic` is declared before `Latin`; keep this order, as
    // the derived `PartialOrd`/`Ord` depend on it.
    Latin_Classic,
    Latin,
    Latvian,
    Lithuanian,
    Malayalam,
    Marathi,
    Mongolian,
    Norwegian_Bokmal,
    Norwegian_Nynorsk,
    Occitan,
    Oriya,
    Panjabi,
    Piedmontese,
    Polish,
    Portuguese,
    Romanian,
    Romansh,
    Russian,
    Sanskrit,
    Serbian_Cyrillic,
    Serbocroatian_Cyrillic,
    Serbocroatian_Latin,
    Slavonic_Church,
    Slovak,
    Slovenian,
    Spanish,
    Swedish,
    Tamil,
    Telugu,
    Thai,
    Turkish,
    Turkmen,
    Ukrainian,
    Uppersorbian,
    Welsh,
}


/// The TeX tag for a given language.
pub fn tag(lang: Language) -> &'static str {
    match lang {
        Afrikaans => "af",
        Armenian => "hy",
        Assamese => "as",
        Basque => "eu",
        Bengali => "bn",
        Bulgarian => "bg",
        Catalan => "ca",
        Chinese => "zh-latn-pinyin",
        Coptic => "cop",
        Croatian => "hr",
        Czech => "cs",
        Danish => "da",
        Dutch => "nl",
        English_GB => "en-gb",
        English_US => "en-us",
        Esperanto => "eo",
        Estonian => "et",
        Ethiopic => "mul-ethi",
        Finnish => "fi",
        French => "fr",
        Friulan => "fur",
        Galician => "gl",
        Georgian => "ka",
        German_1901  => "de-1901",
        German_1996  => "de-1996",
        German_Swiss => "de-ch-1901",
        Greek_Ancient => "grc",
        Greek_Mono => "el-monoton",
        Greek_Poly => "el-polyton",
        Gujarati => "gu",
        Hindi => "hi",
        Hungarian => "hu",
        Icelandic => "is",
        Indonesian => "id",
        Interlingua => "ia",
        Irish => "ga",
        Italian => "it",
        Kannada => "kn",
        Kurmanji => "kmr",
        Latin => "la",
        Latin_Classic => "la-x-classic",
        Latvian => "lv",
        Lithuanian => "lt",
        Malayalam => "ml",
        Marathi => "mr",
        Mongolian => "mn-cyrl",
        Norwegian_Bokmal  => "nb",
        Norwegian_Nynorsk => "nn",
        Occitan => "oc",
        Oriya => "or",
        Panjabi => "pa",
        Piedmontese => "pms",
        Polish => "pl",
        Portuguese => "pt",
        Romanian => "ro",
        Romansh => "rm",
        Russian => "ru",
        Sanskrit => "sa",
        Serbian_Cyrillic => "sr-cyrl",
        Serbocroatian_Cyrillic => "sh-cyrl",
        Serbocroatian_Latin => "sh-latn",
        Slavonic_Church => "cu",
        Slovak => "sk",
        Slovenian => "sl",
        Spanish => "es",
        Swedish => "sv",
        Tamil => "ta",
        Telugu => "te",
        Thai => "th",
        Turkish => "tr",
        Turkmen => "tk",
        Ukrainian => "uk",
        Uppersorbian => "hsb",
        Welsh => "cy"
    }
}

/// The default number of characters from the start and end of a word
/// which shall not be hyphenated.
pub fn mins(lang: Language) -> (usize, usize) {
    // NOTE: These values were taken directly from the relevant TeX packages, but
    // it is unclear how well they map to the notion of Unicode `char` in Rust.
    //
    // In the worst case, a language featuring graphemes larger than 1 `char` may
    // set boundaries mid-grapheme. This should be of no practical consequence,
    // since well-formed hyphenation patterns only match full graphemes.
    match lang {
        Afrikaans => (1, 2),
        Armenian => (1, 2),
        Assamese => (1, 1),
        Basque => (2, 2),
        Bengali => (1, 1),
        Bulgarian => (2, 2),
        Catalan => (2, 2),
        Chinese => (1, 1),
        Coptic => (1, 1),
        Croatian => (2, 2),
        Czech => (2, 3),
        Danish => (2, 2),
        Dutch => (2, 2),
        English_GB => (2, 3),
        English_US => (2, 3),
        Esperanto => (2, 2),
        Estonian => (2, 3),
        Ethiopic => (1, 1),
        Finnish => (2, 2),
        French => (2, 3),
        Friulan => (2, 2),
        Galician => (2, 2),
        Georgian => (1, 2),
        German_1901 => (2, 2),
        German_1996 => (2, 2),
        German_Swiss => (2, 2),
        Greek_Ancient => (1, 1),
        Greek_Mono => (1, 1),
        Greek_Poly => (1, 1),
        Gujarati => (1, 1),
        Hindi => (1, 1),
        Hungarian => (2, 2),
        Icelandic => (2, 2),
        Indonesian => (2, 2),
        Interlingua => (2, 2),
        Irish => (2, 3),
        Italian => (2, 2),
        Kannada => (1, 1),
        Kurmanji => (2, 2),
        Latin => (2, 2),
        Latin_Classic => (2, 2),
        Latvian => (2, 2),
        Lithuanian => (2, 2),
        Malayalam => (1, 1),
        Marathi => (1, 1),
        Mongolian => (2, 2),
        Norwegian_Bokmal => (2, 2),
        Norwegian_Nynorsk => (2, 2),
        Occitan => (2, 2),
        Oriya => (1, 1),
        Panjabi => (1, 1),
        Piedmontese => (2, 2),
        Polish => (2, 2),
        Portuguese => (2, 3),
        Romanian => (2, 2),
        Romansh => (2, 2),
        Russian => (2, 2),
        Sanskrit => (1, 3),
        Serbian_Cyrillic => (2, 2),
        Serbocroatian_Cyrillic => (2, 2),
        Serbocroatian_Latin => (2, 2),
        Slavonic_Church => (1, 2),
        Slovak => (2, 3),
        Slovenian => (2, 2),
        Spanish => (2, 2),
        Swedish => (2, 2),
        Tamil => (1, 1),
        Telugu => (1, 1),
        Thai => (2, 3),
        Turkish => (2, 2),
        Turkmen => (2, 2),
        Ukrainian => (2, 2),
        Uppersorbian => (2, 2),
        Welsh => (2, 3)
    }
}