// voikko-rs 1.1.0
//
// Rust bindings for the Voikko library
// Documentation
/*  voikko-rs - libvoikko bindings for the Rust programming language
    Copyright (C) 2019-2022 Ronja Koistinen

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

*/
#![warn(missing_docs)]
#![warn(clippy::pedantic)]
#![allow(clippy::must_use_candidate)]
#![allow(clippy::similar_names)]

//! This module provides Rust bindings for libvoikko.
//!
//! Libvoikko provides spell checking, hyphenation, grammar checking and
//! morphological analysis for the Finnish language.
//!
//! voikko-rs requires libvoikko (version 4.1.1 or greater)
//! to be installed on your system.
//!
mod libvoikko;
mod tests;

/// This module contains the functions, types and structs of the crate.
pub mod voikko {

    use crate::libvoikko;
    use std::collections::HashMap;
    use std::error;
    use unicode_segmentation::UnicodeSegmentation;

    /// Returns the version number of libvoikko.
    ///
    /// This forwards directly to `libvoikko::version()` in the crate's private
    /// binding module and hands back the string slice it produces.
    pub fn version<'a>() -> &'a str {
        libvoikko::version()
    }

    /// Description of one dictionary that libvoikko can use.
    ///
    /// Holds the language, script and variant identifiers of the dictionary
    /// together with its human readable description.
    #[allow(missing_docs)]
    #[derive(Debug, PartialEq, Eq)]
    pub struct Dictionary {
        // Language of the dictionary.
        pub language: String,
        // Script of the dictionary.
        pub script: String,
        // Variant of the dictionary.
        pub variant: String,
        // Human readable description of the dictionary.
        pub description: String,
    }

    impl Dictionary {
        /// Builds a `Dictionary` by copying the four given string slices into
        /// owned fields.
        ///
        /// # Arguments
        ///
        /// * `language` - language of the dictionary
        /// * `script` - script of the dictionary
        /// * `variant` - variant of the dictionary
        /// * `description` - human readable description
        #[must_use]
        pub fn new(language: &str, script: &str, variant: &str, description: &str) -> Dictionary {
            let language = language.to_owned();
            let script = script.to_owned();
            let variant = variant.to_owned();
            let description = description.to_owned();
            Self {
                language,
                script,
                variant,
                description,
            }
        }
    }

    /// A morphological analysis item
    ///
    /// A plain string-to-string map. The concrete keys and values are whatever
    /// the private `libvoikko::analyze_word` binding fills in; see
    /// [`Voikko::analyze()`].
    pub type Analysis = HashMap<String, String>;

    /// Lists the dictionaries that are available to libvoikko.
    ///
    /// If the underlying lookup reports an error, the error is discarded and an
    /// empty vector is returned instead.
    ///
    /// # Arguments
    ///
    /// * `path` - Directory that is searched for dictionary files before the
    ///   standard dictionary locations. Pass an empty string to look only in
    ///   the standard locations.
    pub fn list_dicts(path: &str) -> Vec<Dictionary> {
        let found = libvoikko::list_dicts(path);
        found.unwrap_or_default()
    }

    /// Lists the language codes for which at least one spell checking
    /// dictionary is available.
    ///
    /// The codes conform to BCP 47. Typically they consist of only a language
    /// subtag, but they may also take the form Language-Script,
    /// Language-Region or Language-Script-Region when such variants are widely
    /// used for a particular language.
    ///
    /// If the underlying lookup reports an error, the error is discarded and an
    /// empty vector is returned instead.
    ///
    /// # Arguments
    ///
    /// * `path` - Directory that is searched for dictionary files before the
    ///   standard dictionary locations. Pass an empty string to look only in
    ///   the standard locations.
    pub fn list_supported_spelling_languages(path: &str) -> Vec<String> {
        let languages = libvoikko::list_supported_spelling_languages(path);
        languages.unwrap_or_default()
    }

    /// Hyphenation counterpart of `list_supported_spelling_languages()`.
    ///
    /// If the underlying lookup reports an error, the error is discarded and an
    /// empty vector is returned instead.
    ///
    /// # Arguments
    ///
    /// * `path` - Directory that is searched for dictionary files before the
    ///   standard dictionary locations. Pass an empty string to look only in
    ///   the standard locations.
    pub fn list_supported_hyphenation_languages(path: &str) -> Vec<String> {
        let languages = libvoikko::list_supported_hyphenation_languages(path);
        languages.unwrap_or_default()
    }

    /// Grammar checking counterpart of `list_supported_spelling_languages()`.
    ///
    /// If the underlying lookup reports an error, the error is discarded and an
    /// empty vector is returned instead.
    ///
    /// # Arguments
    ///
    /// * `path` - Directory that is searched for dictionary files before the
    ///   standard dictionary locations. Pass an empty string to look only in
    ///   the standard locations.
    pub fn list_supported_grammar_checking_languages(path: &str) -> Vec<String> {
        let languages = libvoikko::list_supported_grammar_checking_languages(path);
        languages.unwrap_or_default()
    }

    /// A Voikko instance
    ///
    /// Wraps the raw handle that `libvoikko::init` returns. The handle is passed
    /// to `libvoikko::terminate` when the instance is dropped.
    ///
    /// # Example
    ///
    /// ```
    /// extern crate voikko_rs; // in Rust 2015
    /// use voikko_rs::voikko;
    ///
    /// fn main() {
    ///     let v = voikko::Voikko::new("fi-x-morphoid", None).unwrap();
    ///     assert_eq!(v.hyphenate("kunnallispolitiikka", "-"),
    ///                Ok(String::from("kun-nal-lis-po-li-tiik-ka")));
    /// }
    /// ```
    pub struct Voikko {
        // Raw libvoikko handle obtained in `Voikko::new`; every method passes
        // it on to the private binding functions.
        handle: *mut libvoikko::VoikkoHandle,
    }

    /// A spell check return value
    ///
    /// Produced by [`Voikko::spell()`], which maps the integer code reported by
    /// the binding onto these variants.
    #[derive(Debug, PartialEq, Eq)]
    pub enum SpellReturn {
        /// Incorrect spelling (code 0). Also returned when the binding call
        /// itself reports an error.
        SpellFailed,
        /// Correct spelling (code 1)
        SpellOk,
        /// Internal error from libvoikko (any code other than 0, 1 or 3)
        InternalError,
        /// libvoikko failed to convert character sets (code 3)
        CharsetConversionFailed,
    }

    /// Type of a token returned by [`Voikko::tokens()`]
    #[derive(Debug, PartialEq, Eq)]
    #[allow(missing_docs)]
    pub enum TokenType {
        // No token was found; `Voikko::tokens()` stops when it sees this type.
        None,
        // A word.
        Word,
        // Punctuation.
        Punctuation,
        // Whitespace.
        Whitespace,
        // Any raw token type that is none of the above.
        Unknown,
    }

    /// One unit produced by tokenization: a piece of text and its type.
    #[derive(Debug, PartialEq, Eq)]
    pub struct Token {
        /// Text of the token
        pub token_text: String,
        /// Type of the token
        pub token_type: TokenType,
    }

    #[allow(missing_docs)]
    impl Token {
        // Builds a token that owns a copy of `token_text` and carries the
        // given `token_type`.
        pub fn new(token_text: &str, token_type: TokenType) -> Token {
            let token_text = token_text.to_owned();
            Self {
                token_text,
                token_type,
            }
        }
    }

    /// Type of a following sentence
    #[derive(Debug, PartialEq, Eq, Clone, Copy)]
    pub enum SentenceType {
        /// End of text reached or error.
        None,
        /// This is not a start of a new sentence.
        NoStart,
        /// This is a probable start of a new sentence.
        Probable,
        /// This may be a start of a new sentence.
        Possible,
    }

    /// A sentence
    ///
    /// Returned by [`Voikko::sentences()`]. Both fields are public, like those
    /// of [`Token`], so that callers can read the text and the start type of
    /// the following sentence.
    #[derive(Debug, PartialEq, Eq)]
    pub struct Sentence {
        /// Text of the sentence
        pub text: String,
        /// The type of the next sentence
        pub next_start_type: SentenceType,
    }

    #[allow(missing_docs)]
    impl Sentence {
        // Builds a sentence that owns a copy of `sentence_text`;
        // `sentence_type` is stored as the start type of the next sentence.
        pub fn new(sentence_text: &str, sentence_type: SentenceType) -> Sentence {
            Sentence {
                text: String::from(sentence_text),
                next_start_type: sentence_type,
            }
        }
    }

    #[derive(Debug, PartialEq, Eq)]
    /// Grammar error
    ///
    /// One entry of the vector returned by [`Voikko::grammar_errors()`].
    pub struct GrammarError {
        /// Error code
        pub code: i32,
        /// Start position of the error in characters
        pub start_pos: usize,
        /// Length of the error in characters
        pub length: usize,
        /// A list of suggestions for correcting the grammar error
        pub suggestions: Vec<String>,
        /// A localized short description of the grammar error
        pub description: String,
    }

    /// Error raised when libvoikko cannot be initialized.
    ///
    /// Carries a single human readable message, which is what `Display` prints.
    #[derive(Debug)]
    pub struct InitError {
        message: String,
    }

    #[allow(missing_docs)]
    impl InitError {
        // Creates an error that owns a copy of `message`.
        pub fn new(message: &str) -> InitError {
            let message = message.to_owned();
            Self { message }
        }
    }

    impl std::fmt::Display for InitError {
        // Writes the stored message verbatim.
        fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
            f.write_str(&self.message)
        }
    }

    impl error::Error for InitError {
        // Legacy accessor kept for callers that still use `description()`.
        fn description(&self) -> &str {
            &self.message
        }
    }

    impl std::convert::From<std::ffi::NulError> for InitError {
        // The displayed text of the `NulError` becomes the message.
        fn from(error: std::ffi::NulError) -> Self {
            let message = error.to_string();
            Self { message }
        }
    }

    /// Error raised when a string cannot be hyphenated.
    ///
    /// Carries a single human readable message, which is what `Display` prints.
    #[derive(Debug, PartialEq, Eq)]
    pub struct HyphenateError {
        message: String,
    }

    #[allow(missing_docs)]
    impl HyphenateError {
        // Creates an error that owns a copy of `message`.
        pub fn new(message: &str) -> Self {
            let message = message.to_owned();
            Self { message }
        }
    }

    impl std::fmt::Display for HyphenateError {
        // Writes the stored message verbatim.
        fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
            f.write_str(&self.message)
        }
    }

    impl error::Error for HyphenateError {
        // Legacy accessor kept for callers that still use `description()`.
        fn description(&self) -> &str {
            &self.message
        }
    }

    impl std::convert::From<std::ffi::NulError> for HyphenateError {
        // The displayed text of the `NulError` becomes the message.
        fn from(error: std::ffi::NulError) -> Self {
            let message = error.to_string();
            Self { message }
        }
    }

    impl std::convert::From<std::str::Utf8Error> for HyphenateError {
        // The displayed text of the `Utf8Error` becomes the message.
        fn from(error: std::str::Utf8Error) -> Self {
            let message = error.to_string();
            Self { message }
        }
    }

    impl Voikko {
        /// Initializes libvoikko and wraps the resulting handle in a `Voikko`.
        ///
        /// # Arguments
        ///
        /// * `language` - BCP 47 language tag for the language to be used.
        ///   Private use subtags can be used to specify the dictionary variant.
        /// * `path` - Directory that is searched for dictionary files before the
        ///   standard dictionary locations. With `None`, no additional search
        ///   path is used.
        ///
        /// # Errors
        ///
        /// Returns the `InitError` reported by the binding if init fails.
        pub fn new(language: &str, path: Option<&str>) -> Result<Voikko, InitError> {
            // On success only the handle needs wrapping; an error passes through untouched.
            libvoikko::init(language, path).map(|handle| Voikko { handle })
        }

        /// Checks the spelling of a UTF-8 word.
        ///
        /// The integer code reported by the binding is translated as follows:
        /// `0` is [`SpellReturn::SpellFailed`], `1` is [`SpellReturn::SpellOk`],
        /// `3` is [`SpellReturn::CharsetConversionFailed`] and every other code
        /// is [`SpellReturn::InternalError`]. If the binding call itself
        /// reports an error, the result is [`SpellReturn::SpellFailed`].
        ///
        /// # Arguments
        ///
        /// * `word` - word to check
        #[must_use]
        pub fn spell(&self, word: &str) -> SpellReturn {
            match libvoikko::spell(self.handle, word) {
                Ok(1) => SpellReturn::SpellOk,
                Ok(3) => SpellReturn::CharsetConversionFailed,
                // A failed binding call is reported the same way as a misspelling.
                Ok(0) | Err(_) => SpellReturn::SpellFailed,
                Ok(_) => SpellReturn::InternalError,
            }
        }

        /// Looks up suggested correct spellings for a UTF-8 encoded word.
        ///
        /// Returns the suggestions as a vector of strings. The vector is empty
        /// when there are no suggestions or when the binding reports an error.
        ///
        /// # Arguments
        ///
        /// * `word` - word to find suggestions for
        #[must_use]
        pub fn suggest(&self, word: &str) -> Vec<String> {
            let suggestions = libvoikko::suggest(self.handle, word);
            suggestions.unwrap_or_default()
        }

        /// Hyphenates the given word in UTF-8 encoding.
        /// Returns a string containing the hyphenation using the following notation:
        /// * `' '` = no hyphenation at this character,
        /// * `'-'` = hyphenation point (character at this position
        ///        is preserved in the hyphenated form),
        /// * `'='` = hyphenation point (character at this position
        ///        is replaced by the hyphen.)
        ///
        /// # Arguments
        ///
        /// * `word` - word to hyphenate
        ///
        /// # Errors
        ///
        /// Returns the `Err(bool)` produced by the private `libvoikko::hyphens`
        /// binding unchanged.
        pub fn hyphens(&self, word: &str) -> Result<String, bool> {
            libvoikko::hyphens(self.handle, word)
        }

        /// Hyphenates the given UTF-8 word and returns it with the
        /// caller-supplied string inserted at every hyphenation point.
        ///
        /// The pattern from [`Voikko::hyphens()`] is walked in step with the
        /// word: at a `-` marker the hyphen is inserted before the character,
        /// at a `=` marker the hyphen replaces the character, and at any other
        /// marker the character is copied as is.
        ///
        /// # Arguments
        ///
        /// * `word` - word to hyphenate
        /// * `hyphen` - string to insert at hyphenation points
        ///
        /// # Errors
        ///
        /// Returns `Err(false)` whenever [`Voikko::hyphens()`] fails.
        pub fn hyphenate(&self, word: &str, hyphen: &str) -> Result<String, bool> {
            let pattern = self.hyphens(word).map_err(|_| false)?;

            // NOTE(review): word and pattern are paired grapheme by grapheme;
            // confirm that the pattern carries one marker per grapheme rather
            // than per code point.
            let mut hyphenated = String::with_capacity(word.len() + hyphen.len());
            for (piece, marker) in word.graphemes(true).zip(pattern.graphemes(true)) {
                match marker {
                    "-" => {
                        hyphenated.push_str(hyphen);
                        hyphenated.push_str(piece);
                    }
                    "=" => hyphenated.push_str(hyphen),
                    _ => hyphenated.push_str(piece),
                }
            }
            Ok(hyphenated)
        }

        /// Hyphenates the given word in UTF-8 encoding.
        /// Returns a string where caller-supplied characters are inserted in all hyphenation points.
        /// **Requires libvoikko version 4.2.0 or greater.**
        ///
        /// Unlike [`Voikko::hyphenate()`], the insertion is done by the private
        /// `libvoikko::insert_hyphens` binding, whose result is returned unchanged.
        ///
        /// # Arguments
        ///
        /// * `word` - word to hyphenate
        /// * `character` - string to insert at hyphenation points
        /// * `allow_context_changes` - boolean parameter controlling whether to insert hyphens even if they alter the word
        ///
        /// # Examples
        ///
        /// ```
        /// # use voikko_rs::voikko;
        /// # let v = voikko::Voikko::new("fi-x-morphoid", None).unwrap();
        /// // Voikko initialized on the variable v
        /// let hyphenated1 = v.hyphenate_new("rei'ittää", "-", true);
        /// assert_eq!(hyphenated1, Ok(String::from("rei-it-tää")));
        /// let hyphenated2 = v.hyphenate_new("rei'ittää", "-", false);
        /// assert_eq!(hyphenated2, Ok(String::from("rei'it-tää")));
        ///
        /// ```
        ///
        /// # Errors
        ///
        /// Is Err if libvoikko returns a null pointer, i.e. it fails to hyphenate.
        pub fn hyphenate_new(&self, word: &str, character: &str, allow_context_changes: bool) -> Result<String, HyphenateError> {
            libvoikko::insert_hyphens(self.handle, word, character, allow_context_changes)
        }

        /// Tokenize a text string. Returns a vector of Token structs.
        ///
        /// The text is consumed from the front: each call to the binding
        /// reports the type of the next token and its length in characters,
        /// and the matching prefix is split off as the token text.
        /// Tokenization stops at the end of the text, when the binding reports
        /// [`TokenType::None`], or when a reported token would not advance
        /// through the text (which would otherwise loop forever).
        ///
        /// # Arguments
        ///
        /// * `text` - Text to find tokens in.
        #[allow(clippy::match_wildcard_for_single_variants)]
        #[must_use]
        pub fn tokens(&self, text: &str) -> Vec<Token> {
            let mut tokenlist = Vec::new();
            let mut rest = text;
            while !rest.is_empty() {
                let (raw_token, token_len) = libvoikko::next_token(self.handle, rest);
                let token_type = match raw_token {
                    libvoikko::voikko_token_type::TOKEN_NONE => TokenType::None,
                    libvoikko::voikko_token_type::TOKEN_PUNCTUATION => TokenType::Punctuation,
                    libvoikko::voikko_token_type::TOKEN_WHITESPACE => TokenType::Whitespace,
                    libvoikko::voikko_token_type::TOKEN_WORD => TokenType::Word,
                    _ => TokenType::Unknown,
                };
                if token_type == TokenType::None {
                    break;
                }
                // token_len counts characters, not bytes: find the byte offset
                // just past that many characters (or the end of the text).
                let byte_len = rest
                    .char_indices()
                    .nth(token_len)
                    .map_or(rest.len(), |(idx, _)| idx);
                if byte_len == 0 {
                    // A zero-length token makes no progress; stop instead of spinning.
                    break;
                }
                let (token_text, tail) = rest.split_at(byte_len);
                tokenlist.push(Token::new(token_text, token_type));
                rest = tail;
            }
            tokenlist
        }

        /// Find sentences in a text string. Returns a vector of Sentence structs.
        ///
        /// The text is consumed from the front: each call to the binding
        /// reports the length of the next sentence in characters together with
        /// the start type of the sentence that follows it. Processing stops at
        /// the end of the text, after a sentence whose following type is
        /// [`SentenceType::None`], or when a reported sentence would not
        /// advance through the text (which would otherwise loop forever).
        ///
        /// # Arguments
        ///
        /// * `text` - Text to find sentences in.
        #[allow(clippy::match_wildcard_for_single_variants)]
        #[must_use]
        pub fn sentences(&self, text: &str) -> Vec<Sentence> {
            let mut sentlist = Vec::new();
            let mut rest = text;
            let mut next_start_type = SentenceType::NoStart;
            while !rest.is_empty() && next_start_type != SentenceType::None {
                let (raw_sent, sent_len) = libvoikko::next_sentence(self.handle, rest);
                next_start_type = match raw_sent {
                    libvoikko::voikko_sentence_type::SENTENCE_NO_START => SentenceType::NoStart,
                    libvoikko::voikko_sentence_type::SENTENCE_POSSIBLE => SentenceType::Possible,
                    libvoikko::voikko_sentence_type::SENTENCE_PROBABLE => SentenceType::Probable,
                    _ => SentenceType::None,
                };
                // sent_len counts characters, not bytes: find the byte offset
                // just past that many characters (or the end of the text).
                let byte_len = rest
                    .char_indices()
                    .nth(sent_len)
                    .map_or(rest.len(), |(idx, _)| idx);
                let (sentence_text, tail) = rest.split_at(byte_len);
                sentlist.push(Sentence::new(sentence_text, next_start_type));
                if byte_len == 0 {
                    // A zero-length sentence makes no progress; stop instead of spinning.
                    break;
                }
                rest = tail;
            }
            sentlist
        }

        /// Runs morphological analysis on the given word.
        ///
        /// Returns one [`Analysis`] map (`std::collections::HashMap`) per
        /// analysis result, or an empty vector if the analysis fails.
        ///
        /// # Arguments
        ///
        /// * `word` - word to analyze
        // https://github.com/voikko/corevoikko/blob/rel-libvoikko-4.1.1/libvoikko/doc/morphological-analysis.txt
        #[must_use]
        pub fn analyze(&self, word: &str) -> Vec<Analysis> {
            let analyses = libvoikko::analyze_word(self.handle, word);
            analyses.unwrap_or_default()
        }

        /// Finds all grammar errors in the given text.
        ///
        /// Returns a vector of `GrammarError` structs. The vector is empty when
        /// no errors are found or when the binding reports an error.
        ///
        /// # Arguments
        ///
        /// * `text` - Text to find grammar errors in. The text should usually
        ///   begin at the start of a paragraph or sentence.
        /// * `desc_lang` - ISO language code for the language in which to
        ///   receive error descriptions.
        #[must_use]
        pub fn grammar_errors(&self, text: &str, desc_lang: &str) -> Vec<GrammarError> {
            let errors = libvoikko::get_grammar_errors(self.handle, text, desc_lang);
            errors.unwrap_or_default()
        }

        // Values of option constants documented in
        // https://github.com/voikko/corevoikko/blob/rel-libvoikko-4.1.1/libvoikko/src/voikko_defines.h

        // Boolean options

        /// Ignore dot at the end of the word (needed for use in some word processors).
        /// If this option is set and input word ends with a dot, spell checking and
        /// hyphenation functions try to analyze the word without the dot if no results
        /// can be obtained for the original form. Also with this option, string tokenizer
        /// will consider trailing dot of a word to be a part of that word.
        ///
        /// Default: false
        ///
        /// Returns the `bool` handed back by the private `set_bool_option` binding
        /// (presumably `true` when the option was applied; confirm against the binding).
        pub fn set_opt_ignore_dot(&self, value: bool) -> bool {
            // 0 is the libvoikko option id for this setting (ids are listed in voikko_defines.h).
            libvoikko::set_bool_option(self.handle, 0, value)
        }

        /// (Spell checking only) Ignore words containing numbers
        ///
        /// Default: false
        ///
        /// Returns the `bool` handed back by the private `set_bool_option` binding
        /// (presumably `true` when the option was applied; confirm against the binding).
        pub fn set_opt_ignore_numbers(&self, value: bool) -> bool {
            // 1 is the libvoikko option id for this setting (ids are listed in voikko_defines.h).
            libvoikko::set_bool_option(self.handle, 1, value)
        }

        /// Accept words that are written completely in uppercase letters without checking
        /// them at all.
        ///
        /// Default: false
        ///
        /// Returns the `bool` handed back by the private `set_bool_option` binding
        /// (presumably `true` when the option was applied; confirm against the binding).
        pub fn set_opt_ignore_uppercase(&self, value: bool) -> bool {
            // 3 is the libvoikko option id for this setting (ids are listed in voikko_defines.h).
            libvoikko::set_bool_option(self.handle, 3, value)
        }

        /// Accept words even when the first letter is in uppercase (start of sentence etc.)
        ///
        /// Default: true
        ///
        /// Returns the `bool` handed back by the private `set_bool_option` binding
        /// (presumably `true` when the option was applied; confirm against the binding).
        pub fn set_opt_accept_first_uppercase(&self, value: bool) -> bool {
            // 6 is the libvoikko option id for this setting (ids are listed in voikko_defines.h).
            libvoikko::set_bool_option(self.handle, 6, value)
        }

        /// Accept words even when all of the letters are in uppercase. Note that this is
        /// not the same as `set_opt_ignore_uppercase(true)`: with this option the word is still
        /// checked, only case differences are ignored.
        ///
        /// Default: true
        ///
        /// Returns the `bool` handed back by the private `set_bool_option` binding
        /// (presumably `true` when the option was applied; confirm against the binding).
        pub fn set_opt_accept_all_uppercase(&self, value: bool) -> bool {
            // 7 is the libvoikko option id for this setting (ids are listed in voikko_defines.h).
            libvoikko::set_bool_option(self.handle, 7, value)
        }

        /// Do not insert hyphenation positions that are considered to be ugly but correct
        ///
        /// Default: false
        ///
        /// Returns the `bool` handed back by the private `set_bool_option` binding
        /// (presumably `true` when the option was applied; confirm against the binding).
        pub fn set_opt_no_ugly_hyphenation(&self, value: bool) -> bool {
            // 4 is the libvoikko option id for this setting (ids are listed in voikko_defines.h).
            libvoikko::set_bool_option(self.handle, 4, value)
        }

        /// Use suggestions optimized for optical character recognition software.
        /// By default suggestions are optimized for typing errors.
        ///
        /// Default: false
        ///
        /// Returns the `bool` handed back by the private `set_bool_option` binding
        /// (presumably `true` when the option was applied; confirm against the binding).
        pub fn set_opt_ocr_suggestions(&self, value: bool) -> bool {
            // 8 is the libvoikko option id for this setting (ids are listed in voikko_defines.h).
            libvoikko::set_bool_option(self.handle, 8, value)
        }

        /// (Spell checking only): Ignore non-words such as URLs and email addresses.
        ///
        /// Default: true
        ///
        /// Returns the `bool` handed back by the private `set_bool_option` binding
        /// (presumably `true` when the option was applied; confirm against the binding).
        pub fn set_opt_ignore_nonwords(&self, value: bool) -> bool {
            // 10 is the libvoikko option id for this setting (ids are listed in voikko_defines.h).
            libvoikko::set_bool_option(self.handle, 10, value)
        }

        /// (Spell checking only): Allow some extra hyphens in words. This option relaxes
        /// hyphen checking rules to work around some unresolved issues in the underlying
        /// morphology, but it may cause some incorrect words to be accepted. The exact
        /// behavior (if any) of this option is not specified.
        ///
        /// Default: false
        ///
        /// Returns the `bool` handed back by the private `set_bool_option` binding
        /// (presumably `true` when the option was applied; confirm against the binding).
        pub fn set_opt_accept_extra_hyphens(&self, value: bool) -> bool {
            // 11 is the libvoikko option id for this setting (ids are listed in voikko_defines.h).
            libvoikko::set_bool_option(self.handle, 11, value)
        }

        /// (Spell checking only): Accept missing hyphens at the start and end of the word.
        /// Some application programs do not consider hyphens to be word characters. This
        /// is a reasonable assumption for many languages but not for Finnish. If the
        /// application cannot be fixed to use a proper tokenisation algorithm for Finnish,
        /// this option may be used to tell libvoikko to work around this defect.
        ///
        /// Default: false
        ///
        /// Returns the `bool` handed back by the private `set_bool_option` binding
        /// (presumably `true` when the option was applied; confirm against the binding).
        pub fn set_opt_accept_missing_hyphens(&self, value: bool) -> bool {
            // 12 is the libvoikko option id for this setting (ids are listed in voikko_defines.h).
            libvoikko::set_bool_option(self.handle, 12, value)
        }

        /// (Grammar checking only): Accept incomplete sentences that could occur in
        /// titles or headings. Set this option to true if your application is not able
        /// to differentiate titles from normal text paragraphs, or if you know that
        /// you are checking title text.
        ///
        /// Default: false
        ///
        /// Returns the `bool` handed back by the private `set_bool_option` binding
        /// (presumably `true` when the option was applied; confirm against the binding).
        pub fn set_opt_accept_titles_in_gc(&self, value: bool) -> bool {
            // 13 is the libvoikko option id for this setting (ids are listed in voikko_defines.h).
            libvoikko::set_bool_option(self.handle, 13, value)
        }

        /// (Grammar checking only): Accept incomplete sentences at the end of the
        /// paragraph. These may exist when text is still being written.
        ///
        /// Default: false
        ///
        /// Returns the `bool` handed back by the private `set_bool_option` binding
        /// (presumably `true` when the option was applied; confirm against the binding).
        pub fn set_opt_accept_unfinished_paragraphs_in_gc(&self, value: bool) -> bool {
            // 14 is the libvoikko option id for this setting (ids are listed in voikko_defines.h).
            libvoikko::set_bool_option(self.handle, 14, value)
        }

        /// (Hyphenation only): Hyphenate unknown words.
        ///
        /// Default: true
        ///
        /// Returns the `bool` handed back by the private `set_bool_option` binding
        /// (presumably `true` when the option was applied; confirm against the binding).
        pub fn set_opt_hyphenate_unknown_words(&self, value: bool) -> bool {
            // 15 is the libvoikko option id for this setting (ids are listed in voikko_defines.h).
            libvoikko::set_bool_option(self.handle, 15, value)
        }

        /// (Grammar checking only): Accept paragraphs if they would be valid within
        /// bulleted lists.
        ///
        /// Default: false
        ///
        /// Returns the `bool` handed back by the private `set_bool_option` binding
        /// (presumably `true` when the option was applied; confirm against the binding).
        pub fn set_opt_accept_bulleted_lists_in_gc(&self, value: bool) -> bool {
            // 16 is the libvoikko option id for this setting (ids are listed in voikko_defines.h).
            libvoikko::set_bool_option(self.handle, 16, value)
        }

        // Integer options

        /// The minimum length for words that may be hyphenated. This limit is also enforced on
        /// individual parts of compound words.
        ///
        /// Default: 2
        ///
        /// Returns the `bool` handed back by the private `set_int_option` binding
        /// (presumably `true` when the option was applied; confirm against the binding).
        pub fn set_min_hyphenated_word_length(&self, value: i32) -> bool {
            // 9 is the libvoikko option id for this setting (ids are listed in voikko_defines.h).
            libvoikko::set_int_option(self.handle, 9, value)
        }

        /// Size of the spell checker cache. This can be -1 (no cache) or a value
        /// that is at least 0 (size in bytes = `2^cache_size * (6544*sizeof(wchar_t) + 1008)`).
        ///
        /// Default: 0
        ///
        /// Returns the `bool` handed back by the private `set_int_option` binding
        /// (presumably `true` when the option was applied; confirm against the binding).
        pub fn set_speller_cache_size(&self, value: i32) -> bool {
            // 17 is the libvoikko option id for this setting (ids are listed in voikko_defines.h).
            libvoikko::set_int_option(self.handle, 17, value)
        }
    }

    // Hands the wrapped handle to `libvoikko::terminate` when a `Voikko` value
    // goes out of scope, so callers never have to terminate it manually.
    impl Drop for Voikko {
        fn drop(&mut self) {
            libvoikko::terminate(self.handle);
        }
    }
}