voikko_rs/
lib.rs

1/*  voikko-rs - libvoikko bindings for the Rust programming language
2    Copyright (C) 2019-2022 Ronja Koistinen
3
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation, either version 3 of the License, or
7    (at your option) any later version.
8
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17*/
18#![warn(missing_docs)]
19#![warn(clippy::pedantic)]
20#![allow(clippy::must_use_candidate)]
21#![allow(clippy::similar_names)]
22
23//! This module provides Rust bindings for libvoikko.
24//!
25//! Libvoikko provides spell checking, hyphenation, grammar checking and
26//! morphological analysis for the Finnish language.
27//!
28//! voikko-rs requires libvoikko (version 4.1.1 or greater)
29//! to be installed on your system.
30//!
31mod libvoikko;
32mod tests;
33
34/// This module contains the functions, types and structs of the crate.
35pub mod voikko {
36
37    use crate::libvoikko;
38    use std::collections::HashMap;
39    use std::error;
40    use unicode_segmentation::UnicodeSegmentation;
41
    /// Returns the version number of libvoikko.
    ///
    /// The string is whatever the binding layer's `libvoikko::version()` reports;
    /// this wrapper forwards it unchanged.
    // NOTE(review): the lifetime `'a` is not tied to any input, so the returned
    // string presumably refers to static data owned by libvoikko — confirm in the binding.
    pub fn version<'a>() -> &'a str {
        libvoikko::version()
    }
46
47    /// Information about an available dictionary
48    ///
49    /// Contains the language, script, variant and human readable description
50    /// of the dictionary.
51    #[allow(missing_docs)]
52    #[derive(Debug, PartialEq, Eq)]
53    pub struct Dictionary {
54        pub language: String,
55        pub script: String,
56        pub variant: String,
57        pub description: String,
58    }
59
60    impl Dictionary {
61        /// Construct new Dictionary struct.
62        ///
63        /// # Arguments
64        ///
65        /// * `language`
66        /// * `script`
67        /// * `variant`
68        /// * `description`
69        #[must_use]
70        pub fn new(language: &str, script: &str, variant: &str, description: &str) -> Dictionary {
71            Dictionary {
72                language: String::from(language),
73                script: String::from(script),
74                variant: String::from(variant),
75                description: String::from(description),
76            }
77        }
78    }
79
    /// A morphological analysis item
    ///
    /// A map from string keys to string values; [`Voikko::analyze()`] returns a
    /// vector of these.
    pub type Analysis = HashMap<String, String>;
82
83    /// Get a list of available dictionaries. Returns a vector of Dictionary structs.
84    ///
85    /// # Arguments
86    ///
87    /// * `path` - Path to a directory from which dictionary files should be searched
88    ///            first before looking into the standard dictionary locations.
89    ///            Pass an empty string in order to only look in standard locations.
90    pub fn list_dicts(path: &str) -> Vec<Dictionary> {
91        libvoikko::list_dicts(path).unwrap_or_else(|_| vec![])
92    }
93
94    /// Return a list of language codes representing the languages for which at least one
95    /// dictionary is available for spell checking. The codes conform to those specified
96    /// in BCP 47. Typically the returned codes consist of only BCP 47 language subtags.
97    /// They may also include tags in format Language-Script, Language-Region, or
98    /// Language-Script-Region if such variants are widely used for a particular language.
99    ///
100    /// # Arguments
101    ///
102    /// * `path` - Path to a directory from which dictionary files should be searched
103    ///            first before looking into the standard dictionary locations.
104    ///            Pass an empty string in order to only look in standard locations.
105    pub fn list_supported_spelling_languages(path: &str) -> Vec<String> {
106        libvoikko::list_supported_spelling_languages(path).unwrap_or_else(|_| vec![])
107    }
108
109    /// Same as `list_supported_spelling_languages()` but for hyphenation.
110    ///
111    /// # Arguments
112    ///
113    /// * `path` - Path to a directory from which dictionary files should be searched
114    ///            first before looking into the standard dictionary locations.
115    ///            Pass an empty string in order to only look in standard locations.
116    pub fn list_supported_hyphenation_languages(path: &str) -> Vec<String> {
117        libvoikko::list_supported_hyphenation_languages(path).unwrap_or_else(|_| vec![])
118    }
119
120    /// Same as `list_supported_spelling_languages()` but for grammar checking.
121    ///
122    /// # Arguments
123    ///
124    /// * `path` - Path to a directory from which dictionary files should be searched
125    ///            first before looking into the standard dictionary locations.
126    ///            Pass an empty string in order to only look in standard locations.
127    pub fn list_supported_grammar_checking_languages(path: &str) -> Vec<String> {
128        libvoikko::list_supported_grammar_checking_languages(path).unwrap_or_else(|_| vec![])
129    }
130
    /// A Voikko instance
    ///
    /// Created with [`Voikko::new()`]. The wrapped libvoikko handle is handed to
    /// `libvoikko::terminate` when the value is dropped.
    ///
    /// # Example
    ///
    /// ```
    /// extern crate voikko_rs; // in Rust 2015
    /// use voikko_rs::voikko;
    ///
    /// fn main() {
    ///     let v = voikko::Voikko::new("fi-x-morphoid", None).unwrap();
    ///     assert_eq!(v.hyphenate("kunnallispolitiikka", "-"),
    ///                Ok(String::from("kun-nal-lis-po-li-tiik-ka")));
    /// }
    /// ```
    pub struct Voikko {
        // Raw handle returned by `libvoikko::init`; passed to every binding call.
        handle: *mut libvoikko::VoikkoHandle,
    }
148
    /// A spell check return value, as produced by [`Voikko::spell()`]
    #[derive(Debug, PartialEq, Eq)]
    pub enum SpellReturn {
        /// Incorrect spelling. Also reported when the binding call itself returns an error.
        SpellFailed,
        /// Correct spelling
        SpellOk,
        /// Internal error from libvoikko
        InternalError,
        /// libvoikko failed to convert character sets
        CharsetConversionFailed,
    }
161
    /// Type of a token returned by [`Voikko::tokens()`]
    #[derive(Debug, PartialEq, Eq)]
    #[allow(missing_docs)]
    pub enum TokenType {
        // No token. `Voikko::tokens()` stops when it sees this type and never
        // includes it in its result.
        None,
        Word,
        Punctuation,
        Whitespace,
        // Fallback used by `Voikko::tokens()` for any raw token type that is none
        // of the above.
        Unknown,
    }
172
173    /// Tokenization unit
174    #[derive(Debug, PartialEq, Eq)]
175    pub struct Token {
176        /// Text of the token
177        pub token_text: String,
178        /// Type of the token
179        pub token_type: TokenType,
180    }
181
182    #[allow(missing_docs)]
183    impl Token {
184        pub fn new(token_text: &str, token_type: TokenType) -> Token {
185            Token {
186                token_text: String::from(token_text),
187                token_type,
188            }
189        }
190    }
191
    /// Type of a following sentence
    #[derive(Debug, PartialEq, Eq, Clone, Copy)]
    pub enum SentenceType {
        /// End of text reached or error.
        None,
        /// This is not a start of a new sentence.
        NoStart,
        /// This is a probable start of a new sentence.
        Probable,
        /// This may be a start of a new sentence.
        Possible,
    }
204
205    /// A sentence
206    #[derive(Debug, PartialEq, Eq)]
207    pub struct Sentence {
208        /// Text of the sentence
209        text: String,
210        /// The type of the next sentence
211        next_start_type: SentenceType,
212    }
213
214    #[allow(missing_docs)]
215    impl Sentence {
216        pub fn new(sentence_text: &str, sentence_type: SentenceType) -> Sentence {
217            Sentence {
218                text: String::from(sentence_text),
219                next_start_type: sentence_type,
220            }
221        }
222    }
223
    #[derive(Debug, PartialEq, Eq)]
    /// Grammar error, as returned by [`Voikko::grammar_errors()`]
    pub struct GrammarError {
        /// Error code
        pub code: i32,
        /// Start position of the error in characters
        pub start_pos: usize,
        /// Length of the error in characters
        pub length: usize,
        /// A list of suggestions for correcting the grammar error
        pub suggestions: Vec<String>,
        /// A localized short description of the grammar error
        pub description: String,
    }
238
239    #[derive(Debug)]
240    /// Error in initializing libvoikko
241    pub struct InitError {
242        message: String,
243    }
244
245    #[allow(missing_docs)]
246    impl InitError {
247        pub fn new(message: &str) -> InitError {
248            InitError {
249                message: String::from(message),
250            }
251        }
252    }
253
254    impl std::fmt::Display for InitError {
255        fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
256            write!(f, "{}", self.message)
257        }
258    }
259
260    impl error::Error for InitError {
261        fn description(&self) -> &str {
262            self.message.as_str()
263        }
264    }
265
266    impl std::convert::From<std::ffi::NulError> for InitError {
267        fn from(error: std::ffi::NulError) -> Self {
268            InitError {
269                message: format!("{}", error)
270            }
271        }
272    }
273
274    #[derive(Debug, PartialEq, Eq)]
275    /// Error hyphenating a string
276    pub struct HyphenateError {
277        message: String,
278    }
279
280    #[allow(missing_docs)]
281    impl HyphenateError {
282        pub fn new(message: &str) -> Self {
283            HyphenateError {
284                message: String::from(message),
285            }
286        }
287    }
288
289    impl std::fmt::Display for HyphenateError {
290        fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
291            write!(f, "{}", self.message)
292        }
293    }
294
295    impl error::Error for HyphenateError {
296        fn description(&self) -> &str {
297            self.message.as_str()
298        }
299    }
300
301    impl std::convert::From<std::ffi::NulError> for HyphenateError {
302        fn from(error: std::ffi::NulError) -> Self {
303            HyphenateError {
304                message: format!("{}", error)
305            }
306        }
307    }
308
309    impl std::convert::From<std::str::Utf8Error> for HyphenateError {
310        fn from(error: std::str::Utf8Error) -> Self {
311            HyphenateError {
312                message: format!("{}", error)
313            }
314        }
315    }
316
317    impl Voikko {
318        /// Initializes Voikko and returns a `Result<Voikko, InitError>`
319        ///
320        /// # Arguments
321        ///
322        /// * `language` - BCP 47 language tag for the language to be used.
323        ///                Private use subtags can be used to specify the dictionary variant.
324        /// * `path` - Path to a directory from which dictionary files should be searched first before
325        ///            looking into the standard dictionary locations. If `None`, no additional search path
326        ///            will be used.
327        ///
328        /// # Errors
329        ///
330        /// Returns an `InitError` result if init fails.
331        pub fn new(language: &str, path: Option<&str>) -> Result<Voikko, InitError> {
332            let v = libvoikko::init(language, path);
333
334            match v {
335                Ok(handle) => Ok(Voikko { handle }),
336                Err(error) => Err(error),
337            }
338        }
339
340        /// Check the spelling of a UTF-8 character string.
341        ///
342        /// # Arguments
343        ///
344        /// * `word` - word to check
345        #[must_use]
346        pub fn spell(&self, word: &str) -> SpellReturn {
347            let ret = libvoikko::spell(self.handle, word);
348            match ret {
349                Ok(code) => match code {
350                    0 => SpellReturn::SpellFailed,
351                    1 => SpellReturn::SpellOk,
352                    3 => SpellReturn::CharsetConversionFailed,
353                    _ => SpellReturn::InternalError,
354                },
355                Err(_) => SpellReturn::SpellFailed,
356            }
357
358        }
359
360        /// Finds suggested correct spellings for given UTF-8 encoded word.
361        /// Returns a vector of strings - an empty vector, if no suggestions.
362        ///
363        /// # Arguments
364        ///
365        /// * `word` - word to find suggestions for
366        #[must_use]
367        pub fn suggest(&self, word: &str) -> Vec<String> {
368            libvoikko::suggest(self.handle, word).unwrap_or_else(|_| vec![])
369        }
370
        /// Hyphenates the given word in UTF-8 encoding.
        /// Returns a string containing the hyphenation using the following notation:
        /// * `' '` = no hyphenation at this character,
        /// * `'-'` = hyphenation point (character at this position
        ///        is preserved in the hyphenated form),
        /// * `'='` = hyphenation point (character at this position
        ///        is replaced by the hyphen.)
        ///
        /// # Arguments
        ///
        /// * `word` - word to hyphenate
        ///
        /// # Errors
        ///
        /// Returns the `Err` from the binding call unchanged. The payload is a bare
        /// `bool` and carries no further detail about the failure.
        pub fn hyphens(&self, word: &str) -> Result<String, bool> {
            libvoikko::hyphens(self.handle, word)
        }
389
390        /// Hyphenates the given word in UTF-8 encoding.
391        /// Returns a string where caller-supplied characters are inserted in all hyphenation points.
392        ///
393        /// # Arguments
394        ///
395        /// * `word` - word to hyphenate
396        /// * `hyphen` - string to insert at hyphenation points
397        ///
398        /// # Errors
399        ///
400        /// Returns an error result on error.
401        pub fn hyphenate(&self, word: &str, hyphen: &str) -> Result<String, bool> {
402            let hyphens = self.hyphens(word);
403            match hyphens {
404                Err(_) => Err(false),
405                Ok(hyph) => Ok(word
406                    .graphemes(true)
407                    .zip(hyph.graphemes(true))
408                    .map(|(w, h)| match h {
409                        // " " => String::from(w),
410                        "-" => format!("{}{}", hyphen, w),
411                        "=" => String::from(hyphen),
412                        _ => String::from(w),
413                    })
414                    .collect::<String>()),
415            }
416        }
417
        /// Hyphenates the given word in UTF-8 encoding.
        /// Returns a string where caller-supplied characters are inserted in all hyphenation points.
        /// **Requires libvoikko version 4.2.0 or greater.**
        ///
        /// Unlike [`Voikko::hyphenate()`], the insertion is done by the binding's
        /// `insert_hyphens` call rather than in Rust.
        ///
        /// # Arguments
        ///
        /// * `word` - word to hyphenate
        /// * `character` - string to insert at hyphenation points
        /// * `allow_context_changes` - boolean parameter controlling whether to insert hyphens even if they alter the word
        ///
        /// # Examples
        ///
        /// ```
        /// # use voikko_rs::voikko;
        /// # let v = voikko::Voikko::new("fi-x-morphoid", None).unwrap();
        /// // Voikko initialized on the variable v
        /// let hyphenated1 = v.hyphenate_new("rei'ittää", "-", true);
        /// assert_eq!(hyphenated1, Ok(String::from("rei-it-tää")));
        /// let hyphenated2 = v.hyphenate_new("rei'ittää", "-", false);
        /// assert_eq!(hyphenated2, Ok(String::from("rei'it-tää")));
        ///
        /// ```
        ///
        /// # Errors
        ///
        /// Is Err if libvoikko returns a null pointer, i.e. it fails to hyphenate.
        pub fn hyphenate_new(&self, word: &str, character: &str, allow_context_changes: bool) -> Result<String, HyphenateError> {
            libvoikko::insert_hyphens(self.handle, word, character, allow_context_changes)
        }
447
448        /// Tokenize a text string. Returns a vector of Token structs.
449        ///
450        /// # Arguments
451        ///
452        /// * `text` - Text to find tokens in.
453        #[allow(clippy::match_wildcard_for_single_variants)]
454        #[must_use]
455        pub fn tokens(&self, text: &str) -> Vec<Token> {
456            let mut tokenlist = Vec::new();
457            let mut offset = 0;
458            while offset < text.len() {
459                let (raw_token, token_len) = libvoikko::next_token(self.handle, &text[offset..]);
460                let token_type = match raw_token {
461                    libvoikko::voikko_token_type::TOKEN_NONE => TokenType::None,
462                    libvoikko::voikko_token_type::TOKEN_PUNCTUATION => TokenType::Punctuation,
463                    libvoikko::voikko_token_type::TOKEN_WHITESPACE => TokenType::Whitespace,
464                    libvoikko::voikko_token_type::TOKEN_WORD => TokenType::Word,
465                    _ => TokenType::Unknown,
466                };
467                if token_type == TokenType::None {
468                    break;
469                }
470                let token_text: String = text[offset..].chars().take(token_len).collect();
471                let token = Token::new(&token_text, token_type);
472                tokenlist.push(token);
473                offset += token_text.as_bytes().len();
474            }
475            tokenlist
476        }
477
478        /// Find sentences in a text string. Returns a vector of Sentence structs.
479        ///
480        /// # Arguments
481        ///
482        /// * `text` - Text to find sentences in.
483        #[allow(clippy::match_wildcard_for_single_variants)]
484        #[must_use]
485        pub fn sentences(&self, text: &str) -> Vec<Sentence> {
486            let mut sentlist = Vec::new();
487            let mut offset = 0;
488            let mut next_start_type = SentenceType::NoStart;
489            while offset < text.chars().count() && next_start_type != SentenceType::None {
490                // sent_len is in UTF-8 characters, not bytes
491                let next_text = text.chars().skip(offset).collect::<String>();
492                let (raw_sent, sent_len) =
493                    libvoikko::next_sentence(self.handle, next_text.as_str());
494                next_start_type = match raw_sent {
495                    libvoikko::voikko_sentence_type::SENTENCE_NO_START => SentenceType::NoStart,
496                    libvoikko::voikko_sentence_type::SENTENCE_POSSIBLE => SentenceType::Possible,
497                    libvoikko::voikko_sentence_type::SENTENCE_PROBABLE => SentenceType::Probable,
498                    _ => SentenceType::None,
499                };
500                // construct new Sentence object with text slice and sentence type
501                let token = Sentence::new(
502                    text.chars()
503                        .skip(offset)
504                        .take(sent_len)
505                        .collect::<String>()
506                        .as_str(),
507                    next_start_type,
508                );
509                sentlist.push(token);
510                offset += sent_len;
511            }
512            sentlist
513        }
514
515        /// Analyzes the morphology of given word.
516        ///
517        /// Returns a vector of Analysis structs (`std::collections::HashMap`) or an empty vector if
518        /// analysis fails.
519        ///
520        /// # Arguments
521        ///
522        /// * `word` - word to analyze
523        // https://github.com/voikko/corevoikko/blob/rel-libvoikko-4.1.1/libvoikko/doc/morphological-analysis.txt
524        #[must_use]
525        pub fn analyze(&self, word: &str) -> Vec<Analysis> {
526            libvoikko::analyze_word(self.handle, word).unwrap_or_else(|_| vec![])
527        }
528
529        /// Find all grammar errors in given text.
530        ///
531        /// Returns a vector of `GrammarError` structs or an empty vector if no errors found.
532        ///
533        /// # Arguments
534        ///
535        /// * `text` - Text to find grammar errors in. The text should usually begin at the start of
536        ///            a paragraph or sentence.
537        /// * `desc_lang` - ISO language code for the language in which to recieve error descriptions.
538        #[must_use]
539        pub fn grammar_errors(&self, text: &str, desc_lang: &str) -> Vec<GrammarError> {
540            libvoikko::get_grammar_errors(self.handle, text, desc_lang).unwrap_or_else(|_| vec![])
541        }
542
543        // Values of option constants documented in
544        // https://github.com/voikko/corevoikko/blob/rel-libvoikko-4.1.1/libvoikko/src/voikko_defines.h
545
546        // Boolean options
547
        /// Ignore dot at the end of the word (needed for use in some word processors).
        /// If this option is set and input word ends with a dot, spell checking and
        /// hyphenation functions try to analyze the word without the dot if no results
        /// can be obtained for the original form. Also with this option, string tokenizer
        /// will consider trailing dot of a word to be a part of that word.
        ///
        /// Default: false
        ///
        /// Returns the `bool` reported by the binding (presumably `true` when the
        /// option was set; confirm against `libvoikko::set_bool_option`).
        pub fn set_opt_ignore_dot(&self, value: bool) -> bool {
            // Option 0 is VOIKKO_OPT_IGNORE_DOT in libvoikko's voikko_defines.h.
            libvoikko::set_bool_option(self.handle, 0, value)
        }
558
        /// (Spell checking only) Ignore words containing numbers
        ///
        /// Default: false
        ///
        /// Returns the `bool` reported by the binding (presumably `true` when the
        /// option was set; confirm against `libvoikko::set_bool_option`).
        pub fn set_opt_ignore_numbers(&self, value: bool) -> bool {
            // Option 1 is VOIKKO_OPT_IGNORE_NUMBERS in libvoikko's voikko_defines.h.
            libvoikko::set_bool_option(self.handle, 1, value)
        }
565
        /// Accept words that are written completely in uppercase letters without checking
        /// them at all.
        ///
        /// Default: false
        ///
        /// Returns the `bool` reported by the binding (presumably `true` when the
        /// option was set; confirm against `libvoikko::set_bool_option`).
        pub fn set_opt_ignore_uppercase(&self, value: bool) -> bool {
            // Option 3 is VOIKKO_OPT_IGNORE_UPPERCASE in libvoikko's voikko_defines.h.
            libvoikko::set_bool_option(self.handle, 3, value)
        }
573
        /// Accept words even when the first letter is in uppercase (start of sentence etc.)
        ///
        /// Default: true
        ///
        /// Returns the `bool` reported by the binding (presumably `true` when the
        /// option was set; confirm against `libvoikko::set_bool_option`).
        pub fn set_opt_accept_first_uppercase(&self, value: bool) -> bool {
            // Option 6 is VOIKKO_OPT_ACCEPT_FIRST_UPPERCASE in libvoikko's voikko_defines.h.
            libvoikko::set_bool_option(self.handle, 6, value)
        }
580
        /// Accept words even when all of the letters are in uppercase. Note that this is
        /// not the same as `set_opt_ignore_uppercase(true)`: with this option the word is still
        /// checked, only case differences are ignored.
        ///
        /// Default: true
        ///
        /// Returns the `bool` reported by the binding (presumably `true` when the
        /// option was set; confirm against `libvoikko::set_bool_option`).
        pub fn set_opt_accept_all_uppercase(&self, value: bool) -> bool {
            // Option 7 is VOIKKO_OPT_ACCEPT_ALL_UPPERCASE in libvoikko's voikko_defines.h.
            libvoikko::set_bool_option(self.handle, 7, value)
        }
589
        /// Do not insert hyphenation positions that are considered to be ugly but correct
        ///
        /// Default: false
        ///
        /// Returns the `bool` reported by the binding (presumably `true` when the
        /// option was set; confirm against `libvoikko::set_bool_option`).
        pub fn set_opt_no_ugly_hyphenation(&self, value: bool) -> bool {
            // Option 4 is VOIKKO_OPT_NO_UGLY_HYPHENATION in libvoikko's voikko_defines.h.
            libvoikko::set_bool_option(self.handle, 4, value)
        }
596
        /// Use suggestions optimized for optical character recognition software.
        /// By default suggestions are optimized for typing errors.
        ///
        /// Default: false
        ///
        /// Returns the `bool` reported by the binding (presumably `true` when the
        /// option was set; confirm against `libvoikko::set_bool_option`).
        pub fn set_opt_ocr_suggestions(&self, value: bool) -> bool {
            // Option 8 is VOIKKO_OPT_OCR_SUGGESTIONS in libvoikko's voikko_defines.h.
            libvoikko::set_bool_option(self.handle, 8, value)
        }
604
        /// (Spell checking only): Ignore non-words such as URLs and email addresses.
        ///
        /// Default: true
        ///
        /// Returns the `bool` reported by the binding (presumably `true` when the
        /// option was set; confirm against `libvoikko::set_bool_option`).
        pub fn set_opt_ignore_nonwords(&self, value: bool) -> bool {
            // Option 10 is VOIKKO_OPT_IGNORE_NONWORDS in libvoikko's voikko_defines.h.
            libvoikko::set_bool_option(self.handle, 10, value)
        }
611
        /// (Spell checking only): Allow some extra hyphens in words. This option relaxes
        /// hyphen checking rules to work around some unresolved issues in the underlying
        /// morphology, but it may cause some incorrect words to be accepted. The exact
        /// behavior (if any) of this option is not specified.
        ///
        /// Default: false
        ///
        /// Returns the `bool` reported by the binding (presumably `true` when the
        /// option was set; confirm against `libvoikko::set_bool_option`).
        pub fn set_opt_accept_extra_hyphens(&self, value: bool) -> bool {
            // Option 11 is VOIKKO_OPT_ACCEPT_EXTRA_HYPHENS in libvoikko's voikko_defines.h.
            libvoikko::set_bool_option(self.handle, 11, value)
        }
621
        /// (Spell checking only): Accept missing hyphens at the start and end of the word.
        /// Some application programs do not consider hyphens to be word characters. This
        /// is a reasonable assumption for many languages but not for Finnish. If the
        /// application cannot be fixed to use a proper tokenisation algorithm for Finnish,
        /// this option may be used to tell libvoikko to work around this defect.
        ///
        /// Default: false
        ///
        /// Returns the `bool` reported by the binding (presumably `true` when the
        /// option was set; confirm against `libvoikko::set_bool_option`).
        pub fn set_opt_accept_missing_hyphens(&self, value: bool) -> bool {
            // Option 12 is VOIKKO_OPT_ACCEPT_MISSING_HYPHENS in libvoikko's voikko_defines.h.
            libvoikko::set_bool_option(self.handle, 12, value)
        }
632
        /// (Grammar checking only): Accept incomplete sentences that could occur in
        /// titles or headings. Set this option to true if your application is not able
        /// to differentiate titles from normal text paragraphs, or if you know that
        /// you are checking title text.
        ///
        /// Default: false
        ///
        /// Returns the `bool` reported by the binding (presumably `true` when the
        /// option was set; confirm against `libvoikko::set_bool_option`).
        pub fn set_opt_accept_titles_in_gc(&self, value: bool) -> bool {
            // Option 13 is VOIKKO_OPT_ACCEPT_TITLES_IN_GC in libvoikko's voikko_defines.h.
            libvoikko::set_bool_option(self.handle, 13, value)
        }
642
        /// (Grammar checking only): Accept incomplete sentences at the end of the
        /// paragraph. These may exist when text is still being written.
        ///
        /// Default: false
        ///
        /// Returns the `bool` reported by the binding (presumably `true` when the
        /// option was set; confirm against `libvoikko::set_bool_option`).
        pub fn set_opt_accept_unfinished_paragraphs_in_gc(&self, value: bool) -> bool {
            // Option 14 is VOIKKO_OPT_ACCEPT_UNFINISHED_PARAGRAPHS_IN_GC in libvoikko's
            // voikko_defines.h.
            libvoikko::set_bool_option(self.handle, 14, value)
        }
650
        /// (Hyphenation only): Hyphenate unknown words.
        ///
        /// Default: true
        ///
        /// Returns the `bool` reported by the binding (presumably `true` when the
        /// option was set; confirm against `libvoikko::set_bool_option`).
        pub fn set_opt_hyphenate_unknown_words(&self, value: bool) -> bool {
            // Option 15 is VOIKKO_OPT_HYPHENATE_UNKNOWN_WORDS in libvoikko's voikko_defines.h.
            libvoikko::set_bool_option(self.handle, 15, value)
        }
657
        /// (Grammar checking only): Accept paragraphs if they would be valid within
        /// bulleted lists.
        ///
        /// Default: false
        ///
        /// Returns the `bool` reported by the binding (presumably `true` when the
        /// option was set; confirm against `libvoikko::set_bool_option`).
        pub fn set_opt_accept_bulleted_lists_in_gc(&self, value: bool) -> bool {
            // Option 16 is VOIKKO_OPT_ACCEPT_BULLETED_LISTS_IN_GC in libvoikko's
            // voikko_defines.h.
            libvoikko::set_bool_option(self.handle, 16, value)
        }
665
666        // Integer options
667
        /// The minimum length for words that may be hyphenated. This limit is also enforced on
        /// individual parts of compound words.
        ///
        /// Default: 2
        ///
        /// Returns the `bool` reported by the binding (presumably `true` when the
        /// option was set; confirm against `libvoikko::set_int_option`).
        pub fn set_min_hyphenated_word_length(&self, value: i32) -> bool {
            // Option 9 is VOIKKO_MIN_HYPHENATED_WORD_LENGTH in libvoikko's voikko_defines.h.
            libvoikko::set_int_option(self.handle, 9, value)
        }
675
        /// Size of the spell checker cache. This can be `-1` (no cache) or any value
        /// `>= 0`, in which case the size in bytes is
        /// `2^cache_size * (6544*sizeof(wchar_t) + 1008)`.
        ///
        /// Default: 0
        ///
        /// Returns the `bool` reported by the binding (presumably `true` when the
        /// option was set; confirm against `libvoikko::set_int_option`).
        pub fn set_speller_cache_size(&self, value: i32) -> bool {
            // Option 17 is VOIKKO_SPELLER_CACHE_SIZE in libvoikko's voikko_defines.h.
            libvoikko::set_int_option(self.handle, 17, value)
        }
683    }
684
    // Ties the lifetime of the libvoikko handle to the `Voikko` value: dropping the
    // value hands the handle to the binding's `terminate` call.
    impl Drop for Voikko {
        fn drop(&mut self) {
            // `self.handle` is the handle stored by `Voikko::new`.
            libvoikko::terminate(self.handle);
        }
    }
690}