voikko_rs/lib.rs
1/* voikko-rs - libvoikko bindings for the Rust programming language
2 Copyright (C) 2019-2022 Ronja Koistinen
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17*/
18#![warn(missing_docs)]
19#![warn(clippy::pedantic)]
20#![allow(clippy::must_use_candidate)]
21#![allow(clippy::similar_names)]
22
23//! This module provides Rust bindings for libvoikko.
24//!
25//! Libvoikko provides spell checking, hyphenation, grammar checking and
26//! morphological analysis for the Finnish language.
27//!
28//! voikko-rs requires libvoikko (version 4.1.1 or greater)
29//! to be installed on your system.
30//!
31mod libvoikko;
32mod tests;
33
34/// This module contains the functions, types and structs of the crate.
35pub mod voikko {
36
37 use crate::libvoikko;
38 use std::collections::HashMap;
39 use std::error;
40 use unicode_segmentation::UnicodeSegmentation;
41
    /// Returns the version number of libvoikko.
    ///
    /// The string is exactly what `libvoikko::version()` reports; this wrapper
    /// performs no processing of its own.
    pub fn version<'a>() -> &'a str {
        libvoikko::version()
    }
46
47 /// Information about an available dictionary
48 ///
49 /// Contains the language, script, variant and human readable description
50 /// of the dictionary.
51 #[allow(missing_docs)]
52 #[derive(Debug, PartialEq, Eq)]
53 pub struct Dictionary {
54 pub language: String,
55 pub script: String,
56 pub variant: String,
57 pub description: String,
58 }
59
60 impl Dictionary {
61 /// Construct new Dictionary struct.
62 ///
63 /// # Arguments
64 ///
65 /// * `language`
66 /// * `script`
67 /// * `variant`
68 /// * `description`
69 #[must_use]
70 pub fn new(language: &str, script: &str, variant: &str, description: &str) -> Dictionary {
71 Dictionary {
72 language: String::from(language),
73 script: String::from(script),
74 variant: String::from(variant),
75 description: String::from(description),
76 }
77 }
78 }
79
    /// A morphological analysis item
    ///
    /// A map from string keys to string values; [`Voikko::analyze`] returns a
    /// vector of these.
    pub type Analysis = HashMap<String, String>;
82
83 /// Get a list of available dictionaries. Returns a vector of Dictionary structs.
84 ///
85 /// # Arguments
86 ///
87 /// * `path` - Path to a directory from which dictionary files should be searched
88 /// first before looking into the standard dictionary locations.
89 /// Pass an empty string in order to only look in standard locations.
90 pub fn list_dicts(path: &str) -> Vec<Dictionary> {
91 libvoikko::list_dicts(path).unwrap_or_else(|_| vec![])
92 }
93
94 /// Return a list of language codes representing the languages for which at least one
95 /// dictionary is available for spell checking. The codes conform to those specified
96 /// in BCP 47. Typically the returned codes consist of only BCP 47 language subtags.
97 /// They may also include tags in format Language-Script, Language-Region, or
98 /// Language-Script-Region if such variants are widely used for a particular language.
99 ///
100 /// # Arguments
101 ///
102 /// * `path` - Path to a directory from which dictionary files should be searched
103 /// first before looking into the standard dictionary locations.
104 /// Pass an empty string in order to only look in standard locations.
105 pub fn list_supported_spelling_languages(path: &str) -> Vec<String> {
106 libvoikko::list_supported_spelling_languages(path).unwrap_or_else(|_| vec![])
107 }
108
109 /// Same as `list_supported_spelling_languages()` but for hyphenation.
110 ///
111 /// # Arguments
112 ///
113 /// * `path` - Path to a directory from which dictionary files should be searched
114 /// first before looking into the standard dictionary locations.
115 /// Pass an empty string in order to only look in standard locations.
116 pub fn list_supported_hyphenation_languages(path: &str) -> Vec<String> {
117 libvoikko::list_supported_hyphenation_languages(path).unwrap_or_else(|_| vec![])
118 }
119
120 /// Same as `list_supported_spelling_languages()` but for grammar checking.
121 ///
122 /// # Arguments
123 ///
124 /// * `path` - Path to a directory from which dictionary files should be searched
125 /// first before looking into the standard dictionary locations.
126 /// Pass an empty string in order to only look in standard locations.
127 pub fn list_supported_grammar_checking_languages(path: &str) -> Vec<String> {
128 libvoikko::list_supported_grammar_checking_languages(path).unwrap_or_else(|_| vec![])
129 }
130
    /// A Voikko instance
    ///
    /// Wraps the raw libvoikko handle obtained in [`Voikko::new`]. The handle is
    /// passed to `libvoikko::terminate` when the value is dropped.
    ///
    /// # Example
    ///
    /// ```
    /// extern crate voikko_rs; // in Rust 2015
    /// use voikko_rs::voikko;
    ///
    /// fn main() {
    ///     let v = voikko::Voikko::new("fi-x-morphoid", None).unwrap();
    ///     assert_eq!(v.hyphenate("kunnallispolitiikka", "-"),
    ///                Ok(String::from("kun-nal-lis-po-li-tiik-ka")));
    /// }
    /// ```
    pub struct Voikko {
        // Raw handle handed to every libvoikko call made through this instance.
        handle: *mut libvoikko::VoikkoHandle,
    }
148
149 /// A spell check return value
150 #[derive(Debug, PartialEq, Eq)]
151 pub enum SpellReturn {
152 /// Incorrect spelling
153 SpellFailed,
154 /// Correct spelling
155 SpellOk,
156 /// Internal error from libvoikko
157 InternalError,
158 /// libvoikko failed to convert character sets
159 CharsetConversionFailed,
160 }
161
162 /// Type of token returned by [`analyze()`]
163 #[derive(Debug, PartialEq, Eq)]
164 #[allow(missing_docs)]
165 pub enum TokenType {
166 None,
167 Word,
168 Punctuation,
169 Whitespace,
170 Unknown,
171 }
172
173 /// Tokenization unit
174 #[derive(Debug, PartialEq, Eq)]
175 pub struct Token {
176 /// Text of the token
177 pub token_text: String,
178 /// Type of the token
179 pub token_type: TokenType,
180 }
181
182 #[allow(missing_docs)]
183 impl Token {
184 pub fn new(token_text: &str, token_type: TokenType) -> Token {
185 Token {
186 token_text: String::from(token_text),
187 token_type,
188 }
189 }
190 }
191
192 /// Type of a following sentence
193 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
194 pub enum SentenceType {
195 /// End of text reached or error.
196 None,
197 /// This is not a start of a new sentence.
198 NoStart,
199 /// This may be a start of a new sentence.
200 Probable,
201 /// This is a probable start of a new sentence.
202 Possible,
203 }
204
205 /// A sentence
206 #[derive(Debug, PartialEq, Eq)]
207 pub struct Sentence {
208 /// Text of the sentence
209 text: String,
210 /// The type of the next sentence
211 next_start_type: SentenceType,
212 }
213
214 #[allow(missing_docs)]
215 impl Sentence {
216 pub fn new(sentence_text: &str, sentence_type: SentenceType) -> Sentence {
217 Sentence {
218 text: String::from(sentence_text),
219 next_start_type: sentence_type,
220 }
221 }
222 }
223
224 #[derive(Debug, PartialEq, Eq)]
225 /// Grammar error
226 pub struct GrammarError {
227 /// Error code
228 pub code: i32,
229 /// Start position of the error in characters
230 pub start_pos: usize,
231 /// Length of the error in characters
232 pub length: usize,
233 /// A list of suggestions for correcting the grammar error
234 pub suggestions: Vec<String>,
235 /// A localized short description of the grammar error
236 pub description: String,
237 }
238
239 #[derive(Debug)]
240 /// Error in initializing libvoikko
241 pub struct InitError {
242 message: String,
243 }
244
245 #[allow(missing_docs)]
246 impl InitError {
247 pub fn new(message: &str) -> InitError {
248 InitError {
249 message: String::from(message),
250 }
251 }
252 }
253
254 impl std::fmt::Display for InitError {
255 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
256 write!(f, "{}", self.message)
257 }
258 }
259
260 impl error::Error for InitError {
261 fn description(&self) -> &str {
262 self.message.as_str()
263 }
264 }
265
266 impl std::convert::From<std::ffi::NulError> for InitError {
267 fn from(error: std::ffi::NulError) -> Self {
268 InitError {
269 message: format!("{}", error)
270 }
271 }
272 }
273
274 #[derive(Debug, PartialEq, Eq)]
275 /// Error hyphenating a string
276 pub struct HyphenateError {
277 message: String,
278 }
279
280 #[allow(missing_docs)]
281 impl HyphenateError {
282 pub fn new(message: &str) -> Self {
283 HyphenateError {
284 message: String::from(message),
285 }
286 }
287 }
288
289 impl std::fmt::Display for HyphenateError {
290 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
291 write!(f, "{}", self.message)
292 }
293 }
294
295 impl error::Error for HyphenateError {
296 fn description(&self) -> &str {
297 self.message.as_str()
298 }
299 }
300
301 impl std::convert::From<std::ffi::NulError> for HyphenateError {
302 fn from(error: std::ffi::NulError) -> Self {
303 HyphenateError {
304 message: format!("{}", error)
305 }
306 }
307 }
308
309 impl std::convert::From<std::str::Utf8Error> for HyphenateError {
310 fn from(error: std::str::Utf8Error) -> Self {
311 HyphenateError {
312 message: format!("{}", error)
313 }
314 }
315 }
316
317 impl Voikko {
318 /// Initializes Voikko and returns a `Result<Voikko, InitError>`
319 ///
320 /// # Arguments
321 ///
322 /// * `language` - BCP 47 language tag for the language to be used.
323 /// Private use subtags can be used to specify the dictionary variant.
324 /// * `path` - Path to a directory from which dictionary files should be searched first before
325 /// looking into the standard dictionary locations. If `None`, no additional search path
326 /// will be used.
327 ///
328 /// # Errors
329 ///
330 /// Returns an `InitError` result if init fails.
331 pub fn new(language: &str, path: Option<&str>) -> Result<Voikko, InitError> {
332 let v = libvoikko::init(language, path);
333
334 match v {
335 Ok(handle) => Ok(Voikko { handle }),
336 Err(error) => Err(error),
337 }
338 }
339
340 /// Check the spelling of a UTF-8 character string.
341 ///
342 /// # Arguments
343 ///
344 /// * `word` - word to check
345 #[must_use]
346 pub fn spell(&self, word: &str) -> SpellReturn {
347 let ret = libvoikko::spell(self.handle, word);
348 match ret {
349 Ok(code) => match code {
350 0 => SpellReturn::SpellFailed,
351 1 => SpellReturn::SpellOk,
352 3 => SpellReturn::CharsetConversionFailed,
353 _ => SpellReturn::InternalError,
354 },
355 Err(_) => SpellReturn::SpellFailed,
356 }
357
358 }
359
360 /// Finds suggested correct spellings for given UTF-8 encoded word.
361 /// Returns a vector of strings - an empty vector, if no suggestions.
362 ///
363 /// # Arguments
364 ///
365 /// * `word` - word to find suggestions for
366 #[must_use]
367 pub fn suggest(&self, word: &str) -> Vec<String> {
368 libvoikko::suggest(self.handle, word).unwrap_or_else(|_| vec![])
369 }
370
        /// Hyphenates the given word in UTF-8 encoding.
        /// Returns a string containing the hyphenation using the following notation:
        /// * `' '` = no hyphenation at this character,
        /// * `'-'` = hyphenation point (character at this position
        ///         is preserved in the hyphenated form),
        /// * `'='` = hyphenation point (character at this position
        ///         is replaced by the hyphen.)
        ///
        /// This is a direct call to `libvoikko::hyphens` with this instance's handle.
        ///
        /// # Arguments
        ///
        /// * `word` - word to hyphenate
        ///
        /// # Errors
        ///
        /// Returns the `bool` error value reported by `libvoikko::hyphens`.
        pub fn hyphens(&self, word: &str) -> Result<String, bool> {
            libvoikko::hyphens(self.handle, word)
        }
389
390 /// Hyphenates the given word in UTF-8 encoding.
391 /// Returns a string where caller-supplied characters are inserted in all hyphenation points.
392 ///
393 /// # Arguments
394 ///
395 /// * `word` - word to hyphenate
396 /// * `hyphen` - string to insert at hyphenation points
397 ///
398 /// # Errors
399 ///
400 /// Returns an error result on error.
401 pub fn hyphenate(&self, word: &str, hyphen: &str) -> Result<String, bool> {
402 let hyphens = self.hyphens(word);
403 match hyphens {
404 Err(_) => Err(false),
405 Ok(hyph) => Ok(word
406 .graphemes(true)
407 .zip(hyph.graphemes(true))
408 .map(|(w, h)| match h {
409 // " " => String::from(w),
410 "-" => format!("{}{}", hyphen, w),
411 "=" => String::from(hyphen),
412 _ => String::from(w),
413 })
414 .collect::<String>()),
415 }
416 }
417
        /// Hyphenates the given word in UTF-8 encoding.
        /// Returns a string where caller-supplied characters are inserted in all hyphenation points.
        /// **Requires libvoikko version 4.2.0 or greater.**
        ///
        /// This is a direct call to `libvoikko::insert_hyphens` with this instance's handle.
        ///
        /// # Arguments
        ///
        /// * `word` - word to hyphenate
        /// * `character` - string to insert at hyphenation points
        /// * `allow_context_changes` - boolean parameter controlling whether to insert hyphens even if they alter the word
        ///
        /// # Examples
        ///
        /// ```
        /// # use voikko_rs::voikko;
        /// # let v = voikko::Voikko::new("fi-x-morphoid", None).unwrap();
        /// // Voikko initialized on the variable v
        /// let hyphenated1 = v.hyphenate_new("rei'ittää", "-", true);
        /// assert_eq!(hyphenated1, Ok(String::from("rei-it-tää")));
        /// let hyphenated2 = v.hyphenate_new("rei'ittää", "-", false);
        /// assert_eq!(hyphenated2, Ok(String::from("rei'it-tää")));
        ///
        /// ```
        ///
        /// # Errors
        ///
        /// Is Err if libvoikko returns a null pointer, i.e. it fails to hyphenate.
        pub fn hyphenate_new(&self, word: &str, character: &str, allow_context_changes: bool) -> Result<String, HyphenateError> {
            libvoikko::insert_hyphens(self.handle, word, character, allow_context_changes)
        }
447
448 /// Tokenize a text string. Returns a vector of Token structs.
449 ///
450 /// # Arguments
451 ///
452 /// * `text` - Text to find tokens in.
453 #[allow(clippy::match_wildcard_for_single_variants)]
454 #[must_use]
455 pub fn tokens(&self, text: &str) -> Vec<Token> {
456 let mut tokenlist = Vec::new();
457 let mut offset = 0;
458 while offset < text.len() {
459 let (raw_token, token_len) = libvoikko::next_token(self.handle, &text[offset..]);
460 let token_type = match raw_token {
461 libvoikko::voikko_token_type::TOKEN_NONE => TokenType::None,
462 libvoikko::voikko_token_type::TOKEN_PUNCTUATION => TokenType::Punctuation,
463 libvoikko::voikko_token_type::TOKEN_WHITESPACE => TokenType::Whitespace,
464 libvoikko::voikko_token_type::TOKEN_WORD => TokenType::Word,
465 _ => TokenType::Unknown,
466 };
467 if token_type == TokenType::None {
468 break;
469 }
470 let token_text: String = text[offset..].chars().take(token_len).collect();
471 let token = Token::new(&token_text, token_type);
472 tokenlist.push(token);
473 offset += token_text.as_bytes().len();
474 }
475 tokenlist
476 }
477
478 /// Find sentences in a text string. Returns a vector of Sentence structs.
479 ///
480 /// # Arguments
481 ///
482 /// * `text` - Text to find sentences in.
483 #[allow(clippy::match_wildcard_for_single_variants)]
484 #[must_use]
485 pub fn sentences(&self, text: &str) -> Vec<Sentence> {
486 let mut sentlist = Vec::new();
487 let mut offset = 0;
488 let mut next_start_type = SentenceType::NoStart;
489 while offset < text.chars().count() && next_start_type != SentenceType::None {
490 // sent_len is in UTF-8 characters, not bytes
491 let next_text = text.chars().skip(offset).collect::<String>();
492 let (raw_sent, sent_len) =
493 libvoikko::next_sentence(self.handle, next_text.as_str());
494 next_start_type = match raw_sent {
495 libvoikko::voikko_sentence_type::SENTENCE_NO_START => SentenceType::NoStart,
496 libvoikko::voikko_sentence_type::SENTENCE_POSSIBLE => SentenceType::Possible,
497 libvoikko::voikko_sentence_type::SENTENCE_PROBABLE => SentenceType::Probable,
498 _ => SentenceType::None,
499 };
500 // construct new Sentence object with text slice and sentence type
501 let token = Sentence::new(
502 text.chars()
503 .skip(offset)
504 .take(sent_len)
505 .collect::<String>()
506 .as_str(),
507 next_start_type,
508 );
509 sentlist.push(token);
510 offset += sent_len;
511 }
512 sentlist
513 }
514
515 /// Analyzes the morphology of given word.
516 ///
517 /// Returns a vector of Analysis structs (`std::collections::HashMap`) or an empty vector if
518 /// analysis fails.
519 ///
520 /// # Arguments
521 ///
522 /// * `word` - word to analyze
523 // https://github.com/voikko/corevoikko/blob/rel-libvoikko-4.1.1/libvoikko/doc/morphological-analysis.txt
524 #[must_use]
525 pub fn analyze(&self, word: &str) -> Vec<Analysis> {
526 libvoikko::analyze_word(self.handle, word).unwrap_or_else(|_| vec![])
527 }
528
529 /// Find all grammar errors in given text.
530 ///
531 /// Returns a vector of `GrammarError` structs or an empty vector if no errors found.
532 ///
533 /// # Arguments
534 ///
535 /// * `text` - Text to find grammar errors in. The text should usually begin at the start of
536 /// a paragraph or sentence.
537 /// * `desc_lang` - ISO language code for the language in which to recieve error descriptions.
538 #[must_use]
539 pub fn grammar_errors(&self, text: &str, desc_lang: &str) -> Vec<GrammarError> {
540 libvoikko::get_grammar_errors(self.handle, text, desc_lang).unwrap_or_else(|_| vec![])
541 }
542
543 // Values of option constants documented in
544 // https://github.com/voikko/corevoikko/blob/rel-libvoikko-4.1.1/libvoikko/src/voikko_defines.h
545
546 // Boolean options
547
548 /// Ignore dot at the end of the word (needed for use in some word processors).
549 /// If this option is set and input word ends with a dot, spell checking and
550 /// hyphenation functions try to analyze the word without the dot if no results
551 /// can be obtained for the original form. Also with this option, string tokenizer
552 /// will consider trailing dot of a word to be a part of that word.
553 ///
554 /// Default: false
555 pub fn set_opt_ignore_dot(&self, value: bool) -> bool {
556 libvoikko::set_bool_option(self.handle, 0, value)
557 }
558
559 /// (Spell checking only) Ignore words containing numbers
560 ///
561 /// Default: false
562 pub fn set_opt_ignore_numbers(&self, value: bool) -> bool {
563 libvoikko::set_bool_option(self.handle, 1, value)
564 }
565
566 /// Accept words that are written completely in uppercase letters without checking
567 /// them at all.
568 ///
569 /// Default: false
570 pub fn set_opt_ignore_uppercase(&self, value: bool) -> bool {
571 libvoikko::set_bool_option(self.handle, 3, value)
572 }
573
574 /// Accept words even when the first letter is in uppercase (start of sentence etc.)
575 ///
576 /// Default: true
577 pub fn set_opt_accept_first_uppercase(&self, value: bool) -> bool {
578 libvoikko::set_bool_option(self.handle, 6, value)
579 }
580
581 /// Accept words even when all of the letters are in uppercase. Note that this is
582 /// not the same as `set_opt_ignore_uppercase(true)`: with this option the word is still
583 /// checked, only case differences are ignored.
584 ///
585 /// Default: true
586 pub fn set_opt_accept_all_uppercase(&self, value: bool) -> bool {
587 libvoikko::set_bool_option(self.handle, 7, value)
588 }
589
590 /// Do not insert hyphenation positions that are considered to be ugly but correct
591 ///
592 /// Default: false
593 pub fn set_opt_no_ugly_hyphenation(&self, value: bool) -> bool {
594 libvoikko::set_bool_option(self.handle, 4, value)
595 }
596
597 /// Use suggestions optimized for optical character recognition software.
598 /// By default suggestions are optimized for typing errors.
599 ///
600 /// Default: false
601 pub fn set_opt_ocr_suggestions(&self, value: bool) -> bool {
602 libvoikko::set_bool_option(self.handle, 8, value)
603 }
604
605 /// (Spell checking only): Ignore non-words such as URLs and email addresses.
606 ///
607 /// Default: true
608 pub fn set_opt_ignore_nonwords(&self, value: bool) -> bool {
609 libvoikko::set_bool_option(self.handle, 10, value)
610 }
611
612 /// (Spell checking only): Allow some extra hyphens in words. This option relaxes
613 /// hyphen checking rules to work around some unresolved issues in the underlying
614 /// morphology, but it may cause some incorrect words to be accepted. The exact
615 /// behavior (if any) of this option is not specified.
616 ///
617 /// Default: false */
618 pub fn set_opt_accept_extra_hyphens(&self, value: bool) -> bool {
619 libvoikko::set_bool_option(self.handle, 11, value)
620 }
621
622 /// (Spell checking only): Accept missing hyphens at the start and end of the word.
623 /// Some application programs do not consider hyphens to be word characters. This
624 /// is a reasonable assumption for many languages but not for Finnish. If the
625 /// application cannot be fixed to use a proper tokenisation algorithm for Finnish,
626 /// this option may be used to tell libvoikko to work around this defect.
627 ///
628 /// Default: false
629 pub fn set_opt_accept_missing_hyphens(&self, value: bool) -> bool {
630 libvoikko::set_bool_option(self.handle, 12, value)
631 }
632
633 /// (Grammar checking only): Accept incomplete sentences that could occur in
634 /// titles or headings. Set this option to true if your application is not able
635 /// to differentiate titles from normal text paragraphs, or if you know that
636 /// you are checking title text.
637 ///
638 /// Default: false
639 pub fn set_opt_accept_titles_in_gc(&self, value: bool) -> bool {
640 libvoikko::set_bool_option(self.handle, 13, value)
641 }
642
643 /// (Grammar checking only): Accept incomplete sentences at the end of the
644 /// paragraph. These may exist when text is still being written.
645 ///
646 /// Default: false
647 pub fn set_opt_accept_unfinished_paragraphs_in_gc(&self, value: bool) -> bool {
648 libvoikko::set_bool_option(self.handle, 14, value)
649 }
650
651 /// (Hyphenation only): Hyphenate unknown words.
652 ///
653 /// Default: true
654 pub fn set_opt_hyphenate_unknown_words(&self, value: bool) -> bool {
655 libvoikko::set_bool_option(self.handle, 15, value)
656 }
657
658 /// (Grammar checking only): Accept paragraphs if they would be valid within
659 /// bulleted lists.
660 ///
661 /// Default: false
662 pub fn set_opt_accept_bulleted_lists_in_gc(&self, value: bool) -> bool {
663 libvoikko::set_bool_option(self.handle, 16, value)
664 }
665
666 // Integer options
667
668 /// The minimum length for words that may be hyphenated. This limit is also enforced on
669 /// individual parts of compound words.
670 ///
671 /// Default: 2
672 pub fn set_min_hyphenated_word_length(&self, value: i32) -> bool {
673 libvoikko::set_int_option(self.handle, 9, value)
674 }
675
676 /// Size of the spell checker cache. This can be -1 (no cache) or
677 /// >= 0 ( size in bytes = `2^cache_size * (6544*sizeof(wchar_t) + 1008)` ).
678 ///
679 /// Default: 0
680 pub fn set_speller_cache_size(&self, value: i32) -> bool {
681 libvoikko::set_int_option(self.handle, 17, value)
682 }
683 }
684
    // Hands the libvoikko handle back to the library when a `Voikko` value is dropped.
    impl Drop for Voikko {
        // Passes the stored handle to `libvoikko::terminate`.
        fn drop(&mut self) {
            libvoikko::terminate(self.handle);
        }
    }
690}