pyphen_rs/
lib.rs

1// This file is part of pyphen-rs
2//
3// Copyright 2008 - Wilbert Berendsen <info@wilbertberendsen.nl>
4// Copyright 2012-2013 - Guillaume Ayoub <guillaume.ayoub@kozea.fr>
5// Copyright 2019 - Naresh Ganduri <gandurinaresh@gmail.com>
6//
7// This library is free software.  It is released under the
8// GPL 2.0+/LGPL 2.1+/MPL 1.1 tri-license.  See COPYING.GPL, COPYING.LGPL and
9// COPYING.MPL for more details.
10//
11// This library is distributed in the hope that it will be useful, but WITHOUT
12// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13// FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
14// details.
15
16//! A pure Rust port of Python's [Pyphen][1].
17//!
18//! [1]: https://pyphen.org/
19
20#![warn(clippy::all)]
21#![warn(missing_docs)]
22
23mod alternative_parser;
24mod data_int;
25mod hyph_dict;
26mod pyphen;
27
28use std::cell::RefCell;
29use std::collections::HashMap;
30use std::rc::Rc;
31use std::thread_local;
32
33use alternative_parser::AlternativeParser;
34use data_int::DataInt;
35use hyph_dict::HyphDict;
36pub use pyphen::{builder::Builder, iter::Iter, Pyphen};
37
38#[macro_use]
39extern crate lazy_static;
40
41use regex::Regex;
42
43// precompile some stuff
44lazy_static! {
45    static ref PARSE_HEX: Regex = Regex::new(r"\^{2}([0-9a-f]{2})").unwrap();
46    static ref PARSE: Regex = Regex::new(r"(\d?)(\D?)").unwrap();
47}
48
49thread_local! {
50    // cache of per-file HyphDict objects
51    static HD_CACHE: RefCell<HashMap<String, Rc<HyphDict>>> = RefCell::new(HashMap::new());
52
53    /// A thread-local copy of all available languages
54    pub static LANGUAGES: RefCell<HashMap<String, Rc<String>>> = {
55        let mut dict = HashMap::new();
56        let dir = format!("{}/dictionaries", env!("CARGO_MANIFEST_DIR"));
57
58        if let Ok(read_dir) = std::fs::read_dir(dir) {
59            for entry in read_dir {
60                if let Ok(entry) = entry {
61                    if let Some(filepath) = entry.path().to_str() {
62                        let filename = entry.file_name();
63                        let filename = filename
64                            .to_str()
65                            .unwrap()
66                            .trim_start_matches("hyph_")
67                            .trim_end_matches(".dic");
68                        dict.insert(filename.to_string(), Rc::new(filepath.to_string()));
69                    }
70                }
71            }
72        }
73
74        RefCell::new(dict)
75    }
76}
77
78/// Get a fallback language if one is available in our dictionaries.
79///
80/// <http://www.unicode.org/reports/tr35/#Locale_Inheritance>
81///
82/// We use the normal truncation inheritance. This function needs aliases
83/// including scripts for languages with multiple regions available.
84pub fn language_fallback(language: &str) -> Option<String> {
85    let language = language.replace('-', "_");
86    let mut parts: Vec<_> = language.split('_').collect();
87
88    while !parts.is_empty() {
89        let language = parts.join("_");
90        let mut flag = false;
91        LANGUAGES.with(|l| {
92            if l.borrow().contains_key(&language) {
93                flag = true;
94            }
95        });
96        if flag {
97            return Some(language);
98        }
99
100        parts.pop();
101    }
102
103    None
104}
105
#[cfg(test)]
mod tests {
    use super::*;
    use std::ops::Deref;

    /// Assert that the two halves of `tup1` equal `s1` and `s2` respectively.
    fn match_tuple<T, U>(tup1: (T, U), s1: &str, s2: &str)
    where
        T: Deref<Target = str>,
        U: Deref<Target = str>,
    {
        assert_eq!(tup1.0.deref(), s1);
        assert_eq!(tup1.1.deref(), s2);
    }

    /// Assert that `iter` holds a pair whose halves equal `s1` and `s2`.
    fn match_iter<T>(iter: Option<(T, T)>, s1: &str, s2: &str)
    where
        T: Deref<Target = str>,
    {
        match iter {
            Some(pair) => match_tuple(pair, s1, s2),
            None => panic!("assertion failed: iter.is_some()"),
        }
    }

    /// Assert that `a` holds the language code `b`.
    fn test_lang(a: Option<String>, b: &str) {
        match a {
            Some(lang) => assert_eq!(lang, b),
            None => panic!("assertion failed: a.is_some()"),
        }
    }

    /// `inserted` places a hyphen at every hyphenation point.
    #[test]
    fn test_inserted() {
        let hyphenator = Builder::lang("nl_NL").build().unwrap();
        let hyphenated = hyphenator.inserted("lettergrepen");
        assert_eq!(hyphenated, "let-ter-gre-pen");
    }

    /// `wrap` splits a word so that the first half fits the given width.
    #[test]
    fn test_wrap() {
        let hyphenator = Builder::lang("nl_NL").build().unwrap();
        let halves = hyphenator.wrap("autobandventieldopje", 11).unwrap();
        match_tuple(halves, "autoband-", "ventieldopje");
    }

    /// `iterate` yields the possible splits, last hyphenation point first.
    #[test]
    fn test_iterate() {
        let hyphenator = Builder::lang("nl_NL").build().unwrap();
        let mut splits = hyphenator.iterate("Amsterdam");
        match_iter(splits.next(), "Amster", "dam");
        match_iter(splits.next(), "Am", "sterdam");
        assert_eq!(splits.next(), None);
    }

    /// `iterate` still works when the language resolves through a fallback.
    #[test]
    fn test_fallback_dict() {
        let hyphenator = Builder::lang("nl_NL-variant").build().unwrap();
        let mut splits = hyphenator.iterate("Amsterdam");
        match_iter(splits.next(), "Amster", "dam");
        match_iter(splits.next(), "Am", "sterdam");
        assert_eq!(splits.next(), None);
    }

    /// Building for a language without a dictionary is an error.
    #[test]
    fn test_missing_dict() {
        let built = Builder::lang("mi_SS").build();
        assert!(built.is_err());
    }

    /// Pointing a language at another dictionary file changes its output.
    #[test]
    fn test_personal_dict() {
        let french = Builder::lang("fr").build().unwrap();
        assert_ne!(
            french.inserted("autobandventieldopje"),
            "au-to-band-ven-tiel-dop-je"
        );

        // Make "fr" use the Dutch dictionary file on this thread.
        LANGUAGES.with(|l| {
            let mut langs = l.borrow_mut();
            let dutch_path = langs["nl_NL"].clone();
            *langs.get_mut("fr").unwrap() = dutch_path;
        });

        let patched = Builder::lang("fr").build().unwrap();
        assert_eq!(
            patched.inserted("autobandventieldopje"),
            "au-to-band-ven-tiel-dop-je"
        );
    }

    /// The `left` and `right` parameters restrict hyphenation near the edges.
    #[test]
    fn test_left_right() {
        let default = Builder::lang("nl_NL").build().unwrap();
        assert_eq!(default.inserted("lettergrepen"), "let-ter-gre-pen");

        let left_only = Builder::lang("nl_NL").left(4).build().unwrap();
        assert_eq!(left_only.inserted("lettergrepen"), "letter-gre-pen");

        let right_only = Builder::lang("nl_NL").right(4).build().unwrap();
        assert_eq!(right_only.inserted("lettergrepen"), "let-ter-grepen");

        let both = Builder::lang("nl_NL").left(4).right(4).build().unwrap();
        assert_eq!(both.inserted("lettergrepen"), "letter-grepen");
    }

    /// A hyphenator can be built straight from a dictionary file name.
    #[test]
    fn test_filename() {
        LANGUAGES.with(|l| {
            let langs = l.borrow();
            let dutch_path = langs["nl_NL"].clone();

            let hyphenator = Builder::filename(dutch_path).build().unwrap();
            assert_eq!(hyphenator.inserted("lettergrepen"), "let-ter-gre-pen");
        });
    }

    /// The alternative parser rewrites letters around a hyphenation point.
    #[test]
    fn test_alternative() {
        let hyphenator = Builder::lang("hu").left(1).right(1).build().unwrap();
        let mut splits = hyphenator.iterate("kulissza");
        match_iter(splits.next(), "kulisz", "sza");
        match_iter(splits.next(), "ku", "lissza");
        assert_eq!(splits.next(), None);
        assert_eq!(hyphenator.inserted("kulissza"), "ku-lisz-sza");
    }

    /// Uppercase input keeps its case in the output.
    #[test]
    fn test_upper() {
        let hyphenator = Builder::lang("nl_NL").build().unwrap();
        let hyphenated = hyphenator.inserted("LETTERGREPEN");
        assert_eq!(hyphenated, "LET-TER-GRE-PEN");
    }

    /// Uppercase input also keeps its case with the alternative parser.
    #[test]
    fn test_upper_alternative() {
        let hyphenator = Builder::lang("hu").left(1).right(1).build().unwrap();
        let mut splits = hyphenator.iterate("KULISSZA");
        match_iter(splits.next(), "KULISZ", "SZA");
        match_iter(splits.next(), "KU", "LISSZA");
        assert_eq!(splits.next(), None);
        assert_eq!(hyphenator.inserted("KULISSZA"), "KU-LISZ-SZA");
    }

    /// Every language listed in `LANGUAGES` can be built.
    #[test]
    fn test_all_dictionaries() {
        LANGUAGES.with(|l| {
            l.borrow().keys().for_each(|lang| {
                Builder::lang(lang).build().unwrap();
            });
        });
    }

    /// The language fallback algorithm truncates until a known code is found.
    #[test]
    fn test_fallback() {
        // (requested language, expected resolved language)
        let cases = [
            ("en", "en"),
            ("en_US", "en_US"),
            ("en_FR", "en"),
            ("en-Latn-US", "en_Latn_US"),
            ("en-Cyrl-US", "en"),
            ("fr-Latn-FR", "fr"),
            ("en-US_variant1-x", "en_US"),
        ];

        for &(requested, expected) in cases.iter() {
            test_lang(language_fallback(requested), expected);
        }
    }
}