iuliia_rust/
lib.rs

1#[macro_use]
2extern crate include_dir;
3extern crate regex;
4
5use include_dir::Dir;
6use regex::Regex;
7use lazy_static::lazy_static;
8
9use std::collections::HashMap;
10use serde::{Serialize, Deserialize};
11
/// Contents of the `./iuliia` directory, embedded at compile time by `include_dir!`.
/// `Schema::for_name` looks up `<name>.json` files in it.
const SCHEMA_DIR: Dir = include_dir!("./iuliia");
/// Placeholder put before the first and after the last letter of a word so that every real
/// letter has two neighbours; it is removed again before prefix/postfix mapping lookups.
const DUMMY_SYMBOL: &str = "$";
14
15/// Describe struct of transliterate schema
16#[derive(Serialize, Deserialize, Debug)]
17pub struct Schema {
18    name: String,
19    description: String,
20    url: String,
21    mapping: Option<HashMap<String, String>>,
22    prev_mapping: Option<HashMap<String, String>>,
23    next_mapping: Option<HashMap<String, String>>,
24    ending_mapping: Option<HashMap<String, String>>,
25    samples: Option<Vec<Vec<String>>>,
26}
27
28impl Schema {
29    /// Return Schema object by schema name
30    pub fn for_name(s: &str) -> Schema {
31        let schema_file = SCHEMA_DIR.get_file(format!("{}{}", s, ".json"))
32            .expect(&format!("There are no schema with name {}", s));
33        serde_json::from_str(schema_file.contents_utf8().unwrap()).unwrap()
34    }
35
36    pub fn get_pref(&self, s: &str) -> Option<String> {
37        if self.prev_mapping.is_none() {
38            return None;
39        }
40        match self.prev_mapping.as_ref().unwrap().get(&s.replace(DUMMY_SYMBOL.clone(), "").to_lowercase()) {
41            Some(result) => Some(result.clone()),
42            None => None
43        }
44    }
45
46    pub fn get_next(&self, s: &str) -> Option<String> {
47        if self.next_mapping.is_none() {
48            return None;
49        }
50        match self.next_mapping.as_ref().unwrap().get(&s.replace(DUMMY_SYMBOL.clone(), "").to_lowercase()) {
51            Some(result) => Some(result.clone()),
52            None => None
53        }
54    }
55
56    pub fn get_letter(&self, s: &str) -> Option<String> {
57        if self.mapping.is_none() {
58            return None;
59        }
60        match self.mapping.as_ref().unwrap().get(&s.to_lowercase()) {
61            Some(result) => Some(result.clone()),
62            None => None
63        }
64    }
65
66    pub fn get_ending(&self, s: &str) -> Option<String> {
67        if self.ending_mapping.is_none() {
68            return None;
69        }
70        match self.ending_mapping.as_ref().unwrap().get(&s.to_lowercase()) {
71            Some(result) => Some(result.clone()),
72            None => None
73        }
74    }
75}
76
77/// Transliterate a slice of str using name of schema to `String`
78///
79/// ```
80/// assert_eq!(iuliia_rust::parse_by_schema_name("Юлия", "wikipedia"), "Yuliya")
81/// ```
82///
83pub fn parse_by_schema_name(s: &str, schema_name: &str) -> String {
84    let schema = Schema::for_name(schema_name);
85    parse_by_schema(&s, &schema)
86
87}
88
89/// Transliterate a slice of str using `Schema` to `String`
90///
91/// ```
92///
93/// let input = "Юлия, съешь ещё этих мягких французских булок из Йошкар-Олы, да выпей алтайского чаю";
94/// let expected = "Yuliya, syesh yeshchyo etikh myagkikh frantsuzskikh bulok iz Yoshkar-Oly, da vypey altayskogo chayu";
95/// let schema = iuliia_rust::Schema::for_name("wikipedia");
96/// 
97/// let transliterated_word = iuliia_rust::parse_by_schema(&input, &schema);
98///
99/// assert_eq!(transliterated_word, expected)
100/// ```
101///
102pub fn parse_by_schema(s: &str, schema: &Schema) -> String {
103    lazy_static! {
104        static ref RE: Regex = Regex::new(r"\b").unwrap();
105    }
106    RE.split(s)
107        .map(|word| parse_word_by_schema(word, schema))
108        .collect()
109}
110
111fn parse_word_by_schema(s: &str, schema: &Schema) -> String {
112    let word_by_letters: Vec<String> = s.chars()
113        .map(|char| char.to_string())
114        .collect::<Vec<_>>();
115
116    //Parse ending
117    let ending = parse_ending(&word_by_letters, schema);
118    let mut parsed_end = String::new();
119    let word_without_ending = match ending {
120        Some(matched) => {
121            parsed_end = matched.translate;
122            word_by_letters[..matched.ending_start].to_vec()
123        }
124        None => word_by_letters
125    };
126
127    //Add dummy symbols for window function
128    let mut word_for_parse: Vec<String> = Vec::with_capacity(word_without_ending.len() + 2);
129    let dummy_string: Vec<String> = vec![String::from(DUMMY_SYMBOL.clone())];
130    word_for_parse.extend(dummy_string.clone());
131    word_for_parse.extend(word_without_ending);
132    word_for_parse.extend(dummy_string);
133
134    //Parse each letter
135    let parsed_word: String = word_for_parse
136        .windows(3)
137        .map(|letter_with_neighbors| parse_letter(letter_with_neighbors, schema))
138        .collect();
139
140    //Concat with ending
141    format!("{}{}", parsed_word, parsed_end)
142}
143
144fn parse_ending(s: &Vec<String>, schema: &Schema) -> Option<Ending> {
145    let length = s.len();
146    if length < 3 {
147        return None;
148    }
149
150    match schema.get_ending(&s[length - 1..].concat()) {
151        Some(matched) => return Some(Ending {
152            translate: propagate_case_from_source(matched, &s[length - 1..].concat(), false),
153            ending_start: length - 1,
154        }),
155        None => ()
156    };
157    return match schema.get_ending(&s[length - 2..].concat()) {
158        Some(matched) => return Some(Ending {
159            translate: propagate_case_from_source(matched, &s[length - 2..].concat(), false),
160            ending_start: length - 2,
161        }),
162        None => None
163    };
164}
165
/// A word ending that matched the schema's ending table.
struct Ending {
    /// Transliteration of the ending, with the source ending's case already applied.
    translate: String,
    /// Index of the first letter of the ending within the word's letter list.
    ending_start: usize,
}
170
171/// Find letter transliteration with steps priority(apply higher):
172/// 1. prefix parse
173/// 2. postfix parse
174/// 3. letter parse
175/// 4. use input letter
176fn parse_letter(letter_with_neighbors: &[String], schema: &Schema) -> String {
177    let prefix: String = letter_with_neighbors[..2].concat();
178    let postfix: String = letter_with_neighbors[1..].concat();
179    let letter: String = letter_with_neighbors[1..2].concat();
180    let mut result = letter.clone();
181    match schema.get_letter(&letter) {
182        Some(matched) => result = matched,
183        None => ()
184    };
185    match schema.get_next(&postfix) {
186        Some(matched) => result = matched,
187        None => ()
188    };
189    match schema.get_pref(&prefix) {
190        Some(matched) => result = matched,
191        None => ()
192    };
193    propagate_case_from_source(result, &letter, true)
194}
195
/// Carry the case of the source text over to its transliteration.
///
/// * `result` - the transliteration to adjust.
/// * `source_letter` - the source letter(s) the transliteration was produced from.
/// * `only_first_symbol` - when `true` only the first symbol of `result` is uppercased,
///   otherwise the whole of `result` is.
///
/// `result` is returned unchanged when `source_letter` contains no uppercase letter.
fn propagate_case_from_source(result: String, source_letter: &str, only_first_symbol: bool) -> String {
    if !source_letter.chars().any(char::is_uppercase) {
        // Already owned: hand it back without re-allocating.
        return result;
    }

    if !only_first_symbol {
        return result.to_uppercase();
    }

    let mut rest = result.chars();
    match rest.next() {
        // Uppercasing one char may yield several chars, hence the iterator chain.
        Some(first) => first.to_uppercase().chain(rest).collect(),
        None => String::new(),
    }
}
214
215
#[cfg(test)]
mod tests {
    use crate::{Schema, parse_by_schema};

    /// Transliterate the source of every `(source, expected)` pair with the schema
    /// `schema_name` and check it against the expected text.
    fn assert_transliterations(schema_name: &str, cases: &[(&str, &str)]) {
        let schema = Schema::for_name(schema_name);
        for (source, expected) in cases {
            assert_eq!(parse_by_schema(source, &schema), *expected);
        }
    }

    #[test]
    fn schema_test() {
        let schema = Schema::for_name("ala_lc");
        assert_eq!(schema.name, "ala_lc")
    }

    #[test]
    fn simple_word_test() {
        assert_transliterations("wikipedia", &[("б", "b"), ("пол", "pol")]);
    }

    #[test]
    fn prefix_word_test() {
        assert_transliterations("wikipedia", &[("ель", "yel")]);
    }

    #[test]
    fn postfix_word_test() {
        assert_transliterations("wikipedia", &[("бульон", "bulyon")]);
    }

    #[test]
    fn test_letter_case() {
        assert_transliterations("wikipedia", &[("ноГа", "noGa"), ("Рука", "Ruka")]);
    }

    #[test]
    fn test_ending() {
        assert_transliterations("wikipedia", &[("хороший", "khoroshy")]);
    }

    #[test]
    fn test_sentence() {
        assert_transliterations(
            "wikipedia",
            &[
                (
                    "Юлия, съешь ещё этих мягких французских булок из Йошкар-Олы, да выпей алтайского чаю",
                    "Yuliya, syesh yeshchyo etikh myagkikh frantsuzskikh bulok iz Yoshkar-Oly, da vypey altayskogo chayu",
                ),
                ("ВЕЛИКИЙ", "VELIKY"),
            ],
        );
    }
}
321}