mal2eng/
sketch.rs

1use fancy_regex::Regex as FancyRegex;
2use regex::Regex;
3use std::collections::HashMap;
4
/// Lookup tables that drive the Malayalam to English transliteration.
///
/// Every table maps a Malayalam glyph (or glyph sequence) to the Latin text
/// that replaces it. The tables are filled in by `CharacterMap::init`.
#[derive(Debug, Clone)]
pub struct CharacterMap {
    /// Independent vowel letters.
    vowels: HashMap<&'static str, &'static str>,
    /// Conjunct (compound) consonants, stored without a trailing vowel sound.
    compounds: HashMap<&'static str, &'static str>,
    /// Single consonant letters, stored without a trailing vowel sound.
    consonants: HashMap<&'static str, &'static str>,
    /// Chillu letters.
    chill: HashMap<&'static str, &'static str>,
    /// Dependent signs that follow a glyph and supply its vowel sound.
    modifiers: HashMap<&'static str, &'static str>,
}
12
13impl CharacterMap {
14    /// # CharacterMap::init()
15    /// Initializes Malayalam to English transliterator
16    ///
17    /// Usage:
18    /// ```rust
19    /// use mal2eng::CharacterMap;
20    ///
21    /// fn main() {
22    ///     let m2e = CharacterMap::init();
23    ///     // ... rest of the code ...
24    ///     // ... refer `m2e.transliterate` ...
25    /// }
26    /// ````
27    pub fn init() -> CharacterMap {
28        let mut vowels_map = HashMap::new();
29        for (key, val) in [
30            ("അ", "a"),
31            ("ആ", "aa"),
32            ("ഇ", "i"),
33            ("ഈ", "ee"),
34            ("ഉ", "u"),
35            ("ഊ", "oo"),
36            ("ഋ", "ru"),
37            ("എ", "e"),
38            ("ഏ", "e"),
39            ("ഐ", "ai"),
40            ("ഒ", "o"),
41            ("ഓ", "o"),
42            ("ഔ", "au"),
43        ] {
44            vowels_map.insert(key, val);
45        }
46        let mut compounds_map = HashMap::new();
47        for (key, val) in [
48            ("ക്ക", "kk"),
49            ("ഗ്ഗ", "gg"),
50            ("ങ്ങ", "ng"),
51            ("ച്ച", "cch"),
52            ("ജ്ജ", "jj"),
53            ("ഞ്ഞ", "nj"),
54            ("ട്ട", "tt"),
55            ("ണ്ണ", "nn"),
56            ("ത്ത", "tth"),
57            ("ദ്ദ", "ddh"),
58            ("ദ്ധ", "ddh"),
59            ("ന്ന", "nn"),
60            ("ന്ത", "nth"),
61            ("ങ്ക", "nk"),
62            ("ണ്ട", "nd"),
63            ("ബ്ബ", "bb"),
64            ("പ്പ", "pp"),
65            ("മ്മ", "mm"),
66            ("യ്യ", "yy"),
67            ("ല്ല", "ll"),
68            ("വ്വ", "vv"),
69            ("ശ്ശ", "sh"),
70            ("സ്സ", "s"),
71            ("ക്സ", "ks"),
72            ("ഞ്ച", "nch"),
73            ("ക്ഷ", "ksh"),
74            ("മ്പ", "mp"),
75            ("റ്റ", "tt"),
76            ("ന്റ", "nt"),
77            ("ന്ത്യ", "nthy"),
78        ] {
79            compounds_map.insert(key, val);
80        }
81        let mut consonants_map = HashMap::new();
82        for (key, val) in [
83            ("ക", "k"),
84            ("ഖ", "kh"),
85            ("ഗ", "g"),
86            ("ഘ", "gh"),
87            ("ങ", "ng"),
88            ("ച", "ch"),
89            ("ഛ", "chh"),
90            ("ജ", "j"),
91            ("ഝ", "jh"),
92            ("ഞ", "nj"),
93            ("ട", "t"),
94            ("ഠ", "dt"),
95            ("ഡ", "d"),
96            ("ഢ", "dd"),
97            ("ണ", "n"),
98            ("ത", "th"),
99            ("ഥ", "th"),
100            ("ദ", "d"),
101            ("ധ", "dh"),
102            ("ന", "n"),
103            ("പ", "p"),
104            ("ഫ", "ph"),
105            ("ബ", "b"),
106            ("ഭ", "bh"),
107            ("മ", "m"),
108            ("യ", "y"),
109            ("ര", "r"),
110            ("ല", "l"),
111            ("വ", "v"),
112            ("ശ", "sh"),
113            ("ഷ", "sh"),
114            ("സ", "s"),
115            ("ഹ", "h"),
116            ("ള", "l"),
117            ("ഴ", "zh"),
118            ("റ", "r"),
119        ] {
120            consonants_map.insert(key, val);
121        }
122        let mut chill_map = HashMap::new();
123        for (key, val) in [
124            ("ൽ", "l"),
125            ("ൾ", "l"),
126            ("ൺ", "n"),
127            ("ൻ", "n"),
128            ("ർ", "r"),
129            ("ൿ", "k"),
130        ] {
131            chill_map.insert(key, val);
132        }
133        let mut modifiers_map = HashMap::new();
134        for (key, val) in [
135            ("ു്", "u"),
136            ("ാ", "aa"),
137            ("ി", "i"),
138            ("ീ", "ee"),
139            ("ു", "u"),
140            ("ൂ", "oo"),
141            ("ൃ", "ru"),
142            ("െ", "e"),
143            ("േ", "e"),
144            ("ൈ", "y"),
145            ("ൊ", "o"),
146            ("ോ", "o"),
147            ("ൌ", "ou"),
148            ("ൗ", "au"),
149            ("ഃ", "a"),
150        ] {
151            modifiers_map.insert(key, val);
152        }
153        CharacterMap {
154            vowels: vowels_map,
155            compounds: compounds_map,
156            consonants: consonants_map,
157            chill: chill_map,
158            modifiers: modifiers_map,
159        }
160    }
161
162    /// # replace_modified_glyphs
163    fn replace_modified_glyphs(&self, glyphs: &HashMap<&str, &str>, given_text: String) -> String {
164        let gk = glyphs.keys().map(|&k| k).collect::<Vec<_>>().join("|");
165        let mk = self
166            .modifiers
167            .keys()
168            .map(|&k| k)
169            .collect::<Vec<_>>()
170            .join("|");
171        let exp = regex::Regex::new(&format!("({gk})({mk})"))
172            .expect(&format!("E@{}: Unable to build regex", line!()));
173
174        let mut modified_text = given_text.to_owned();
175
176        for cap in exp.captures_iter(&given_text) {
177            let matched_str = cap.get(0).unwrap().as_str();
178            let matched_glyph = glyphs.get(cap.get(1).unwrap().as_str()).unwrap();
179            let matched_modifier = self.modifiers.get(cap.get(2).unwrap().as_str()).unwrap();
180            modified_text =
181                modified_text.replace(matched_str, &format!("{matched_glyph}{matched_modifier}"))
182        }
183
184        modified_text
185    }
186
187    /// # render
188    fn render(&self, given_text: String, caps: bool) -> String {
189        let mut modified_text = given_text.to_owned();
190
191        // replace zero width non joiners
192        // \u{200C} == \xE2\x80\x8C
193        modified_text = Regex::new("\u{200c}")
194            .expect(&format!("E@{}: Unable to build regex", line!()))
195            .replace_all(&modified_text, "")
196            .to_string();
197
198        modified_text = self.replace_modified_glyphs(&self.compounds, modified_text);
199        modified_text = self.replace_modified_glyphs(&self.vowels, modified_text);
200        modified_text = self.replace_modified_glyphs(&self.consonants, modified_text);
201
202        // replace unmodified compounds
203        for (key, val) in &self.compounds {
204            // compounds ending in chandrakkala but not at the end of the word
205            modified_text = Regex::new(&format!("{key}്([\\w])"))
206                .expect(&format!("E@{}: Unable to build regex", line!()))
207                .replace_all(&modified_text, format!("{val}$1"))
208                .to_string();
209            // compounds ending in chandrakkala have +'u' pronunciation
210            modified_text = modified_text.replace(&format!("{key}്"), &format!("{val}u"));
211            // compounds not ending in chandrakkala have +'a' pronunciation
212            modified_text = modified_text.replace(key, &format!("{val}a"));
213        }
214
215        // glyphs not ending in chandrakkala have +'a' pronunciation
216        for (key, val) in &self.consonants {
217            modified_text = FancyRegex::new(&format!("{key}(?!്)"))
218                .expect(&format!("E@{}: Unable to build regex", line!()))
219                .replace_all(&modified_text, format!("{val}a"))
220                .to_string();
221        }
222
223        // glyphs ending in chandrakkala not at the end of a word
224        for (key, val) in &self.consonants {
225            modified_text = FancyRegex::new(&format!("{key}്(?![\\s\\)\\.;,\"'\\/\\\\%\\!])"))
226                .expect(&format!("E@{}: Unable to build regex", line!()))
227                .replace_all(&modified_text, format!("{val}"))
228                .to_string();
229        }
230        // println!("{modified_text}");
231
232        // remaining glyphs ending in chandrakkala will be at end of words and have a +'u' pronunciation
233        for (key, val) in &self.consonants {
234            modified_text = modified_text.replace(&format!("{key}്"), &format!("{val}u"));
235        }
236
237        // remaining consonants
238        for (key, val) in &self.consonants {
239            modified_text = modified_text.replace(key, val);
240        }
241
242        // remaining vowels
243        for (key, val) in &self.vowels {
244            modified_text = modified_text.replace(key, val);
245        }
246
247        // chill glyphs
248        for (key, val) in &self.chill {
249            modified_text = modified_text.replace(key, val);
250        }
251
252        // anusvaram 'am' at the end
253        modified_text = modified_text.replace("ം", "m");
254
255        // replace any stray modifiers that may have been left out
256        for (key, val) in &self.modifiers {
257            modified_text = modified_text.replace(key, val);
258        }
259
260        if !caps {
261            return modified_text;
262        }
263
264        // capitalize first letter of modified_text for better aesthetics
265        modified_text
266            .split_inclusive(&['.', '!', '?'])
267            .map(|s| capitalize(s.trim()))
268            .collect::<Vec<_>>()
269            .join(" ")
270    }
271
272    /// # transliterate
273    /// Transliterate Malayalam to English
274    ///
275    /// Usage:
276    /// ```rust
277    /// use mal2eng::CharacterMap;
278    ///
279    /// fn main() {
280    ///     let m2e = CharacterMap::init();
281    ///     let res = m2e.transliterate("മലയാളത്തിലെ നിങ്ങളുടെ വാക്കുകൾ", true);
282    ///     println!("{}", res);
283    /// }
284    /// ````
285    pub fn transliterate(&self, text: &str, capitalize: bool) -> String {
286        self.render(text.to_string(), capitalize)
287    }
288}
289
/// Returns `sentence` with its first character converted to upper case.
///
/// The rest of the string is copied unchanged; an empty input gives an empty
/// string. The upper-case form may be longer than one character.
fn capitalize(sentence: &str) -> String {
    let mut capitalized = String::with_capacity(sentence.len());
    if let Some(head) = sentence.chars().next() {
        capitalized.extend(head.to_uppercase());
        capitalized.push_str(&sentence[head.len_utf8()..]);
    }
    capitalized
}