// bamboo_core/input_method.rs

1use phf::{Map, phf_map};
2
3use crate::input_method_def::InputMethodDef;
4use crate::utils::{add_mark_to_toneless_char, add_tone_to_char, is_vowel};
5
/// Tone mark that can be carried by a Vietnamese vowel.
///
/// The enum is `#[repr(u8)]` with explicit discriminants, so a variant can be
/// converted to its numeric code with `as u8`.
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Tone {
    /// Absence of any tone mark.
    None = 0,
    /// Grave accent.
    Grave = 1,
    /// Acute accent.
    Acute = 2,
    /// Hook above.
    Hook = 3,
    /// Tilde.
    Tilde = 4,
    /// Dot below.
    Dot = 5,
}
23
/// Diacritic mark that alters the base letter itself (as opposed to a tone).
///
/// The enum is `#[repr(u8)]` with explicit discriminants, so a variant can be
/// converted to its numeric code with `as u8`.
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Mark {
    /// Plain letter without a diacritic.
    None = 0,
    /// Circumflex, as in Â, Ê, Ô.
    Hat = 1,
    /// Breve, as in Ă.
    Breve = 2,
    /// Horn, as in Ư, Ơ.
    Horn = 3,
    /// Dash (stroke), as in Đ.
    Dash = 4,
    /// Special marker used for raw character restoration.
    Raw = 5,
}
41
/// Kind of change a rule performs when its key is pressed.
///
/// The default variant is [`EffectType::Appending`].
#[repr(u8)]
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
pub enum EffectType {
    /// Plain typing: a character is appended.
    #[default]
    Appending = 0,
    /// A diacritic mark is added or changed.
    MarkTransformation = 1,
    /// A tone mark is added or changed.
    ToneTransformation = 2,
    /// One character is swapped for another.
    Replacing = 3,
}
56
57static TONES: Map<&'static str, Tone> = phf_map! {
58    "XoaDauThanh" => Tone::None,
59    "DauSac" => Tone::Acute,
60    "DauHuyen" => Tone::Grave,
61    "DauNga" => Tone::Tilde,
62    "DauNang" => Tone::Dot,
63    "DauHoi" => Tone::Hook,
64};
65
66/// A transformation rule that defines how a key press affects the composition.
67#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash)]
68pub struct Rule {
69    /// The key that triggers this rule.
70    pub key: char,
71    /// Effect value:
72    /// - if `effect_type == ToneTransformation`: this is a [`Tone`] as `u8`
73    /// - if `effect_type == MarkTransformation`: this is a [`Mark`] as `u8`
74    pub effect: u8,
75    /// The type of transformation to apply.
76    pub effect_type: EffectType,
77    /// The character that this rule targets (to be replaced or marked).
78    pub effect_on: char,
79    /// The resulting character after applying the transformation.
80    pub result: char,
81    /// Additional characters to append immediately after this one (used for multi-character shortcuts).
82    pub appended: [char; 2],
83    /// Number of characters in `appended`.
84    pub appended_len: u8,
85}
86
87impl Rule {
88    /// Sets the effect value from a [`Tone`].
89    pub fn set_tone(&mut self, tone: Tone) {
90        self.effect = tone as u8;
91    }
92
93    /// Sets the effect value from a [`Mark`].
94    pub fn set_mark(&mut self, mark: Mark) {
95        self.effect = mark as u8;
96    }
97
98    /// Retrieves the effect value as a [`Tone`].
99    pub fn get_tone(&self) -> Tone {
100        // Safety: effect is created by parser or engine.
101        match self.effect {
102            1 => Tone::Grave,
103            2 => Tone::Acute,
104            3 => Tone::Hook,
105            4 => Tone::Tilde,
106            5 => Tone::Dot,
107            _ => Tone::None,
108        }
109    }
110
111    /// Retrieves the effect value as a [`Mark`].
112    pub fn get_mark(&self) -> Mark {
113        match self.effect {
114            1 => Mark::Hat,
115            2 => Mark::Breve,
116            3 => Mark::Horn,
117            4 => Mark::Dash,
118            5 => Mark::Raw,
119            _ => Mark::None,
120        }
121    }
122}
123
124/// A collection of rules defining how keys transform text.
125///
126/// Use the provided static methods (e.g., [`InputMethod::telex()`]) to get
127/// standard Vietnamese input methods.
128#[derive(Clone, Debug, Default)]
129pub struct InputMethod {
130    /// The name of the input method.
131    pub name: String,
132    /// The complete list of transformation rules.
133    pub rules: Vec<Rule>,
134    /// Keys that can affect multiple vowels at once (e.g., 'w' in Telex).
135    pub super_keys: Vec<char>,
136    /// Keys that apply tone marks.
137    pub tone_keys: Vec<char>,
138    /// Keys that primarily append characters.
139    pub appending_keys: Vec<char>,
140    /// All keys that have at least one rule associated with them.
141    pub keys: Vec<char>,
142}
143
144impl InputMethod {
145    /// Standard Telex input method.
146    pub fn telex() -> Self {
147        parse_input_method("Telex")
148    }
149
150    /// Standard VNI input method (using number keys).
151    pub fn vni() -> Self {
152        parse_input_method("VNI")
153    }
154
155    /// Standard VIQR input method.
156    pub fn viqr() -> Self {
157        parse_input_method("VIQR")
158    }
159
160    /// Microsoft Standard Vietnamese keyboard layout.
161    pub fn microsoft_layout() -> Self {
162        parse_input_method("Microsoft layout")
163    }
164
165    /// Telex variant that also supports `[` and `]` keys.
166    pub fn telex_2() -> Self {
167        parse_input_method("Telex 2")
168    }
169
170    /// Combined Telex and VNI.
171    pub fn telex_vni() -> Self {
172        parse_input_method("Telex + VNI")
173    }
174
175    /// Combined Telex, VNI, and VIQR.
176    pub fn telex_vni_viqr() -> Self {
177        parse_input_method("Telex + VNI + VIQR")
178    }
179
180    /// VNI for French keyboard layouts.
181    pub fn vni_french_layout() -> Self {
182        parse_input_method("VNI Bàn phím tiếng Pháp")
183    }
184
185    /// Telex variant using `w` for marks and `z` for tone removal.
186    pub fn telex_w() -> Self {
187        parse_input_method("Telex W")
188    }
189}
190
191/// Parse a known input method by name from the built-in definitions.
192pub(crate) fn parse_input_method(im_name: &str) -> InputMethod {
193    let defs = crate::input_method_def::get_input_method_definitions();
194    defs.get(im_name).copied().map(|def| parse_input_method_def(im_name, def)).unwrap_or_default()
195}
196
197/// Parses an input method definition from its structured format.
198pub(crate) fn parse_input_method_def(im_name: &str, im_def: &InputMethodDef) -> InputMethod {
199    let mut im = InputMethod { name: im_name.to_string(), ..Default::default() };
200
201    for (key_str, line) in im_def.entries() {
202        let key = match key_str.chars().next() {
203            Some(c) => c,
204            None => continue,
205        };
206
207        im.rules.extend(parse_rules(key, line));
208
209        if contains_uo_case_insensitive(line) {
210            im.super_keys.push(key);
211        }
212        im.keys.push(key);
213    }
214
215    for rule in &im.rules {
216        if rule.effect_type == EffectType::Appending {
217            im.appending_keys.push(rule.key);
218        }
219        if rule.effect_type == EffectType::ToneTransformation {
220            im.tone_keys.push(rule.key);
221        }
222    }
223
224    im
225}
226
/// Reports whether `s` contains the letter `u` immediately followed by `o`,
/// comparing with ASCII case-insensitivity (so "uo", "UO", "Uo", "uO" all match).
#[inline]
fn contains_uo_case_insensitive(s: &str) -> bool {
    // Pair every character with its successor and look for a (u, o) pair.
    s.chars()
        .zip(s.chars().skip(1))
        .any(|(first, second)| {
            first.eq_ignore_ascii_case(&'u') && second.eq_ignore_ascii_case(&'o')
        })
}
239
240/// Parses rules for a given key based on its definition line.
241pub(crate) fn parse_rules(key: char, line: &str) -> Vec<Rule> {
242    if let Some(tone) = TONES.get(line).copied() {
243        return vec![Rule {
244            key,
245            effect_type: EffectType::ToneTransformation,
246            effect: tone as u8,
247            effect_on: '\0',
248            result: '\0',
249            appended: ['\0'; 2],
250            appended_len: 0,
251        }];
252    }
253
254    parse_toneless_rules(key, line)
255}
256
257/// Parses rules that don't involve tone transformations.
258pub(crate) fn parse_toneless_rules(key: char, line: &str) -> Vec<Rule> {
259    let lower = line.to_lowercase();
260
261    if let Some((effective_ons, results, rest)) = parse_dsl(&lower) {
262        let mut rules = Vec::new();
263        for (effective_on, result) in effective_ons.into_iter().zip(results.into_iter()) {
264            let Some(effect) = find_mark_from_char(result) else {
265                continue;
266            };
267            rules.extend(parse_toneless_rule(key, effective_on, result, effect));
268        }
269
270        if let Some(rule) = get_appending_rule(key, rest) {
271            rules.push(rule);
272        }
273
274        return rules;
275    }
276
277    if let Some(rule) = get_appending_rule(key, line) {
278        return vec![rule];
279    }
280
281    Vec::new()
282}
283
284fn parse_toneless_rule(key: char, effective_on: char, result: char, effect: Mark) -> Vec<Rule> {
285    let mut rules = Vec::new();
286
287    for chr in get_mark_family(effective_on) {
288        if chr == result {
289            rules.push(Rule {
290                key,
291                effect_type: EffectType::MarkTransformation,
292                effect: 0,
293                effect_on: result,
294                result: effective_on,
295                appended: ['\0'; 2],
296                appended_len: 0,
297            });
298            continue;
299        }
300
301        if is_vowel(chr) {
302            for tone in 0u8..=5 {
303                rules.push(Rule {
304                    key,
305                    effect_type: EffectType::MarkTransformation,
306                    effect_on: add_tone_to_char(chr, tone),
307                    effect: effect as u8,
308                    result: add_tone_to_char(result, tone),
309                    appended: ['\0'; 2],
310                    appended_len: 0,
311                });
312            }
313        } else {
314            rules.push(Rule {
315                key,
316                effect_type: EffectType::MarkTransformation,
317                effect_on: chr,
318                effect: effect as u8,
319                result,
320                appended: ['\0'; 2],
321                appended_len: 0,
322            });
323        }
324    }
325
326    rules
327}
328
/// Splits a mark-DSL line shaped like `([a-zA-Z]+)_(\p{L}+)([_\p{L}]*)`.
///
/// On success returns `(targets, results, rest)`:
/// - `targets`: the ASCII letters before the first `_`;
/// - `results`: the leading run of alphabetic characters after that `_`;
/// - `rest`: whatever follows the run, borrowed from `s` (possibly empty).
///
/// Returns `None` when there is no `_`, when the part before it is empty or
/// contains a non-ASCII-alphabetic character, or when no alphabetic character
/// directly follows the `_`.
fn parse_dsl(s: &str) -> Option<(Vec<char>, Vec<char>, &str)> {
    let (targets, tail) = s.split_once('_')?;
    if targets.is_empty() || targets.chars().any(|c| !c.is_ascii_alphabetic()) {
        return None;
    }

    // Byte offset of the first non-letter; the whole tail when it is all letters.
    let letters_end = tail.find(|c: char| !c.is_alphabetic()).unwrap_or(tail.len());
    if letters_end == 0 {
        return None;
    }

    let (letters, rest) = tail.split_at(letters_end);
    Some((targets.chars().collect(), letters.chars().collect(), rest))
}
355
356/// Parse: `(_?)_(\p{L}+)`.
357fn get_appending_rule(key: char, value: &str) -> Option<Rule> {
358    if !value.starts_with('_') {
359        return None;
360    }
361
362    // "_x" or "__x" forms.
363    let start = if value.starts_with("__") { 2 } else { 1 };
364    let tail = value.get(start..)?;
365
366    let mut letters = Vec::new();
367    for ch in tail.chars() {
368        if ch.is_alphabetic() {
369            letters.push(ch);
370        } else {
371            break;
372        }
373    }
374
375    let first = *letters.first()?;
376
377    let mut appended = ['\0'; 2];
378    let mut appended_len = 0u8;
379    for &ch in letters.iter().skip(1) {
380        if (appended_len as usize) < appended.len() {
381            appended[appended_len as usize] = ch;
382            appended_len += 1;
383        }
384    }
385
386    Some(Rule {
387        key,
388        effect_type: EffectType::Appending,
389        effect: 0,
390        effect_on: first,
391        result: first,
392        appended,
393        appended_len,
394    })
395}
396
397fn get_mark_family(c: char) -> Vec<char> {
398    let base = add_tone_to_char(c, 0);
399    let canonical = add_mark_to_toneless_char(base, 0);
400
401    // Marks are 0..=4 in utils' internal mark table.
402    let mut family: Vec<char> =
403        (0u8..=4).map(|m| add_mark_to_toneless_char(canonical, m)).collect();
404
405    family.sort_unstable();
406    family.dedup();
407    family
408}
409
410fn find_mark_from_char(c: char) -> Option<Mark> {
411    let c = c.to_lowercase().next().unwrap_or(c);
412    let toneless = add_tone_to_char(c, 0);
413    let base = add_mark_to_toneless_char(toneless, 0);
414
415    for m in 0u8..=4 {
416        if add_mark_to_toneless_char(base, m) == toneless {
417            return Some(match m {
418                1 => Mark::Hat,
419                2 => Mark::Breve,
420                3 => Mark::Horn,
421                4 => Mark::Dash,
422                _ => Mark::None,
423            });
424        }
425    }
426
427    None
428}
429
#[cfg(test)]
mod tests {
    use super::*;

    /// Tone keywords produce exactly one tone rule carrying the matching tone.
    #[test]
    fn parse_tone_rules() {
        let clear_tone = parse_rules('z', "XoaDauThanh");
        assert_eq!(clear_tone.len(), 1);
        assert_eq!(clear_tone[0].effect_type, EffectType::ToneTransformation);
        assert_eq!(clear_tone[0].effect, Tone::None as u8);

        let tilde = parse_rules('x', "DauNga");
        assert_eq!(tilde.len(), 1);
        assert_eq!(tilde[0].effect_type, EffectType::ToneTransformation);
        assert_eq!(tilde[0].get_tone(), Tone::Tilde);
    }

    /// Mark DSL lines expand to the expected number and order of rules.
    #[test]
    fn parse_toneless_rules_cases() {
        let dash = parse_toneless_rules('d', "D_Đ");
        assert_eq!(dash.len(), 2);
        assert_eq!(dash[0].effect_type, EffectType::MarkTransformation);
        assert_eq!(dash[0].effect, Mark::Dash as u8);
        assert_eq!(dash[0].effect_on, 'd');

        let append_only = parse_toneless_rules('{', "_Ư");
        assert_eq!(append_only.len(), 1);
        assert_eq!(append_only[0].effect_type, EffectType::Appending);
        assert_eq!(append_only[0].effect_on, 'Ư');

        let multi = parse_toneless_rules('w', "UOA_ƯƠĂ");
        assert_eq!(multi.len(), 33);
        // (rule index, expected mark, expected target) at the start of each vowel group.
        for (index, mark, target) in
            [(0, Mark::Horn, 'u'), (7, Mark::Horn, 'o'), (20, Mark::Breve, 'a')]
        {
            assert_eq!(multi[index].effect_type, EffectType::MarkTransformation);
            assert_eq!(multi[index].get_mark(), mark);
            assert_eq!(multi[index].effect_on, target);
        }

        let multi_with_append = parse_toneless_rules('w', "UOA_ƯƠĂ__Ư");
        assert_eq!(multi_with_append.len(), 34);
        assert_eq!(multi_with_append[20].effect_type, EffectType::MarkTransformation);
        assert_eq!(multi_with_append[20].get_mark(), Mark::Breve);
        assert_eq!(multi_with_append[20].effect_on, 'a');
        // The trailing appending rule comes from the lowercased remainder.
        assert_eq!(multi_with_append[33].effect_type, EffectType::Appending);
        assert_eq!(multi_with_append[33].effect_on, 'ư');
    }

    /// Letters after the first one end up in `appended`, with case preserved.
    #[test]
    fn parse_append_rule() {
        let lower = parse_toneless_rules('[', "__ươ");
        assert_eq!(lower.len(), 1);
        assert_eq!(lower[0].appended_len, 1);
        assert_eq!(lower[0].appended[0], 'ơ');

        let upper = parse_toneless_rules('{', "__ƯƠ");
        assert_eq!(upper.len(), 1);
        assert_eq!(upper[0].appended_len, 1);
        assert_eq!(upper[0].appended[0], 'Ơ');
    }

    /// Telex's 'w' is registered as a super key.
    #[test]
    fn parse_input_method_super_key_detection() {
        let telex = parse_input_method("Telex");
        assert!(telex.super_keys.contains(&'w'));
    }

    #[test]
    fn parse_telex_o_hat_rule_exists() {
        // In Telex, typing 'o' after an existing 'o' should be able to mark it as 'ô'.
        let rules = parse_toneless_rules('o', "O_Ô");

        let has_hat_rule = rules.iter().any(|rule| {
            rule.effect_type == EffectType::MarkTransformation
                && rule.get_mark() == Mark::Hat
                && rule.effect_on == 'o'
                && rule.result == 'ô'
        });
        assert!(has_hat_rule);

        let has_appending_rule =
            rules.iter().any(|rule| rule.effect_type == EffectType::Appending);
        assert!(!has_appending_rule);
    }

    /// In "Telex 2" the 'o' key has rules, none of which is an appending rule.
    #[test]
    fn telex2_has_no_appending_rule_for_o() {
        let telex2 = parse_input_method("Telex 2");
        let o_rules: Vec<_> = telex2.rules.iter().filter(|rule| rule.key == 'o').collect();
        assert!(!o_rules.is_empty());
        assert!(!o_rules.iter().any(|rule| rule.effect_type == EffectType::Appending));
    }
}