Skip to main content

name_variants/
lib.rs

1//! Multilingual name romanization lookup tables.
2//!
3//! Maps romanization variants to their canonical native-script key so that
4//! `Chen`, `Chan`, and `Tan` all resolve to `陈`.
5
6mod generated;
7
8#[cfg(target_arch = "wasm32")]
9mod wasm;
10
11/// Return the canonical script-form key for a name, or `None` if unknown.
12///
13/// Matching order:
14/// 1. Exact match (handles native script input like `陈`)
15/// 2. Lowercase match (handles `Chan`, `CHAN`)
16/// 3. Token-by-token (handles `"Chan Wai Ming"` → checks `chan`)
17///
18/// # Examples
19/// ```
20/// use name_variants::lookup_key;
21/// assert_eq!(lookup_key("Chan"), Some("陈"));
22/// assert_eq!(lookup_key("Smith"), None);
23/// ```
24pub fn lookup_key(text: &str) -> Option<&'static str> {
25    if text.is_empty() {
26        return None;
27    }
28
29    // 1. Exact match (handles native script keys: 陈, 박, محمد)
30    if let Some(key) = generated::INDEX.get(text) {
31        return Some(key);
32    }
33
34    // 2. Lowercase match (handles Chan, CHAN, chan)
35    let lower = text.to_lowercase();
36    if let Some(key) = generated::INDEX.get(lower.as_str()) {
37        return Some(key);
38    }
39
40    // 3. Token-by-token: "Chan Wai Ming" → try each token
41    for token in lower.split_whitespace() {
42        if let Some(key) = generated::INDEX.get(token) {
43            return Some(key);
44        }
45    }
46
47    None
48}
49
50/// Return the canonical key and all known variants for a name, or `None` if unknown.
51///
52/// # Examples
53/// ```
54/// use name_variants::lookup_all;
55/// let (key, variants) = lookup_all("Chan").unwrap();
56/// assert_eq!(key, "陈");
57/// assert!(variants.contains(&"chen"));
58/// ```
59pub fn lookup_all(text: &str) -> Option<(&'static str, &'static [&'static str])> {
60    let key = lookup_key(text)?;
61    let variants = generated::VARIANTS.get(key)?;
62    Some((key, variants))
63}
64
65/// Return `(language, forms_slice)` for a canonical storage key, or `None` if unknown.
66///
67/// Used by the PyO3 extension to build `{"language": "...", "forms": [...]}` dicts.
68pub fn get_cluster_info(canonical_key: &str) -> Option<(&'static str, &'static [&'static str])> {
69    let language = generated::LANGUAGE.get(canonical_key)?;
70    let forms = generated::VARIANTS.get(canonical_key)?;
71    Some((language, forms))
72}
73
74/// Return all canonical keys that list this romanization as a variant.
75///
76/// Unlike [`lookup_key`], which returns one result via first-write-wins,
77/// this returns every canonical key across all 15 language tables that
78/// lists the input as a variant — ordered by table iteration order.
79///
80/// # Examples
81/// ```
82/// use name_variants::lookup_candidates;
83/// let candidates = lookup_candidates("Lee");
84/// assert!(candidates.contains(&"李"));
85/// assert!(candidates.contains(&"이"));
86/// assert!(lookup_candidates("Smith").is_empty());
87/// ```
88pub fn lookup_candidates(text: &str) -> Vec<&'static str> {
89    if text.is_empty() {
90        return Vec::new();
91    }
92    let mut seen: std::collections::HashSet<&'static str> = std::collections::HashSet::new();
93    let mut result: Vec<&'static str> = Vec::new();
94
95    let mut collect = |lookup_key: &str| {
96        if let Some(cands) = generated::CANDIDATES.get(lookup_key) {
97            for &c in *cands {
98                if seen.insert(c) {
99                    result.push(c);
100                }
101            }
102        }
103    };
104
105    let key = text.trim();
106    collect(key);
107    let key_lower = key.to_lowercase();
108    if key_lower != key {
109        collect(key_lower.as_str());
110    }
111    for token in key_lower.split_whitespace() {
112        collect(token);
113    }
114    result
115}
116
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that two inputs resolve to the same *known* canonical key.
    ///
    /// A bare `assert_eq!(lookup_key(a), lookup_key(b))` also passes when both
    /// lookups return `None`, so the first lookup is required to succeed.
    #[track_caller]
    fn assert_same_known_key(first: &str, second: &str) {
        let key = lookup_key(first);
        assert!(key.is_some(), "{:?} should resolve to a canonical key", first);
        assert_eq!(
            key,
            lookup_key(second),
            "{:?} and {:?} should resolve to the same canonical key",
            first,
            second
        );
    }

    // ── Chinese ──────────────────────────────────────────────────────────────
    #[test]
    fn chan_and_chen_same_key() {
        assert_same_known_key("chan", "chen");
    }

    #[test]
    fn chan_resolves_to_simplified_chinese() {
        assert_eq!(lookup_key("chan"), Some("陈"));
    }

    #[test]
    fn hui_and_xu_same_key() {
        // 許/许: Xu (Mandarin), Hui (Cantonese), Kho (Hokkien)
        assert_same_known_key("hui", "xu");
    }

    #[test]
    fn wang_and_wong_same_key() {
        assert_same_known_key("wang", "wong");
    }

    // ── Korean ───────────────────────────────────────────────────────────────
    #[test]
    fn park_and_bak_same_key() {
        assert_same_known_key("park", "bak");
    }

    #[test]
    fn yi_and_rhee_same_key() {
        // "lee" also maps to Chinese 李 in the data, so use yi/rhee — both are Korean-only variants of 이
        assert_same_known_key("yi", "rhee");
    }

    // ── Arabic ───────────────────────────────────────────────────────────────
    #[test]
    fn muhammad_and_mohammed_same_key() {
        assert_same_known_key("muhammad", "mohammed");
    }

    // ── Russian ──────────────────────────────────────────────────────────────
    #[test]
    fn ivanov_and_ivanoff_same_key() {
        assert_same_known_key("ivanov", "ivanoff");
    }

    // ── Case-insensitive ─────────────────────────────────────────────────────
    #[test]
    fn uppercase_input_matches() {
        assert_eq!(lookup_key("CHAN"), Some("陈"));
    }

    #[test]
    fn mixed_case_input_matches() {
        assert_eq!(lookup_key("Chan"), Some("陈"));
    }

    // ── Multi-word token split ────────────────────────────────────────────────
    #[test]
    fn full_name_resolves_via_token_split() {
        // "chan wai ming" → first token "chan" matches
        assert_eq!(lookup_key("Chan Wai Ming"), Some("陈"));
    }

    #[test]
    fn korean_full_name_resolves() {
        assert_same_known_key("Park Ji-sung", "park");
    }

    // ── Unknown names ────────────────────────────────────────────────────────
    #[test]
    fn unknown_returns_none() {
        assert_eq!(lookup_key("Smith"), None);
        assert_eq!(lookup_key("Kowalski"), None);
        assert_eq!(lookup_key(""), None);
    }

    #[test]
    fn all_unknown_tokens_returns_none() {
        // "john" maps to Greek Ιωάννης — use names genuinely absent from the dataset
        assert_eq!(lookup_key("Kowalski Smith"), None);
    }

    // ── lookup_all ────────────────────────────────────────────────────────────
    #[test]
    fn lookup_all_chan_returns_variants() {
        let (key, variants) = lookup_all("Chan").unwrap();
        assert_eq!(key, "陈");
        assert!(variants.contains(&"chen"));
        assert!(variants.contains(&"陳"));
    }

    #[test]
    fn lookup_all_unknown_returns_none() {
        assert!(lookup_all("Smith").is_none());
    }

    // ── lookup_candidates ─────────────────────────────────────────────────────
    #[test]
    fn lookup_candidates_lee_returns_multiple_scripts() {
        let result = lookup_candidates("Lee");
        assert!(result.contains(&"李"), "should contain Chinese 李");
        assert!(result.contains(&"이"), "should contain Korean 이");
    }

    #[test]
    fn lookup_candidates_unknown_returns_empty() {
        assert!(lookup_candidates("Smith").is_empty());
        assert!(lookup_candidates("").is_empty());
    }

    #[test]
    fn lookup_candidates_known_name_returns_non_empty() {
        // Only non-emptiness is asserted; the exact candidate count depends on the generated data.
        let result = lookup_candidates("Nguyen");
        assert!(!result.is_empty());
    }
}