Skip to main content

nexo_tool_meta/
locale.rs

1//! BCP-47 locale subset for agent language configuration.
2//!
3//! The SDK ships a closed-enum locale model — every recognised
4//! language and region is enumerated explicitly in [`LangCode`] and
5//! [`RegionCode`]. Adding new locales requires a code change so
6//! that the per-locale system addenda + voice picker tables stay
7//! in lock-step (an exhaustive `match` over the enums prevents
8//! "parse-but-no-addendum" gaps).
9//!
10//! ## Why a string-backed value type?
11//!
12//! [`Locale`] stores the canonical BCP-47 string (`es-AR`, not
13//! `ES_ar`) so the wire shape stays transparent — call sites that
14//! already serialise `Option<String>` (e.g.
15//! `nexo_tool_meta::reply_kind::OutboundReplyContext::language`)
16//! keep working unchanged. Consumers parse the string back into a
17//! [`Locale`] on the receiving side.
18//!
19//! See `crates/microapp-sdk/src/voice/locale_addenda.rs` for the
20//! addendum + voice picker tables that consume this type.
21
22use thiserror::Error;
23
/// Language subtags the SDK recognises.
///
/// The set is closed on purpose: supporting another language means
/// adding a variant here (and to every `match` over this enum).
/// Variants are kept in alphabetical order of their serialised
/// ISO-639-1 code so diffs stay small.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LangCode {
    /// German (`de`).
    De,
    /// English (`en`).
    En,
    /// Spanish (`es`).
    Es,
    /// French (`fr`).
    Fr,
    /// Italian (`it`).
    It,
    /// Japanese (`ja`).
    Ja,
    /// Portuguese (`pt`).
    Pt,
    /// Chinese (`zh`). Simplified script is assumed; script-tagged
    /// input such as `zh-Hant` is rejected in v1.
    Zh,
}

impl LangCode {
    /// Returns the lowercase two-letter ISO-639-1 code for this
    /// language (`"es"`, `"en"`, `"pt"`, …).
    pub fn as_str(&self) -> &'static str {
        // Exhaustive on purpose: a new variant must pick its code here.
        match *self {
            LangCode::De => "de",
            LangCode::En => "en",
            LangCode::Es => "es",
            LangCode::Fr => "fr",
            LangCode::It => "it",
            LangCode::Ja => "ja",
            LangCode::Pt => "pt",
            LangCode::Zh => "zh",
        }
    }
}
63
/// Region subtags the SDK recognises.
///
/// Like [`LangCode`], the set is closed: the regions listed are the
/// ones the voice picker table has a region-matched Edge voice for.
/// Variants are kept in alphabetical order and serialise as uppercase
/// two-letter ISO-3166-1 alpha-2 codes.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RegionCode {
    /// Argentina (`AR`).
    Ar,
    /// Australia (`AU`).
    Au,
    /// Brazil (`BR`).
    Br,
    /// Canada (`CA`).
    Ca,
    /// Chile (`CL`).
    Cl,
    /// China (`CN`).
    Cn,
    /// Colombia (`CO`).
    Co,
    /// Germany (`DE`).
    De,
    /// Spain (`ES`).
    Es,
    /// France (`FR`).
    Fr,
    /// United Kingdom (`GB`).
    Gb,
    /// Italy (`IT`).
    It,
    /// Japan (`JP`).
    Jp,
    /// Mexico (`MX`).
    Mx,
    /// Peru (`PE`).
    Pe,
    /// Portugal (`PT`).
    Pt,
    /// United States (`US`).
    Us,
}

impl RegionCode {
    /// Returns the uppercase two-letter ISO-3166-1 alpha-2 code for
    /// this region (`"AR"`, `"MX"`, …).
    pub fn as_str(&self) -> &'static str {
        // Exhaustive on purpose: a new variant must pick its code here.
        match *self {
            RegionCode::Ar => "AR",
            RegionCode::Au => "AU",
            RegionCode::Br => "BR",
            RegionCode::Ca => "CA",
            RegionCode::Cl => "CL",
            RegionCode::Cn => "CN",
            RegionCode::Co => "CO",
            RegionCode::De => "DE",
            RegionCode::Es => "ES",
            RegionCode::Fr => "FR",
            RegionCode::Gb => "GB",
            RegionCode::It => "IT",
            RegionCode::Jp => "JP",
            RegionCode::Mx => "MX",
            RegionCode::Pe => "PE",
            RegionCode::Pt => "PT",
            RegionCode::Us => "US",
        }
    }
}
131
132/// Parsed BCP-47 locale (subset).
133///
134/// Cheap to clone — wraps a single canonical [`String`]
135/// (`es-AR`, never `ES_ar`). Construct via [`std::str::FromStr`]
136/// or [`Locale::new`]; both routes guarantee the closed-enum set.
137#[derive(Debug, Clone, PartialEq, Eq, Hash)]
138pub struct Locale(String);
139
140impl Locale {
141    /// Build a locale from already-typed enum values. The caller
142    /// has already proved the language + region pair is valid; the
143    /// resulting [`Locale::as_bcp47`] is the canonical
144    /// `"<lang>-<REGION>"` (or `"<lang>"` when region is None).
145    pub fn new(language: LangCode, region: Option<RegionCode>) -> Self {
146        let raw = match region {
147            Some(r) => format!("{}-{}", language.as_str(), r.as_str()),
148            None => language.as_str().to_string(),
149        };
150        Self(raw)
151    }
152
153    /// Recover the [`LangCode`]. Infallible: every constructed
154    /// [`Locale`] has been parsed against the closed enum.
155    pub fn language(&self) -> LangCode {
156        // Safe-by-construction: `Self::new` and `FromStr` both
157        // build from a known-good `LangCode`. We re-parse the
158        // first segment instead of caching to keep the type
159        // small + `Copy`-clone-friendly via the inner String.
160        let head = self.0.split('-').next().unwrap_or("");
161        parse_language(head).expect("Locale invariant: language always valid")
162    }
163
164    /// Recover the [`RegionCode`] when present, `None` for
165    /// language-only locales (`"es"`, `"en"`, …).
166    pub fn region(&self) -> Option<RegionCode> {
167        let mut parts = self.0.split('-');
168        let _lang = parts.next();
169        let region = parts.next()?;
170        // `Self::new` and `FromStr` only produce already-validated
171        // region tokens, so this expect can never trip on a value
172        // that came through the public surface.
173        Some(parse_region(region).expect("Locale invariant: region always valid"))
174    }
175
176    /// The canonical BCP-47 string (`"es-AR"` / `"en-US"` /
177    /// `"es"`). Identical to [`Locale::to_string`] but without the
178    /// allocation when the caller can borrow.
179    pub fn as_bcp47(&self) -> &str {
180        &self.0
181    }
182
183    /// Drop the region subtag, returning the language-only locale.
184    /// Useful for fallback logic in voice picker / addendum tables.
185    pub fn language_only(&self) -> Locale {
186        Self::new(self.language(), None)
187    }
188
189    /// `true` when the locale carries no region subtag.
190    pub fn is_just_language(&self) -> bool {
191        !self.0.contains('-')
192    }
193
194    /// Iterate every locale that [`FromStr`] would accept: each
195    /// [`LangCode`] paired with `None`, then with each
196    /// [`RegionCode`]. Output is the full cross-product
197    /// (8 × (1 + 17) = 144 entries) regardless of voice picker /
198    /// addendum coverage; the lint script intersects this against
199    /// the curated frontend `SUPPORTED_LOCALES` list to catch drift.
200    ///
201    /// Order is deterministic (alphabetical by language code,
202    /// then language-only first, then alphabetical by region
203    /// code) so the dump is diff-stable.
204    pub fn iter_supported() -> impl Iterator<Item = Locale> {
205        const LANGS: &[LangCode] = &[
206            LangCode::De,
207            LangCode::En,
208            LangCode::Es,
209            LangCode::Fr,
210            LangCode::It,
211            LangCode::Ja,
212            LangCode::Pt,
213            LangCode::Zh,
214        ];
215        const REGIONS: &[RegionCode] = &[
216            RegionCode::Ar,
217            RegionCode::Au,
218            RegionCode::Br,
219            RegionCode::Ca,
220            RegionCode::Cl,
221            RegionCode::Cn,
222            RegionCode::Co,
223            RegionCode::De,
224            RegionCode::Es,
225            RegionCode::Fr,
226            RegionCode::Gb,
227            RegionCode::It,
228            RegionCode::Jp,
229            RegionCode::Mx,
230            RegionCode::Pe,
231            RegionCode::Pt,
232            RegionCode::Us,
233        ];
234        LANGS.iter().flat_map(|lang| {
235            std::iter::once(Locale::new(*lang, None)).chain(
236                REGIONS
237                    .iter()
238                    .map(move |region| Locale::new(*lang, Some(*region))),
239            )
240        })
241    }
242}
243
244impl std::str::FromStr for Locale {
245    type Err = LocaleParseError;
246
247    fn from_str(raw: &str) -> Result<Self, Self::Err> {
248        let trimmed = raw.trim();
249        if trimmed.is_empty() {
250            return Err(LocaleParseError::Empty);
251        }
252        // `es_AR` (Java/Microsoft style) accepted; canonical
253        // separator is `-`. Lowercase the language head, uppercase
254        // the region tail, refuse anything past the first region.
255        let normalised = trimmed.replace('_', "-");
256        let mut parts = normalised.split('-');
257        let lang_raw = parts.next().unwrap_or(""); // safe: non-empty trimmed
258        let region_raw = parts.next();
259        if parts.next().is_some() {
260            return Err(LocaleParseError::TooManySubtags(trimmed.to_string()));
261        }
262
263        let lang_lower = lang_raw.to_ascii_lowercase();
264        let language = parse_language(&lang_lower)
265            .ok_or_else(|| LocaleParseError::UnknownLanguage(lang_raw.to_string()))?;
266
267        let region = match region_raw {
268            None => None,
269            Some(r) => {
270                let upper = r.to_ascii_uppercase();
271                let region = parse_region(&upper).ok_or_else(|| {
272                    LocaleParseError::UnknownRegion(language.as_str().to_string(), r.to_string())
273                })?;
274                Some(region)
275            }
276        };
277
278        Ok(Self::new(language, region))
279    }
280}
281
282impl std::fmt::Display for Locale {
283    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
284        f.write_str(&self.0)
285    }
286}
287
// Hand-written serde impls give `Locale` the same wire shape that
// `#[serde(transparent)]` would: a bare JSON / YAML string
// (`"es-AR"`), compatible with the existing
// `OutboundReplyContext.language: Option<String>` field. Unlike a
// derived transparent impl, deserialisation goes through `FromStr`,
// so input is validated and canonicalised (`"es_ar"` → `"es-AR"`);
// `serde_json::to_string` of a `Locale` is always the canonical
// form, e.g. `"\"es-AR\""`.
293impl serde::Serialize for Locale {
294    fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
295        s.serialize_str(&self.0)
296    }
297}
298
299impl<'de> serde::Deserialize<'de> for Locale {
300    fn deserialize<D: serde::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
301        use serde::de::Error;
302        let raw = String::deserialize(d)?;
303        raw.parse().map_err(D::Error::custom)
304    }
305}
306
307fn parse_language(s: &str) -> Option<LangCode> {
308    Some(match s {
309        "de" => LangCode::De,
310        "en" => LangCode::En,
311        "es" => LangCode::Es,
312        "fr" => LangCode::Fr,
313        "it" => LangCode::It,
314        "ja" => LangCode::Ja,
315        "pt" => LangCode::Pt,
316        "zh" => LangCode::Zh,
317        _ => return None,
318    })
319}
320
321fn parse_region(s: &str) -> Option<RegionCode> {
322    Some(match s {
323        "AR" => RegionCode::Ar,
324        "AU" => RegionCode::Au,
325        "BR" => RegionCode::Br,
326        "CA" => RegionCode::Ca,
327        "CL" => RegionCode::Cl,
328        "CN" => RegionCode::Cn,
329        "CO" => RegionCode::Co,
330        "DE" => RegionCode::De,
331        "ES" => RegionCode::Es,
332        "FR" => RegionCode::Fr,
333        "GB" => RegionCode::Gb,
334        "IT" => RegionCode::It,
335        "JP" => RegionCode::Jp,
336        "MX" => RegionCode::Mx,
337        "PE" => RegionCode::Pe,
338        "PT" => RegionCode::Pt,
339        "US" => RegionCode::Us,
340        _ => return None,
341    })
342}
343
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the `FromStr` parser under test.
    fn parse(input: &str) -> Result<Locale, LocaleParseError> {
        input.parse()
    }

    /// Parses input the test expects to be accepted; panics otherwise.
    fn accepted(input: &str) -> Locale {
        parse(input).unwrap()
    }

    // ── Parser: valid inputs ────────────────────────────────────

    #[test]
    fn parses_language_only() {
        let locale = accepted("es");
        assert_eq!(locale.language(), LangCode::Es);
        assert!(locale.region().is_none());
        assert_eq!(locale.as_bcp47(), "es");
    }

    #[test]
    fn parses_full_locale() {
        let locale = accepted("es-AR");
        assert_eq!(locale.language(), LangCode::Es);
        assert_eq!(locale.region(), Some(RegionCode::Ar));
        assert_eq!(locale.as_bcp47(), "es-AR");
    }

    #[test]
    fn parses_underscore_separator_canonicalises_to_hyphen() {
        assert_eq!(accepted("es_AR").as_bcp47(), "es-AR");
    }

    #[test]
    fn parses_mixed_case_canonicalises() {
        assert_eq!(accepted("ES-ar").as_bcp47(), "es-AR");
    }

    #[test]
    fn parses_with_surrounding_whitespace() {
        assert_eq!(accepted("  es-AR  ").as_bcp47(), "es-AR");
    }

    #[test]
    fn parses_pt_br() {
        let locale = accepted("pt-BR");
        assert_eq!(locale.language(), LangCode::Pt);
        assert_eq!(locale.region(), Some(RegionCode::Br));
    }

    // ── Parser: invalid inputs ─────────────────────────────────

    #[test]
    fn empty_string_errors_with_empty_variant() {
        assert_eq!(parse(""), Err(LocaleParseError::Empty));
    }

    #[test]
    fn whitespace_only_errors_with_empty_variant() {
        assert_eq!(parse("   "), Err(LocaleParseError::Empty));
    }

    #[test]
    fn unknown_language_errors() {
        assert_eq!(
            parse("xx"),
            Err(LocaleParseError::UnknownLanguage("xx".to_string()))
        );
    }

    #[test]
    fn unknown_region_for_known_language_errors() {
        assert_eq!(
            parse("es-XX"),
            Err(LocaleParseError::UnknownRegion(
                "es".to_string(),
                "XX".to_string()
            ))
        );
    }

    #[test]
    fn extra_subtags_errors_too_many() {
        assert_eq!(
            parse("es-AR-x"),
            Err(LocaleParseError::TooManySubtags("es-AR-x".to_string()))
        );
    }

    #[test]
    fn script_subtag_errors_too_many() {
        // `zh-Hant-CN` carries a script subtag between language and
        // region; the v1 parser rejects the three-subtag form.
        let outcome = parse("zh-Hant-CN");
        assert!(
            matches!(outcome, Err(LocaleParseError::TooManySubtags(_))),
            "expected TooManySubtags, got {outcome:?}"
        );
    }

    #[test]
    fn variant_subtag_errors_too_many() {
        let outcome = parse("de-DE-1996");
        assert!(
            matches!(outcome, Err(LocaleParseError::TooManySubtags(_))),
            "expected TooManySubtags, got {outcome:?}"
        );
    }

    #[test]
    fn m49_un_region_code_errors_unknown_region() {
        // `es-419` (UN M.49 for Latin America) is not in the v1 enum.
        assert_eq!(
            parse("es-419"),
            Err(LocaleParseError::UnknownRegion(
                "es".to_string(),
                "419".to_string()
            ))
        );
    }

    /// STT lang_hint trim.
    ///
    /// `Locale::language().as_str()` is documented as the path the
    /// SDK's `InboundTransformHandler` uses to turn a binding's
    /// BCP-47 tag (`es-AR`) into the ISO-639-1 prefix (`es`) that
    /// whisper's `set_language` accepts. Every supported pair must
    /// trim to its bare language code.
    #[test]
    fn lang_only_trim_drops_region_for_whisper_hint() {
        const CASES: &[(&str, &str)] = &[
            ("es-AR", "es"),
            ("es-MX", "es"),
            ("en-GB", "en"),
            ("en-US", "en"),
            ("pt-BR", "pt"),
            ("pt-PT", "pt"),
            ("zh-CN", "zh"),
            ("ja-JP", "ja"),
            // Lang-only inputs are already at the trim target.
            ("es", "es"),
            ("en", "en"),
        ];
        for &(input, expected_iso639_1) in CASES {
            let hint = accepted(input).language().as_str();
            assert_eq!(
                hint, expected_iso639_1,
                "BCP-47 {input} must trim to ISO-639-1 {expected_iso639_1}"
            );
        }
    }

    /// `iter_supported()` is the source of truth for
    /// `cargo run -p nexo-microapp-sdk --bin locale_dump`. It must
    /// yield the documented count (8 lang × (1 + 17 region) = 144)
    /// with no duplicates.
    #[test]
    fn iter_supported_yields_full_cross_product() {
        let tags: Vec<String> = Locale::iter_supported()
            .map(|locale| locale.as_bcp47().to_string())
            .collect();
        assert_eq!(
            tags.len(),
            8 * (1 + 17),
            "expected 8 langs × (lang-only + 17 regions) = 144"
        );

        // An ordered set checks uniqueness and gives sorted access
        // for the first/last spot-check in one pass.
        let ordered: std::collections::BTreeSet<&str> =
            tags.iter().map(String::as_str).collect();
        assert_eq!(ordered.len(), tags.len(), "no duplicates in iter_supported");
        assert_eq!(ordered.iter().next().copied(), Some("de"));
        assert_eq!(ordered.iter().next_back().copied(), Some("zh-US"));
    }
}
516
517/// Parser-side errors. Wrapped in [`thiserror::Error`] so they
518/// surface cleanly through the existing error envelopes
519/// (`ToolError::InvalidArguments`, daemon boot logs, admin RPC).
520#[derive(Debug, Clone, Error, PartialEq, Eq)]
521pub enum LocaleParseError {
522    /// Empty input string after trimming whitespace.
523    #[error("empty locale string")]
524    Empty,
525    /// Language subtag not in [`LangCode`]'s closed set.
526    #[error("unsupported language subtag `{0}`")]
527    UnknownLanguage(String),
528    /// Region subtag not in [`RegionCode`]'s closed set OR not
529    /// covered by the voice picker for the supplied language.
530    #[error("unsupported region subtag `{1}` for language `{0}`")]
531    UnknownRegion(String, String),
532    /// Locale string carries more than `language[-region]` —
533    /// script subtags (`zh-Hant`), variants (`de-DE-1996`), and
534    /// extension subtags are deferred to a follow-up.
535    #[error("unsupported subtag count: locale `{0}` has more than one region/script subtag")]
536    TooManySubtags(String),
537}
538
/// Voice id used when the caller supplies no locale at all.
///
/// Kept equal to the SDK's pre-Phase-89 default ("English neutral,
/// female").
pub const DEFAULT_VOICE_ID: &str = "en-US-AriaNeural";
542
543/// Phase 89 (relocated from `nexo-microapp-sdk::voice` in Phase
544/// 81.31 follow-up #2) — recommended Microsoft Edge neural voice
545/// id for a given locale. Lookup is a deterministic match on
546/// `(LangCode, Option<RegionCode>)`; returns
547/// [`DEFAULT_VOICE_ID`] when `locale` is `None`. Same data the
548/// microapp SDK's voice-mode runtime consumes; surfaced here so
549/// admin surfaces (wizard, PersonaEditor) can preview which voice
550/// a locale would use without pulling in the full SDK.
551pub fn default_voice_for_locale(locale: Option<&Locale>) -> &'static str {
552    let Some(loc) = locale else {
553        return DEFAULT_VOICE_ID;
554    };
555    match (loc.language(), loc.region()) {
556        // Spanish family.
557        (LangCode::Es, Some(RegionCode::Ar)) => "es-AR-ElenaNeural",
558        (LangCode::Es, Some(RegionCode::Mx)) => "es-MX-DaliaNeural",
559        (LangCode::Es, Some(RegionCode::Es)) => "es-ES-ElviraNeural",
560        (LangCode::Es, Some(RegionCode::Co)) => "es-CO-SalomeNeural",
561        (LangCode::Es, Some(RegionCode::Pe)) => "es-PE-CamilaNeural",
562        (LangCode::Es, Some(RegionCode::Cl)) => "es-CL-CatalinaNeural",
563        (LangCode::Es, Some(RegionCode::Us)) => "es-US-PalomaNeural",
564        (LangCode::Es, _) => "es-MX-DaliaNeural",
565        // English family.
566        (LangCode::En, Some(RegionCode::Us)) => "en-US-AriaNeural",
567        (LangCode::En, Some(RegionCode::Gb)) => "en-GB-SoniaNeural",
568        (LangCode::En, Some(RegionCode::Au)) => "en-AU-NatashaNeural",
569        (LangCode::En, Some(RegionCode::Ca)) => "en-CA-ClaraNeural",
570        (LangCode::En, _) => "en-US-AriaNeural",
571        // Portuguese.
572        (LangCode::Pt, Some(RegionCode::Br)) => "pt-BR-FranciscaNeural",
573        (LangCode::Pt, Some(RegionCode::Pt)) => "pt-PT-RaquelNeural",
574        (LangCode::Pt, _) => "pt-BR-FranciscaNeural",
575        // French.
576        (LangCode::Fr, Some(RegionCode::Fr)) => "fr-FR-DeniseNeural",
577        (LangCode::Fr, Some(RegionCode::Ca)) => "fr-CA-SylvieNeural",
578        (LangCode::Fr, _) => "fr-FR-DeniseNeural",
579        // Italian.
580        (LangCode::It, Some(RegionCode::It)) => "it-IT-ElsaNeural",
581        (LangCode::It, _) => "it-IT-ElsaNeural",
582        // German.
583        (LangCode::De, Some(RegionCode::De)) => "de-DE-KatjaNeural",
584        (LangCode::De, _) => "de-DE-KatjaNeural",
585        // Japanese.
586        (LangCode::Ja, Some(RegionCode::Jp)) => "ja-JP-NanamiNeural",
587        (LangCode::Ja, _) => "ja-JP-NanamiNeural",
588        // Chinese.
589        (LangCode::Zh, Some(RegionCode::Cn)) => "zh-CN-XiaoxiaoNeural",
590        (LangCode::Zh, _) => "zh-CN-XiaoxiaoNeural",
591    }
592}