Skip to main content

citum_schema_style/locale/
raw_conversion.rs

1/*
2SPDX-License-Identifier: MIT OR Apache-2.0
3SPDX-FileCopyrightText: © 2023-2026 Bruce D'Arcus and Citum contributors
4*/
5
6//! Conversion from [`raw::RawLocale`] (serde-facing schema) into the runtime
7//! [`Locale`] type, plus the file/YAML/JSON/CBOR loaders that drive it.
8//!
9//! Parsing helpers (`extract_*`, `parse_*`, `from_raw_gendered_string`) live
10//! alongside `from_raw` so a reader sees the whole transformation in one
11//! place. Public Locale APIs unrelated to raw conversion stay in `mod.rs`.
12
13use super::Locale;
14use super::message::Mf2MessageEvaluator;
15use super::raw;
16use super::types::{
17    ContributorTerm, DateTerms, LocaleOverride, LocatorTerm, MaybeGendered, MessageSyntax,
18    MonthNames, SimpleTerm, SingularPlural,
19};
20use crate::citation::LocatorType;
21use crate::template::ContributorRole;
22use std::collections::HashMap;
23use std::sync::Arc;
24
25impl Locale {
26    /// Load a locale from a YAML string.
27    ///
28    /// # Errors
29    ///
30    /// Returns an error when the YAML cannot be parsed into a locale.
31    pub fn from_yaml_str(yaml: &str) -> Result<Self, String> {
32        let raw: raw::RawLocale = serde_yaml::from_str(yaml)
33            .map_err(|e| format!("Failed to parse locale YAML: {}", e))?;
34
35        Ok(Self::from_raw(raw))
36    }
37
38    /// Load a locale by ID (e.g., "en-US", "de-DE") from a locales directory.
39    /// Falls back to en-US if the locale file is not found.
40    pub fn load(locale_id: &str, locales_dir: &std::path::Path) -> Self {
41        let extensions = ["yaml", "yml", "json", "cbor"];
42
43        for ext in &extensions {
44            let file_name = format!("{}.{}", locale_id, ext);
45            let file_path = locales_dir.join(&file_name);
46
47            if file_path.exists() {
48                match Self::from_file(&file_path) {
49                    Ok(locale) => return locale,
50                    Err(e) => {
51                        eprintln!(
52                            "Warning: Failed to load locale {}.{}: {}",
53                            locale_id, ext, e
54                        );
55                    }
56                }
57            }
58        }
59
60        if locale_id.contains('-') {
61            let base = locale_id.split('-').next().unwrap_or("en");
62            if let Ok(entries) = std::fs::read_dir(locales_dir) {
63                for entry in entries.flatten() {
64                    let name = entry.file_name();
65                    let name_str = name.to_string_lossy();
66                    if (name_str.starts_with(base)
67                        && extensions.iter().any(|ext| name_str.ends_with(ext)))
68                        && let Ok(locale) = Self::from_file(&entry.path())
69                    {
70                        return locale;
71                    }
72                }
73            }
74        }
75
76        Self::en_us()
77    }
78
79    /// Load locale from a file path directly (detects format).
80    ///
81    /// # Errors
82    ///
83    /// Returns an error when the file cannot be read or its contents cannot be
84    /// parsed as a supported locale format.
85    pub fn from_file(path: &std::path::Path) -> Result<Self, String> {
86        let bytes =
87            std::fs::read(path).map_err(|e| format!("Failed to read locale file: {}", e))?;
88        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("yaml");
89
90        match ext {
91            "cbor" => ciborium::de::from_reader::<raw::RawLocale, _>(std::io::Cursor::new(&bytes))
92                .map(Self::from_raw)
93                .map_err(|e| format!("Failed to parse CBOR locale: {}", e)),
94            "json" => serde_json::from_slice::<raw::RawLocale>(&bytes)
95                .map(Self::from_raw)
96                .map_err(|e| format!("Failed to parse JSON locale: {}", e)),
97            _ => {
98                let content = String::from_utf8_lossy(&bytes);
99                Self::from_yaml_str(&content)
100            }
101        }
102    }
103
    /// Convert a [`raw::RawLocale`] into a runtime [`Locale`].
    ///
    /// The result starts as [`Locale::en_us`] and is then overwritten or
    /// extended with whatever `raw` provides, so any field this function does
    /// not assign keeps its en-US value. Processing order:
    ///
    /// 1. Scalar/date fields, sort articles, schema metadata, messages.
    /// 2. Vocab, grammar options and number formats (only when present).
    /// 3. Explicit `locators`, then `terms` (which may also define locators
    ///    that were not given explicitly), then `roles`.
    /// 4. The message evaluator, chosen from `evaluation.message_syntax`.
    #[allow(
        clippy::too_many_lines,
        reason = "Complex parsing of raw locale data with multiple term types"
    )]
    fn from_raw(raw: raw::RawLocale) -> Self {
        // Heuristic default: every `en*` locale except `en-GB*` places
        // punctuation inside quotes. The explicit `en-US` test is already
        // covered by the `en` prefix test.
        let punctuation_in_quote = raw.locale.starts_with("en-US")
            || (raw.locale.starts_with("en") && !raw.locale.starts_with("en-GB"));

        // Begin from the built-in en-US locale and override from `raw`.
        let mut locale = Locale::en_us();
        locale.locale = raw.locale.clone();
        // Date terms are replaced wholesale rather than merged.
        locale.dates = DateTerms {
            months: MonthNames {
                long: raw.dates.months.long,
                short: raw.dates.months.short,
            },
            seasons: raw.dates.seasons,
            uncertainty_term: raw.dates.uncertainty_term,
            open_ended_term: raw.dates.open_ended_term,
            am: raw.dates.am,
            pm: raw.dates.pm,
            timezone_utc: raw.dates.timezone_utc,
            before_era: raw.dates.before_era,
            ad: raw.dates.ad,
            bc: raw.dates.bc,
            bce: raw.dates.bce,
            ce: raw.dates.ce,
        };
        locale.punctuation_in_quote = punctuation_in_quote;
        locale.sort_articles = Self::default_articles_for_locale(&raw.locale);

        locale.locale_schema_version = raw.locale_schema_version;
        locale.evaluation = raw.evaluation.unwrap_or_default();
        locale.messages = raw.messages;
        locale.date_formats = raw.date_formats;
        locale.legacy_term_aliases = raw.legacy_term_aliases;

        // Vocab entries are added to the en-US base via `extend`, not swapped in.
        if let Some(raw_vocab) = raw.vocab {
            locale.vocab.genre.extend(raw_vocab.genre);
            locale.vocab.medium.extend(raw_vocab.medium);
        }

        // An explicit grammar-options block wins outright; otherwise only the
        // heuristic punctuation flag is written into the existing options.
        if let Some(go) = raw.grammar_options {
            locale.grammar_options = go;
        } else {
            locale.grammar_options.punctuation_in_quote = locale.punctuation_in_quote;
        }
        // Keep the top-level flag in sync with whatever grammar options ended up with.
        locale.punctuation_in_quote = locale.grammar_options.punctuation_in_quote;

        if let Some(nf) = raw.number_formats {
            locale.number_formats = nf;
        }

        // Built-in locator types that are defined under `locators`; matching
        // keys under `terms` must not overwrite them in the loop below.
        let explicit_locator_keys: std::collections::HashSet<LocatorType> = raw
            .locators
            .keys()
            .filter_map(|key| Self::parse_builtin_locator_type(key))
            .collect();

        // Explicit locators: every key that `parse_locator_type` accepts is
        // inserted (this path does not filter out `LocatorType::Custom`).
        for (key, value) in &raw.locators {
            if let Some(locator_type) = Self::parse_locator_type(key) {
                let locator_term = LocatorTerm {
                    long: Self::extract_singular_plural(value.long.as_ref().as_ref()),
                    short: Self::extract_singular_plural(value.short.as_ref().as_ref()),
                    symbol: Self::extract_singular_plural(value.symbol.as_ref().as_ref()),
                    gender: value.gender.clone(),
                };
                locale.locators.insert(locator_type, locator_term);
            }
        }

        for (key, value) in &raw.terms {
            // A term whose key names a built-in locator, that was not given
            // explicitly under `locators`, and whose value is a `Forms` map is
            // stored as a locator term (without gender) and handled no further.
            if let Some(locator_type) = Self::parse_builtin_locator_type(key)
                && !explicit_locator_keys.contains(&locator_type)
                && let Some(forms) = Self::get_forms(value)
            {
                let locator_term = LocatorTerm {
                    long: Self::extract_singular_plural(forms.get("long").as_ref()),
                    short: Self::extract_singular_plural(forms.get("short").as_ref()),
                    symbol: Self::extract_singular_plural(forms.get("symbol").as_ref()),
                    gender: None,
                };
                locale.locators.insert(locator_type, locator_term);
                continue;
            }

            // Keys with dedicated fields on `locale.terms` are handled first;
            // everything else goes through `parse_general_term`.
            match key.as_str() {
                "and" => {
                    if let Some(forms) = Self::get_forms(value) {
                        if let Some(v) = forms.get("long").and_then(|v| v.as_string()) {
                            locale.terms.and = Some(v.to_string());
                        }
                        if let Some(v) = forms.get("symbol").and_then(|v| v.as_string()) {
                            locale.terms.and_symbol = Some(v.to_string());
                        }
                    }
                }
                "et_al" => {
                    if let Some(forms) = Self::get_forms(value)
                        && let Some(v) = forms.get("long").and_then(|v| v.as_string())
                    {
                        locale.terms.et_al = Some(v.to_string());
                    }
                }
                "and others" | "and_others" => {
                    if let Some(forms) = Self::get_forms(value)
                        && let Some(v) = forms.get("long").and_then(|v| v.as_string())
                    {
                        locale.terms.and_others = Some(v.to_string());
                    }
                }
                "accessed" => {
                    if let Some(forms) = Self::get_forms(value)
                        && let Some(v) = forms.get("long").and_then(|v| v.as_string())
                    {
                        locale.terms.accessed = Some(v.to_string());
                    }
                }
                "ibid" => {
                    if let Some(forms) = Self::get_forms(value)
                        && let Some(v) = forms.get("long").and_then(|v| v.as_string())
                    {
                        locale.terms.ibid = Some(v.to_string());
                    }
                }
                // "no date" always replaces the general-term entry but only
                // fills `terms.no_date` when that field is still `None`.
                "no date" => {
                    let simple = Self::extract_simple_term_from_raw(value);
                    let short_fallback = simple.short.as_default_str().to_string();
                    locale
                        .terms
                        .general
                        .insert(super::types::GeneralTerm::NoDate, simple);
                    locale.terms.no_date.get_or_insert(short_fallback);
                }
                // "no_date" is the mirror image: it always overwrites
                // `terms.no_date` but only fills the general-term entry when
                // none exists yet.
                // NOTE(review): this arm calls `as_str()` where the "no date"
                // arm calls `as_default_str()` — confirm both accessors exist
                // and that the difference is intentional.
                // NOTE(review): if both keys are present, the outcome depends
                // on the iteration order of `raw.terms` — confirm acceptable.
                "no_date" => {
                    let simple = Self::extract_simple_term_from_raw(value);
                    locale.terms.no_date = Some(simple.short.as_str().to_string());
                    locale
                        .terms
                        .general
                        .entry(super::types::GeneralTerm::NoDate)
                        .or_insert(simple);
                }
                _ => {
                    // Unrecognised keys are silently ignored.
                    if let Some(general_term) = Self::parse_general_term(key) {
                        let simple = Self::extract_simple_term_from_raw(value);
                        locale.terms.general.insert(general_term, simple);
                    }
                }
            }
        }

        // Roles: singular and plural are both derived from the same long/short
        // values; only the `plural` flag passed to the extractor differs.
        for (key, role_term) in &raw.roles {
            if let Some(role) = Self::parse_role_name(key) {
                let contributor_term = ContributorTerm {
                    singular: Self::extract_simple_term(&role_term.long, &role_term.short, false),
                    plural: Self::extract_simple_term(&role_term.long, &role_term.short, true),
                    verb: Self::extract_verb_term(&role_term.verb, &role_term.verb_short),
                };
                locale.roles.insert(role, contributor_term);
            }
        }

        // NOTE(review): both syntaxes currently construct `Mf2MessageEvaluator`;
        // confirm `Static` is meant to share the MF2 evaluator.
        locale.evaluator = match locale.evaluation.message_syntax {
            MessageSyntax::Mf2 => Arc::new(Mf2MessageEvaluator),
            MessageSyntax::Static => Arc::new(Mf2MessageEvaluator),
        };

        locale
    }
274
275    /// Get default articles for a locale based on language code.
276    fn default_articles_for_locale(locale_id: &str) -> Vec<String> {
277        #[allow(clippy::string_slice, reason = "locale_id is expected to be ASCII")]
278        let lang = &locale_id[..2.min(locale_id.len())];
279        match lang {
280            "en" => vec!["the".into(), "a".into(), "an".into()],
281            "de" => vec![
282                "der".into(),
283                "die".into(),
284                "das".into(),
285                "ein".into(),
286                "eine".into(),
287            ],
288            "fr" => vec![
289                "le".into(),
290                "la".into(),
291                "les".into(),
292                "l'".into(),
293                "un".into(),
294                "une".into(),
295            ],
296            "es" => vec![
297                "el".into(),
298                "la".into(),
299                "los".into(),
300                "las".into(),
301                "un".into(),
302                "una".into(),
303            ],
304            "it" => vec![
305                "il".into(),
306                "lo".into(),
307                "la".into(),
308                "i".into(),
309                "gli".into(),
310                "le".into(),
311                "un".into(),
312                "una".into(),
313            ],
314            "pt" => vec![
315                "o".into(),
316                "a".into(),
317                "os".into(),
318                "as".into(),
319                "um".into(),
320                "uma".into(),
321            ],
322            "nl" => vec!["de".into(), "het".into(), "een".into()],
323            _ => vec![],
324        }
325    }
326
327    fn get_forms(value: &raw::RawTermValue) -> Option<&HashMap<String, raw::RawTermValue>> {
328        match value {
329            raw::RawTermValue::Forms(forms) => Some(forms),
330            _ => None,
331        }
332    }
333
334    fn parse_locator_type(name: &str) -> Option<LocatorType> {
335        LocatorType::from_key(name).ok()
336    }
337
338    fn parse_builtin_locator_type(name: &str) -> Option<LocatorType> {
339        match Self::parse_locator_type(name)? {
340            LocatorType::Custom(_) => None,
341            locator => Some(locator),
342        }
343    }
344
345    fn parse_role_name(name: &str) -> Option<ContributorRole> {
346        match name {
347            "author" => Some(ContributorRole::Author),
348            "chair" => Some(ContributorRole::Chair),
349            "editor" => Some(ContributorRole::Editor),
350            "translator" => Some(ContributorRole::Translator),
351            "director" => Some(ContributorRole::Director),
352            "compiler" => Some(ContributorRole::Composer),
353            "illustrator" => Some(ContributorRole::Illustrator),
354            "collection-editor" => Some(ContributorRole::CollectionEditor),
355            "container-author" => Some(ContributorRole::ContainerAuthor),
356            "editorial-director" => Some(ContributorRole::EditorialDirector),
357            "textual-editor" | "textual_editor" => Some(ContributorRole::TextualEditor),
358            "interviewer" => Some(ContributorRole::Interviewer),
359            "original-author" => Some(ContributorRole::OriginalAuthor),
360            "recipient" => Some(ContributorRole::Recipient),
361            "reviewed-author" => Some(ContributorRole::ReviewedAuthor),
362            "composer" => Some(ContributorRole::Composer),
363            _ => None,
364        }
365    }
366
367    fn extract_singular_plural(value: Option<&&raw::RawTermValue>) -> Option<SingularPlural> {
368        match value {
369            Some(raw::RawTermValue::SingularPlural { singular, plural }) => Some(SingularPlural {
370                singular: Self::from_raw_gendered_string(singular),
371                plural: Self::from_raw_gendered_string(plural),
372            }),
373            Some(raw::RawTermValue::Simple(s)) => Some(SingularPlural {
374                singular: MaybeGendered::Plain(s.clone()),
375                plural: MaybeGendered::Plain(s.clone()),
376            }),
377            Some(raw::RawTermValue::Gendered {
378                masculine,
379                feminine,
380                neuter,
381                common,
382            }) => Some(SingularPlural {
383                singular: MaybeGendered::Gendered {
384                    masculine: masculine.clone(),
385                    feminine: feminine.clone(),
386                    neuter: neuter.clone(),
387                    common: common.clone(),
388                },
389                plural: MaybeGendered::Gendered {
390                    masculine: masculine.clone(),
391                    feminine: feminine.clone(),
392                    neuter: neuter.clone(),
393                    common: common.clone(),
394                },
395            }),
396            Some(raw::RawTermValue::Forms(forms)) => {
397                let singular = forms
398                    .get("singular")
399                    .map(Self::extract_maybe_gendered_string);
400                let plural = forms.get("plural").map(Self::extract_maybe_gendered_string);
401
402                singular.map(|s| SingularPlural {
403                    plural: plural.unwrap_or_else(|| s.clone()),
404                    singular: s,
405                })
406            }
407            _ => None,
408        }
409    }
410
411    fn extract_simple_term(
412        long: &Option<raw::RawTermValue>,
413        short: &Option<raw::RawTermValue>,
414        plural: bool,
415    ) -> SimpleTerm {
416        let long_str = long
417            .as_ref()
418            .map(|v| Self::extract_simple_gendered_term(v, plural))
419            .unwrap_or_default();
420
421        let short_str = short
422            .as_ref()
423            .map(|v| Self::extract_simple_gendered_term(v, plural))
424            .unwrap_or_default();
425
426        SimpleTerm {
427            long: long_str,
428            short: short_str,
429        }
430    }
431
432    fn extract_verb_term(
433        verb: &Option<raw::RawTermValue>,
434        verb_short: &Option<raw::RawTermValue>,
435    ) -> SimpleTerm {
436        let long_str = verb
437            .as_ref()
438            .and_then(|v| v.as_string())
439            .unwrap_or("")
440            .into();
441
442        let short_str = verb_short
443            .as_ref()
444            .and_then(|v| v.as_string())
445            .unwrap_or("")
446            .into();
447
448        SimpleTerm {
449            long: long_str,
450            short: short_str,
451        }
452    }
453
454    /// Normalize a locale term key to canonical kebab-case.
455    ///
456    /// Locale YAML files and style templates may use underscores or spaces
457    /// interchangeably with hyphens (e.g. `no_date`, `no date`, `no-date`).
458    /// This helper converts all three forms to the single canonical
459    /// kebab-case key so `parse_general_term` only needs to match one pattern
460    /// per term.
461    fn normalize_term_key(s: &str) -> String {
462        s.replace(['_', ' '], "-")
463    }
464
465    /// Parse a locale term key into a structured general-term identifier.
466    pub fn parse_general_term(name: &str) -> Option<super::types::GeneralTerm> {
467        use super::types::GeneralTerm;
468        match Self::normalize_term_key(name).as_str() {
469            "in" => Some(GeneralTerm::In),
470            "accessed" => Some(GeneralTerm::Accessed),
471            "retrieved" => Some(GeneralTerm::Retrieved),
472            "at" => Some(GeneralTerm::At),
473            "from" => Some(GeneralTerm::From),
474            "of" => Some(GeneralTerm::Of),
475            "to" => Some(GeneralTerm::To),
476            "by" => Some(GeneralTerm::By),
477            "no-date" => Some(GeneralTerm::NoDate),
478            "anonymous" => Some(GeneralTerm::Anonymous),
479            "circa" => Some(GeneralTerm::Circa),
480            "available-at" => Some(GeneralTerm::AvailableAt),
481            "ibid" => Some(GeneralTerm::Ibid),
482            "and" => Some(GeneralTerm::And),
483            "et-al" => Some(GeneralTerm::EtAl),
484            "and-others" => Some(GeneralTerm::AndOthers),
485            "forthcoming" => Some(GeneralTerm::Forthcoming),
486            "online" => Some(GeneralTerm::Online),
487            "here" => Some(GeneralTerm::Here),
488            "deposited" => Some(GeneralTerm::Deposited),
489            "review-of" => Some(GeneralTerm::ReviewOf),
490            "original-work-published" => Some(GeneralTerm::OriginalWorkPublished),
491            "personal-communication" => Some(GeneralTerm::PersonalCommunication),
492            "patent" => Some(GeneralTerm::Patent),
493            "volume" => Some(GeneralTerm::Volume),
494            "issue" => Some(GeneralTerm::Issue),
495            "page" => Some(GeneralTerm::Page),
496            "chapter" => Some(GeneralTerm::Chapter),
497            "edition" => Some(GeneralTerm::Edition),
498            "section" => Some(GeneralTerm::Section),
499            _ => None,
500        }
501    }
502
503    fn extract_simple_term_from_raw(value: &raw::RawTermValue) -> SimpleTerm {
504        match value {
505            raw::RawTermValue::Simple(s) => SimpleTerm {
506                long: s.clone().into(),
507                short: s.clone().into(),
508            },
509            raw::RawTermValue::Gendered {
510                masculine,
511                feminine,
512                neuter,
513                common,
514            } => SimpleTerm {
515                long: MaybeGendered::Gendered {
516                    masculine: masculine.clone(),
517                    feminine: feminine.clone(),
518                    neuter: neuter.clone(),
519                    common: common.clone(),
520                },
521                short: MaybeGendered::Gendered {
522                    masculine: masculine.clone(),
523                    feminine: feminine.clone(),
524                    neuter: neuter.clone(),
525                    common: common.clone(),
526                },
527            },
528            raw::RawTermValue::Forms(forms) => {
529                let long = forms
530                    .get("long")
531                    .map(Self::extract_maybe_gendered_string)
532                    .unwrap_or_default();
533                let short = forms
534                    .get("short")
535                    .map(Self::extract_maybe_gendered_string)
536                    .unwrap_or_else(|| long.clone());
537                SimpleTerm { long, short }
538            }
539            raw::RawTermValue::SingularPlural { singular, .. } => SimpleTerm {
540                long: Self::from_raw_gendered_string(singular),
541                short: Self::from_raw_gendered_string(singular),
542            },
543        }
544    }
545
546    fn from_raw_gendered_string(value: &raw::RawGenderedString) -> MaybeGendered<String> {
547        match value {
548            raw::RawGenderedString::Simple(value) => MaybeGendered::Plain(value.clone()),
549            raw::RawGenderedString::Gendered {
550                masculine,
551                feminine,
552                neuter,
553                common,
554            } => MaybeGendered::Gendered {
555                masculine: masculine.clone(),
556                feminine: feminine.clone(),
557                neuter: neuter.clone(),
558                common: common.clone(),
559            },
560        }
561    }
562
563    fn extract_maybe_gendered_string(value: &raw::RawTermValue) -> MaybeGendered<String> {
564        match value {
565            raw::RawTermValue::Simple(value) => MaybeGendered::Plain(value.clone()),
566            raw::RawTermValue::Gendered {
567                masculine,
568                feminine,
569                neuter,
570                common,
571            } => MaybeGendered::Gendered {
572                masculine: masculine.clone(),
573                feminine: feminine.clone(),
574                neuter: neuter.clone(),
575                common: common.clone(),
576            },
577            raw::RawTermValue::SingularPlural { singular, .. } => {
578                Self::from_raw_gendered_string(singular)
579            }
580            raw::RawTermValue::Forms(forms) => forms
581                .get("long")
582                .or_else(|| forms.get("singular"))
583                .map(Self::extract_maybe_gendered_string)
584                .unwrap_or_default(),
585        }
586    }
587
588    fn extract_simple_gendered_term(
589        value: &raw::RawTermValue,
590        plural: bool,
591    ) -> MaybeGendered<String> {
592        match value {
593            raw::RawTermValue::Simple(value) => MaybeGendered::Plain(value.clone()),
594            raw::RawTermValue::Gendered {
595                masculine,
596                feminine,
597                neuter,
598                common,
599            } => MaybeGendered::Gendered {
600                masculine: masculine.clone(),
601                feminine: feminine.clone(),
602                neuter: neuter.clone(),
603                common: common.clone(),
604            },
605            raw::RawTermValue::SingularPlural {
606                singular,
607                plural: plural_value,
608            } => {
609                if plural {
610                    Self::from_raw_gendered_string(plural_value)
611                } else {
612                    Self::from_raw_gendered_string(singular)
613                }
614            }
615            raw::RawTermValue::Forms(forms) => {
616                let key = if plural { "plural" } else { "singular" };
617                forms
618                    .get(key)
619                    .or_else(|| forms.get("long"))
620                    .map(Self::extract_maybe_gendered_string)
621                    .unwrap_or_default()
622            }
623        }
624    }
625
626    /// Apply a partial override, merging its fields into this locale.
627    ///
628    /// Performs key-by-key insertion or replacement for:
629    /// - `messages`: new or updated message IDs
630    /// - `grammar_options`: if `Some`, replaces the entire block and syncs
631    ///   `punctuation_in_quote` field
632    /// - `legacy_term_aliases`: new or updated term aliases
633    pub fn apply_override(&mut self, ov: &LocaleOverride) {
634        for (k, v) in &ov.messages {
635            self.messages.insert(k.clone(), v.clone());
636        }
637        if let Some(go) = &ov.grammar_options {
638            self.grammar_options = go.clone();
639            self.punctuation_in_quote = go.punctuation_in_quote;
640        }
641        for (k, v) in &ov.legacy_term_aliases {
642            self.legacy_term_aliases.insert(k.clone(), v.clone());
643        }
644    }
645}