Skip to main content

citum_engine/values/
mod.rs

1/*
2SPDX-License-Identifier: MIT OR Apache-2.0
3SPDX-FileCopyrightText: © 2023-2026 Bruce D'Arcus and Citum contributors
4*/
5
6//! Value extraction for template components.
7//!
8//! This module provides the logic to extract formatted values from references
9//! based on template component specifications.
10
11/// Contributor extraction and name-formatting helpers.
12pub mod contributor;
13/// Date extraction and date-formatting helpers.
14pub mod date;
15/// List-component value extraction helpers.
16pub mod list;
17/// Locator rendering logic.
18pub mod locator;
19/// Numeric variable extraction and page-range helpers.
20pub mod number;
21/// Shared helpers for collapsing consecutive numeric or ordinal numbering.
22pub mod range;
23/// Locale term resolution helpers.
24pub mod term;
25/// Title text-case transform functions.
26pub mod text_case;
27/// Title extraction and title-formatting helpers.
28pub mod title;
29/// Generic variable extraction helpers.
30pub mod variable;
31
32#[cfg(test)]
33#[allow(
34    clippy::unwrap_used,
35    clippy::expect_used,
36    clippy::panic,
37    clippy::indexing_slicing,
38    clippy::todo,
39    clippy::unimplemented,
40    clippy::unreachable,
41    clippy::get_unwrap,
42    reason = "Panicking is acceptable and often desired in tests."
43)]
44mod tests;
45
46use crate::reference::Reference;
47use citum_schema::locale::Locale;
48use citum_schema::options::{Config, bibliography::BibliographyConfig};
49use citum_schema::reference::types::Title;
50use citum_schema::template::{TemplateComponent, TitleType};
51
52pub use contributor::format_contributors_short;
53pub use date::int_to_letter;
54
/// Resolve preferred transliteration from a map of transliterations.
///
/// Candidates are tried in strict priority order and the first hit wins:
/// 1. Preferred transliteration list: exact key match, in list order
/// 2. Preferred transliteration list: substring match, in list order
/// 3. Preferred script: exact key match
/// 4. Preferred script: substring match
///
/// When several keys contain the same tag or script (for example both
/// `ja-Latn-hepburn` and `ja-Latn-kunrei` contain `Latn`), the
/// lexicographically smallest key is chosen. `HashMap` iteration order is
/// unspecified, so without this tie-break identical input could render
/// differently from one run to the next.
///
/// Returns `None` when nothing matches or when no preference is supplied.
fn resolve_transliteration<'a>(
    transliterations: &'a std::collections::HashMap<String, String>,
    preferred_transliteration: Option<&[String]>,
    preferred_script: Option<&String>,
) -> Option<&'a str> {
    /// Value stored under the smallest key that contains `needle`, if any.
    fn substring_match<'m>(
        map: &'m std::collections::HashMap<String, String>,
        needle: &str,
    ) -> Option<&'m str> {
        map.iter()
            .filter(|(key, _)| key.contains(needle))
            .min_by(|(left, _), (right, _)| left.cmp(right))
            .map(|(_, value)| value.as_str())
    }

    if let Some(tags) = preferred_transliteration {
        // 1. Priority list: exact match. Every tag gets an exact lookup before
        //    any substring matching is attempted.
        if let Some(hit) = tags.iter().find_map(|tag| transliterations.get(tag)) {
            return Some(hit.as_str());
        }
        // 2. Priority list: substring match
        if let Some(hit) = tags
            .iter()
            .find_map(|tag| substring_match(transliterations, tag))
        {
            return Some(hit);
        }
    }

    if let Some(script) = preferred_script {
        // 3. Preferred script: exact match
        if let Some(hit) = transliterations.get(script) {
            return Some(hit.as_str());
        }
        // 4. Preferred script: substring match
        if let Some(hit) = substring_match(transliterations, script) {
            return Some(hit);
        }
    }

    None
}
97
98fn resolve_translation<'a>(
99    translations: &'a std::collections::HashMap<citum_schema::reference::LangID, String>,
100    style_locale: &str,
101) -> Option<&'a str> {
102    translations
103        .get(style_locale)
104        .or_else(|| {
105            style_locale
106                .split(['-', '_'])
107                .next()
108                .and_then(|base| translations.get(base))
109        })
110        .map(String::as_str)
111}
112
113/// Resolve a multilingual string based on style configuration.
114///
115/// Applies BCP 47 fallback logic:
116/// 1. Exact tag match (e.g., "ja-Latn-hepburn")
117/// 2. Script prefix match (e.g., "ja-Latn")
118/// 3. Fallback to original field
119///
120/// # Arguments
121/// * `string` - The multilingual string to resolve
122/// * `mode` - The rendering mode from style config
123/// * `preferred_transliteration` - Optional ordered list of BCP 47 transliteration tags
124/// * `preferred_script` - Optional preferred script (e.g., "Latn")
125/// * `style_locale` - The style's locale for translation matching
126#[must_use]
127pub fn resolve_multilingual_string(
128    string: &citum_schema::reference::types::MultilingualString,
129    mode: Option<&citum_schema::options::MultilingualMode>,
130    preferred_transliteration: Option<&[String]>,
131    preferred_script: Option<&String>,
132    style_locale: &str,
133) -> String {
134    use citum_schema::options::MultilingualMode;
135    use citum_schema::reference::types::MultilingualString;
136
137    match string {
138        MultilingualString::Simple(s) => s.clone(),
139        MultilingualString::Complex(complex) => {
140            let mode = mode.unwrap_or(&MultilingualMode::Primary);
141
142            match mode {
143                MultilingualMode::Primary => complex.original.clone(),
144
145                MultilingualMode::Transliterated => {
146                    if let Some(trans) = resolve_transliteration(
147                        &complex.transliterations,
148                        preferred_transliteration,
149                        preferred_script,
150                    ) {
151                        return trans.to_string();
152                    }
153
154                    // Fallback: use any available transliteration, or original
155                    complex
156                        .transliterations
157                        .values()
158                        .next()
159                        .cloned()
160                        .unwrap_or_else(|| complex.original.clone())
161                }
162
163                MultilingualMode::Translated => {
164                    // Try to match style locale
165                    resolve_translation(&complex.translations, style_locale)
166                        .map(ToString::to_string)
167                        .unwrap_or_else(|| complex.original.clone())
168                }
169
170                MultilingualMode::Combined => {
171                    // Format: "transliterated [translated]" or fallback variants
172                    let trans = resolve_transliteration(
173                        &complex.transliterations,
174                        preferred_transliteration,
175                        preferred_script,
176                    );
177
178                    let translation = resolve_translation(&complex.translations, style_locale);
179
180                    match (trans, translation) {
181                        (Some(t), Some(tr)) => format!("{t} [{tr}]"),
182                        (Some(t), None) => t.to_string(),
183                        (None, Some(tr)) => format!("{} [{}]", complex.original, tr),
184                        (None, None) => complex.original.clone(),
185                    }
186                }
187
188                MultilingualMode::Pattern(segments) => resolve_multilingual_pattern(
189                    segments,
190                    &complex.original,
191                    &complex.transliterations,
192                    &complex.translations,
193                    preferred_transliteration,
194                    preferred_script,
195                    style_locale,
196                ),
197            }
198        }
199    }
200}
201
202/// Render a [`MultilingualMode::Pattern`] against a complex multilingual string.
203///
204/// Each segment is resolved to its text; segments that are empty or identical to
205/// the previous non-empty segment are skipped (dedup).  The surviving segments are
206/// joined by a single space.
207fn resolve_multilingual_pattern(
208    segments: &[citum_schema::options::MultilingualSegment],
209    original: &str,
210    transliterations: &std::collections::HashMap<String, String>,
211    translations: &std::collections::HashMap<citum_schema::reference::types::LangID, String>,
212    preferred_transliteration: Option<&[String]>,
213    preferred_script: Option<&String>,
214    style_locale: &str,
215) -> String {
216    use citum_schema::options::{MultilingualView, SegmentWrap};
217    let mut parts: Vec<String> = Vec::with_capacity(segments.len());
218    let mut last_text: Option<String> = None;
219
220    for seg in segments {
221        let text: Option<String> = match &seg.view {
222            MultilingualView::Original => Some(original.to_string()),
223            MultilingualView::Transliterated => resolve_transliteration(
224                transliterations,
225                preferred_transliteration,
226                preferred_script,
227            )
228            .map(ToString::to_string),
229            MultilingualView::Translated => {
230                resolve_translation(translations, style_locale).map(ToString::to_string)
231            }
232        };
233
234        let Some(text) = text else { continue };
235        if text.is_empty() {
236            continue;
237        }
238        // Skip duplicate: if this text is identical to the previous segment (e.g. when
239        // transliteration falls back to the same value as original).
240        if last_text.as_deref() == Some(text.as_str()) {
241            continue;
242        }
243
244        let wrapped = match &seg.wrap {
245            SegmentWrap::None => text.clone(),
246            other => other.apply(&text),
247        };
248        last_text = Some(text);
249        parts.push(wrapped);
250    }
251
252    parts.join(" ")
253}
254
255/// Resolve the effective language for one logical field scope on a reference.
256///
257/// This prefers an explicit `field_languages` entry, then a multilingual title
258/// language tag for the provided title value, and finally the reference-level
259/// language.
260#[must_use]
261pub fn effective_field_language(
262    reference: &Reference,
263    scope: &str,
264    title: Option<&Title>,
265) -> Option<String> {
266    reference
267        .field_languages()
268        .get(scope)
269        .map(ToString::to_string)
270        .or_else(|| match title {
271            Some(Title::Multilingual(multilingual)) => {
272                multilingual.lang.as_ref().map(ToString::to_string)
273            }
274            _ => None,
275        })
276        .or_else(|| reference.language().map(|lang| lang.to_string()))
277}
278
279/// Resolve the effective language for the primary title of a reference.
280#[must_use]
281pub fn effective_item_language(reference: &Reference) -> Option<String> {
282    effective_field_language(reference, "title", reference.title().as_ref())
283}
284
285/// Resolve the effective language for the specific template component being rendered.
286#[must_use]
287pub fn effective_component_language(
288    reference: &Reference,
289    component: &TemplateComponent,
290) -> Option<String> {
291    match component {
292        TemplateComponent::Title(title_component) => {
293            let title = match title_component.title {
294                TitleType::Primary => reference.title(),
295                TitleType::ParentMonograph => reference.container_title(),
296                TitleType::ParentSerial => reference.container_title(),
297                _ => reference.title(),
298            };
299
300            let scope = match title_component.title {
301                TitleType::Primary => "title",
302                TitleType::ParentMonograph => "parent-monograph.title",
303                TitleType::ParentSerial => "parent-serial.title",
304                _ => "title",
305            };
306
307            effective_field_language(reference, scope, title.as_ref())
308        }
309        _ => effective_item_language(reference),
310    }
311}
312
313/// Select a structured name from transliteration maps using priority-list then script-match rules.
314fn select_by_transliteration<'a>(
315    m: &'a citum_schema::reference::contributor::MultilingualName,
316    preferred_transliteration: Option<&[String]>,
317    preferred_script: Option<&String>,
318) -> &'a citum_schema::reference::contributor::StructuredName {
319    // 1. Priority list: exact match
320    if let Some(tags) = preferred_transliteration {
321        for tag in tags {
322            if let Some(name) = m.transliterations.get(tag) {
323                return name;
324            }
325        }
326        // 2. Priority list: substring match
327        for tag in tags {
328            if let Some((_, name)) = m
329                .transliterations
330                .iter()
331                .find(|(k, _)| k.contains(tag.as_str()))
332            {
333                return name;
334            }
335        }
336    }
337    // 3. Preferred script: exact match
338    if let Some(script) = preferred_script {
339        if let Some(name) = m.transliterations.get(script) {
340            return name;
341        }
342        // 4. Preferred script: substring match
343        if let Some((_, name)) = m
344            .transliterations
345            .iter()
346            .find(|(tag, _)| tag.contains(script))
347        {
348            return name;
349        }
350    }
351    // Fallback: any available transliteration before falling back to original
352    m.transliterations.values().next().unwrap_or(&m.original)
353}
354
355/// Render the original-script display form of a structured name.
356///
357/// CJK names display family-first with no separator (`华林甫`); other scripts
358/// display given-first with a space.
359fn original_script_display(name: &citum_schema::reference::contributor::StructuredName) -> String {
360    use unicode_script::{Script, UnicodeScript};
361
362    let family = name.family.to_string();
363    let given = name.given.to_string();
364    let is_cjk = family.chars().chain(given.chars()).any(|ch| {
365        matches!(
366            ch.script(),
367            Script::Han | Script::Hiragana | Script::Katakana | Script::Hangul
368        )
369    });
370    if is_cjk || family.is_empty() || given.is_empty() {
371        format!("{family}{given}")
372    } else {
373        format!("{given} {family}")
374    }
375}
376
377/// Resolve a multilingual contributor name based on style configuration.
378///
379/// Uses holistic name matching - selects the entire name variant (original/transliterated/translated)
380/// as a unit rather than mixing fields from different variants.
381///
382/// # Arguments
383/// * `contributor` - The contributor to resolve
384/// * `mode` - The rendering mode from style config
385/// * `preferred_transliteration` - Optional ordered list of BCP 47 transliteration tags
386/// * `preferred_script` - Optional preferred script (e.g., "Latn")
387/// * `style_locale` - The style's locale for translation matching
388#[must_use]
389pub fn resolve_multilingual_name(
390    contributor: &citum_schema::reference::contributor::Contributor,
391    mode: Option<&citum_schema::options::MultilingualMode>,
392    preferred_transliteration: Option<&[String]>,
393    preferred_script: Option<&String>,
394    style_locale: &str,
395) -> Vec<crate::reference::FlatName> {
396    use citum_schema::options::MultilingualMode;
397    use citum_schema::reference::contributor::Contributor;
398
399    match contributor {
400        // Simple and structured names have no multilingual data
401        Contributor::SimpleName(_) | Contributor::StructuredName(_) => contributor.to_names_vec(),
402
403        // Multilingual names: select variant holistically
404        Contributor::Multilingual(m) => {
405            let mode = mode.unwrap_or(&MultilingualMode::Primary);
406
407            let selected_name = match mode {
408                MultilingualMode::Primary => &m.original,
409                MultilingualMode::Transliterated => {
410                    select_by_transliteration(m, preferred_transliteration, preferred_script)
411                }
412                MultilingualMode::Translated => {
413                    m.translations.get(style_locale).unwrap_or(&m.original)
414                }
415                // Combined mode for names defaults to transliterated (parenthetical combo not common for names)
416                MultilingualMode::Combined => {
417                    select_by_transliteration(m, preferred_transliteration, preferred_script)
418                }
419                // Pattern mode for names: render the romanized view, carrying the
420                // original-script form along when the pattern requests it (CNE
421                // style: "Hua Linfu 华林甫").
422                MultilingualMode::Pattern(_) => {
423                    select_by_transliteration(m, preferred_transliteration, preferred_script)
424                }
425            };
426
427            // When a name pattern includes an `original` view alongside the
428            // selected transliteration, carry the original-script display form
429            // (with the segment's wrap applied) so formatting can append it
430            // after the romanized name.
431            let original_script = match mode {
432                MultilingualMode::Pattern(segments) if selected_name != &m.original => segments
433                    .iter()
434                    .find(|segment| {
435                        segment.view == citum_schema::options::MultilingualView::Original
436                    })
437                    .map(|segment| segment.wrap.apply(&original_script_display(&m.original))),
438                _ => None,
439            };
440
441            // Convert selected name to FlatName
442            vec![crate::reference::FlatName {
443                given: Some(selected_name.given.to_string()),
444                family: Some(selected_name.family.to_string()),
445                suffix: selected_name.suffix.clone(),
446                dropping_particle: selected_name.dropping_particle.clone(),
447                non_dropping_particle: selected_name.non_dropping_particle.clone(),
448                literal: None,
449                short_name: None,
450                original_script,
451            }]
452        }
453
454        Contributor::ContributorList(l) => {
455            l.0.iter()
456                .flat_map(|c| {
457                    resolve_multilingual_name(
458                        c,
459                        mode,
460                        preferred_transliteration,
461                        preferred_script,
462                        style_locale,
463                    )
464                })
465                .collect()
466        }
467    }
468}
469
470/// Resolve the URL for a component based on its links configuration and the reference data.
471#[must_use]
472pub fn resolve_url(
473    links: &citum_schema::options::LinksConfig,
474    reference: &Reference,
475) -> Option<String> {
476    use citum_schema::options::LinkTarget;
477
478    let target = links.target.as_ref().unwrap_or(&LinkTarget::UrlOrDoi);
479
480    match target {
481        LinkTarget::Url => reference.url().map(|u| u.to_string()),
482        LinkTarget::Doi => reference.doi().map(|d| format!("https://doi.org/{d}")),
483        LinkTarget::UrlOrDoi => reference
484            .url()
485            .map(|u| u.to_string())
486            .or_else(|| reference.doi().map(|d| format!("https://doi.org/{d}"))),
487        LinkTarget::Pubmed => reference
488            .id()
489            .filter(|id| id.starts_with("pmid:"))
490            .map(|id| {
491                #[allow(clippy::string_slice, reason = "known ASCII prefix")]
492                let result = format!("https://pubmed.ncbi.nlm.nih.gov/{}/", &id[5..]);
493                result
494            }),
495        LinkTarget::Pmcid => reference
496            .id()
497            .filter(|id| id.starts_with("pmc:"))
498            .map(|id| {
499                #[allow(clippy::string_slice, reason = "known ASCII prefix")]
500                let result = format!("https://www.ncbi.nlm.nih.gov/pmc/articles/{}/", &id[4..]);
501                result
502            }),
503    }
504}
505
506/// Resolve the effective URL for a component, checking local links then falling back to global config.
507#[must_use]
508pub fn resolve_effective_url(
509    local_links: Option<&citum_schema::options::LinksConfig>,
510    global_links: Option<&citum_schema::options::LinksConfig>,
511    reference: &Reference,
512    component_anchor: citum_schema::options::LinkAnchor,
513) -> Option<String> {
514    use citum_schema::options::LinkAnchor;
515
516    // 1. Check local links first
517    if let Some(links) = local_links {
518        let anchor = links.anchor.as_ref().unwrap_or(&LinkAnchor::Component);
519        if matches!(anchor, LinkAnchor::Component) || *anchor == component_anchor {
520            return resolve_url(links, reference);
521        }
522    }
523
524    // 2. Fall back to global links if anchor matches this component type
525    if let Some(links) = global_links
526        && let Some(anchor) = &links.anchor
527        && *anchor == component_anchor
528    {
529        return resolve_url(links, reference);
530    }
531
532    None
533}
534
/// Processed values ready for rendering.
///
/// Generic over the value type `T`, which defaults to `String`.
#[derive(Debug, Clone, Default)]
pub struct ProcValues<T = String> {
    /// The primary formatted value.
    pub value: T,
    /// Optional prefix to prepend.
    pub prefix: Option<String>,
    /// Optional suffix to append.
    pub suffix: Option<String>,
    /// Optional URL for hyperlinking.
    pub url: Option<String>,
    /// Variable key that was substituted (e.g., "title:Primary" when title replaces author).
    /// Used to prevent duplicate rendering per CSL variable-once rule.
    pub substituted_key: Option<String>,
    /// Whether the value is already pre-formatted.
    pub pre_formatted: bool,
}
552
/// Processing hints computed before rendering a reference or citation item.
///
/// Passed by reference to [`ComponentValues::values`] alongside the reference
/// being rendered.
#[derive(Debug, Clone, Default)]
pub struct ProcHints {
    /// Whether disambiguation is active (triggers year-suffix).
    pub disamb_condition: bool,
    /// Index in the disambiguation group (1-based).
    pub group_index: usize,
    /// Total size of the disambiguation group.
    pub group_length: usize,
    /// The grouping key used.
    pub group_key: String,
    /// Whether to expand given names for disambiguation.
    pub expand_given_names: bool,
    /// Whether to expand given names for primary author only.
    pub expand_given_names_primary_only: bool,
    /// Minimum number of names to show to resolve ambiguity (overrides et-al-use-first).
    pub min_names_to_show: Option<usize>,
    /// Citation number for numeric citation styles (1-based).
    pub citation_number: Option<usize>,
    /// Optional sub-label for compound numeric citation addressing (e.g., "a" in "1a").
    pub citation_sub_label: Option<String>,
    /// Citation position (first, subsequent, ibid, etc.).
    pub position: Option<citum_schema::citation::Position>,
    /// Explicit integral citation name-memory state for this rendered item.
    pub integral_name_state: Option<citum_schema::citation::IntegralNameState>,
    /// Explicit org-abbreviation state for this rendered item.
    pub org_abbreviation_state: Option<citum_schema::citation::IntegralNameState>,
    /// First note number in which this reference was cited (note styles only).
    /// Set for subsequent-position citations; `None` otherwise.
    pub first_reference_note_number: Option<u32>,
    /// When true, suppress a `disambiguate_only` title component.
    /// Set when `first_reference_note_number` is present — the note number
    /// already identifies the work; the disambiguating short title is redundant.
    pub suppress_disambiguation_title: bool,
}
588
/// Context for rendering (citation vs bibliography).
///
/// `Citation` is the `Default` variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum RenderContext {
    /// Render values for citation output.
    #[default]
    Citation,
    /// Render values for bibliography output.
    Bibliography,
}
598
/// Options for rendering.
///
/// Borrows the configuration, locale, locator and abbreviation map for the
/// lifetime `'a`; the remaining fields are owned.
#[derive(Clone)]
pub struct RenderOptions<'a> {
    /// Effective configuration after style and default resolution.
    pub config: &'a Config,
    /// Effective bibliography-only configuration when rendering bibliography behavior.
    pub bibliography_config: Option<BibliographyConfig>,
    /// Locale used for term lookup and locale-sensitive formatting.
    pub locale: &'a Locale,
    /// Whether the current render target is a citation or bibliography.
    pub context: RenderContext,
    /// Citation mode for the current render operation.
    pub mode: citum_schema::citation::CitationMode,
    /// Whether to suppress the author name for this citation.
    /// Set from the citation-level `suppress_author` flag.
    pub suppress_author: bool,
    /// Optional raw citation locator for rendering via locator config.
    pub locator_raw: Option<&'a citum_schema::citation::CitationLocator>,
    /// Reference type for optional type-class gating in locator patterns.
    pub ref_type: Option<String>,
    /// Whether to output semantic markup (HTML spans, Djot attributes).
    pub show_semantics: bool,
    /// The current top-level template index, when propagating preview annotations.
    pub current_template_index: Option<usize>,
    /// Document-level abbreviation map for post-render substitution.
    pub abbreviation_map: Option<&'a crate::api::AbbreviationMap>,
}
626
/// Trait for extracting values from template components.
pub trait ComponentValues {
    /// Resolve the component into processed render values for one reference.
    ///
    /// `F` is the output format; it is restricted to formats whose `Output`
    /// is `String`. A `None` return means the component produced no values
    /// for this reference.
    fn values<F: crate::render::format::OutputFormat<Output = String>>(
        &self,
        reference: &Reference,
        hints: &ProcHints,
        options: &RenderOptions<'_>,
    ) -> Option<ProcValues<F::Output>>;
}
637
638impl ComponentValues for TemplateComponent {
639    fn values<F: crate::render::format::OutputFormat<Output = String>>(
640        &self,
641        reference: &Reference,
642        hints: &ProcHints,
643        options: &RenderOptions<'_>,
644    ) -> Option<ProcValues<F::Output>> {
645        match self {
646            TemplateComponent::Contributor(c) => c.values::<F>(reference, hints, options),
647            TemplateComponent::Date(d) => d.values::<F>(reference, hints, options),
648            TemplateComponent::Title(t) => t.values::<F>(reference, hints, options),
649            TemplateComponent::Number(n) => n.values::<F>(reference, hints, options),
650            TemplateComponent::Variable(v) => v.values::<F>(reference, hints, options),
651            TemplateComponent::Group(l) => l.values::<F>(reference, hints, options),
652            TemplateComponent::Term(t) => t.values::<F>(reference, hints, options),
653            _ => None,
654        }
655    }
656}
657
658/// Check if periods should be stripped based on three-tier precedence.
659///
660/// Resolution order:
661/// 1. Component-level `strip_periods`
662/// 2. Global config `strip_periods`
663/// 3. Defaults to false
664#[must_use]
665pub fn should_strip_periods(
666    rendering: &citum_schema::template::Rendering,
667    options: &RenderOptions<'_>,
668) -> bool {
669    rendering
670        .strip_periods
671        .or(options.config.strip_periods)
672        .unwrap_or(false)
673}
674
/// Strip trailing periods from a string.
///
/// Every period at the very end of the string is removed; periods elsewhere
/// are kept. For example `"Ph.D."` becomes `"Ph.D"`, `"etc..."` becomes
/// `"etc"`, and `"a.b"` is returned unchanged.
#[must_use]
pub fn strip_trailing_periods(s: &str) -> String {
    let mut remaining = s;
    while let Some(shorter) = remaining.strip_suffix('.') {
        remaining = shorter;
    }
    remaining.to_owned()
}
683
684/// Apply abbreviation substitution if the map contains an entry for `value`.
685///
686/// Returns the abbreviation if found, otherwise returns the original value unchanged.
687#[must_use]
688pub fn apply_abbreviation(value: String, map: Option<&crate::api::AbbreviationMap>) -> String {
689    if let Some(abbr) = map.and_then(|m| m.0.get(&value)) {
690        return abbr.clone();
691    }
692    value
693}