Skip to main content

citum_engine/values/
mod.rs

1/*
2SPDX-License-Identifier: MIT OR Apache-2.0
3SPDX-FileCopyrightText: © 2023-2026 Bruce D'Arcus and Citum contributors
4*/
5
6//! Value extraction for template components.
7//!
8//! This module provides the logic to extract formatted values from references
9//! based on template component specifications.
10
11/// Contributor extraction and name-formatting helpers.
12pub mod contributor;
13/// Date extraction and date-formatting helpers.
14pub mod date;
15/// List-component value extraction helpers.
16pub mod list;
17/// Locator rendering logic.
18pub mod locator;
19/// Numeric variable extraction and page-range helpers.
20pub mod number;
21/// Shared helpers for collapsing consecutive numeric or ordinal numbering.
22pub mod range;
23/// Locale term resolution helpers.
24pub mod term;
25/// Title text-case transform functions.
26pub mod text_case;
27/// Title extraction and title-formatting helpers.
28pub mod title;
29/// Generic variable extraction helpers.
30pub mod variable;
31
32#[cfg(test)]
33#[allow(
34    clippy::unwrap_used,
35    clippy::expect_used,
36    clippy::panic,
37    clippy::indexing_slicing,
38    clippy::todo,
39    clippy::unimplemented,
40    clippy::unreachable,
41    clippy::get_unwrap,
42    reason = "Panicking is acceptable and often desired in tests."
43)]
44mod tests;
45
46use crate::reference::Reference;
47use citum_schema::locale::Locale;
48use citum_schema::options::{Config, bibliography::BibliographyConfig};
49use citum_schema::reference::types::Title;
50use citum_schema::template::{TemplateComponent, TitleType};
51
52pub use contributor::format_contributors_short;
53pub use date::int_to_letter;
54
/// Resolve the preferred transliteration from a map of transliterations.
///
/// Candidates are tried in priority order and the first hit wins:
/// 1. Preferred transliteration list: exact key match, in list order
/// 2. Preferred transliteration list: substring match, in list order
/// 3. Preferred script: exact key match
/// 4. Preferred script: substring match
///
/// When several keys contain the same tag or script, the value stored under
/// the lexicographically smallest key is chosen, so the result never depends
/// on `HashMap` iteration order (which differs between runs).
///
/// Returns `None` when no rule matches.
fn resolve_transliteration<'a>(
    transliterations: &'a std::collections::HashMap<String, String>,
    preferred_transliteration: Option<&[String]>,
    preferred_script: Option<&String>,
) -> Option<&'a str> {
    /// Value stored under the lexicographically smallest key containing `needle`.
    fn first_containing<'m>(
        map: &'m std::collections::HashMap<String, String>,
        needle: &str,
    ) -> Option<&'m str> {
        map.iter()
            .filter(|(key, _)| key.contains(needle))
            .min_by(|(left, _), (right, _)| left.cmp(right))
            .map(|(_, value)| value.as_str())
    }

    if let Some(tags) = preferred_transliteration {
        // 1. Priority list: exact match. Every tag is tried exactly before any
        //    tag is tried as a substring.
        if let Some(hit) = tags.iter().find_map(|tag| transliterations.get(tag)) {
            return Some(hit.as_str());
        }
        // 2. Priority list: substring match
        if let Some(hit) = tags
            .iter()
            .find_map(|tag| first_containing(transliterations, tag))
        {
            return Some(hit);
        }
    }

    if let Some(script) = preferred_script {
        // 3. Preferred script: exact match
        if let Some(hit) = transliterations.get(script) {
            return Some(hit.as_str());
        }
        // 4. Preferred script: substring match
        return first_containing(transliterations, script);
    }

    None
}
97
98fn resolve_translation<'a>(
99    translations: &'a std::collections::HashMap<citum_schema::reference::LangID, String>,
100    style_locale: &str,
101) -> Option<&'a str> {
102    translations
103        .get(style_locale)
104        .or_else(|| {
105            style_locale
106                .split(['-', '_'])
107                .next()
108                .and_then(|base| translations.get(base))
109        })
110        .map(String::as_str)
111}
112
113/// Resolve a multilingual string based on style configuration.
114///
115/// Applies BCP 47 fallback logic:
116/// 1. Exact tag match (e.g., "ja-Latn-hepburn")
117/// 2. Script prefix match (e.g., "ja-Latn")
118/// 3. Fallback to original field
119///
120/// # Arguments
121/// * `string` - The multilingual string to resolve
122/// * `mode` - The rendering mode from style config
123/// * `preferred_transliteration` - Optional ordered list of BCP 47 transliteration tags
124/// * `preferred_script` - Optional preferred script (e.g., "Latn")
125/// * `style_locale` - The style's locale for translation matching
126#[must_use]
127pub fn resolve_multilingual_string(
128    string: &citum_schema::reference::types::MultilingualString,
129    mode: Option<&citum_schema::options::MultilingualMode>,
130    preferred_transliteration: Option<&[String]>,
131    preferred_script: Option<&String>,
132    style_locale: &str,
133) -> String {
134    use citum_schema::options::MultilingualMode;
135    use citum_schema::reference::types::MultilingualString;
136
137    match string {
138        MultilingualString::Simple(s) => s.clone(),
139        MultilingualString::Complex(complex) => {
140            let mode = mode.unwrap_or(&MultilingualMode::Primary);
141
142            match mode {
143                MultilingualMode::Primary => complex.original.clone(),
144
145                MultilingualMode::Transliterated => {
146                    if let Some(trans) = resolve_transliteration(
147                        &complex.transliterations,
148                        preferred_transliteration,
149                        preferred_script,
150                    ) {
151                        return trans.to_string();
152                    }
153
154                    // Fallback: use any available transliteration, or original
155                    complex
156                        .transliterations
157                        .values()
158                        .next()
159                        .cloned()
160                        .unwrap_or_else(|| complex.original.clone())
161                }
162
163                MultilingualMode::Translated => {
164                    // Try to match style locale
165                    resolve_translation(&complex.translations, style_locale)
166                        .map(ToString::to_string)
167                        .unwrap_or_else(|| complex.original.clone())
168                }
169
170                MultilingualMode::Combined => {
171                    // Format: "transliterated [translated]" or fallback variants
172                    let trans = resolve_transliteration(
173                        &complex.transliterations,
174                        preferred_transliteration,
175                        preferred_script,
176                    );
177
178                    let translation = resolve_translation(&complex.translations, style_locale);
179
180                    match (trans, translation) {
181                        (Some(t), Some(tr)) => format!("{t} [{tr}]"),
182                        (Some(t), None) => t.to_string(),
183                        (None, Some(tr)) => format!("{} [{}]", complex.original, tr),
184                        (None, None) => complex.original.clone(),
185                    }
186                }
187
188                MultilingualMode::Pattern(segments) => resolve_multilingual_pattern(
189                    segments,
190                    &complex.original,
191                    &complex.transliterations,
192                    &complex.translations,
193                    preferred_transliteration,
194                    preferred_script,
195                    style_locale,
196                ),
197            }
198        }
199    }
200}
201
202/// Render a [`MultilingualMode::Pattern`] against a complex multilingual string.
203///
204/// Each segment is resolved to its text; segments that are empty or identical to
205/// the previous non-empty segment are skipped (dedup).  The surviving segments are
206/// joined by a single space.
207fn resolve_multilingual_pattern(
208    segments: &[citum_schema::options::MultilingualSegment],
209    original: &str,
210    transliterations: &std::collections::HashMap<String, String>,
211    translations: &std::collections::HashMap<citum_schema::reference::types::LangID, String>,
212    preferred_transliteration: Option<&[String]>,
213    preferred_script: Option<&String>,
214    style_locale: &str,
215) -> String {
216    use citum_schema::options::{MultilingualView, SegmentWrap};
217    let mut parts: Vec<String> = Vec::with_capacity(segments.len());
218    let mut last_text: Option<String> = None;
219
220    for seg in segments {
221        let text: Option<String> = match &seg.view {
222            MultilingualView::OriginalScript => Some(original.to_string()),
223            MultilingualView::Transliterated => resolve_transliteration(
224                transliterations,
225                preferred_transliteration,
226                preferred_script,
227            )
228            .map(ToString::to_string),
229            MultilingualView::Translated => {
230                resolve_translation(translations, style_locale).map(ToString::to_string)
231            }
232        };
233
234        let Some(text) = text else { continue };
235        if text.is_empty() {
236            continue;
237        }
238        // Skip duplicate: if this text is identical to the previous segment (e.g. when
239        // transliteration falls back to the same value as original).
240        if last_text.as_deref() == Some(text.as_str()) {
241            continue;
242        }
243
244        let wrapped = match &seg.wrap {
245            SegmentWrap::None => text.clone(),
246            other => other.apply(&text),
247        };
248        last_text = Some(text);
249        parts.push(wrapped);
250    }
251
252    parts.join(" ")
253}
254
255/// Resolve the effective language for one logical field scope on a reference.
256///
257/// This prefers an explicit `field_languages` entry, then a multilingual title
258/// language tag for the provided title value, and finally the reference-level
259/// language.
260#[must_use]
261pub fn effective_field_language(
262    reference: &Reference,
263    scope: &str,
264    title: Option<&Title>,
265) -> Option<String> {
266    reference
267        .field_languages()
268        .get(scope)
269        .map(ToString::to_string)
270        .or_else(|| match title {
271            Some(Title::Multilingual(multilingual)) => {
272                multilingual.lang.as_ref().map(ToString::to_string)
273            }
274            _ => None,
275        })
276        .or_else(|| reference.language().map(|lang| lang.to_string()))
277}
278
279/// Resolve the effective language for the primary title of a reference.
280#[must_use]
281pub fn effective_item_language(reference: &Reference) -> Option<String> {
282    effective_field_language(reference, "title", reference.title().as_ref())
283}
284
285/// Resolve the effective language for the specific template component being rendered.
286#[must_use]
287pub fn effective_component_language(
288    reference: &Reference,
289    component: &TemplateComponent,
290) -> Option<String> {
291    match component {
292        TemplateComponent::Title(title_component) => {
293            let title = match title_component.title {
294                TitleType::Primary => reference.title(),
295                TitleType::ContainerTitle => reference.container_title(),
296                TitleType::ParentMonograph => reference.container_title(),
297                TitleType::ParentSerial => reference.container_title(),
298                TitleType::CollectionTitle => reference.collection_title(),
299                _ => reference.title(),
300            };
301
302            let scope = match title_component.title {
303                TitleType::Primary => "title",
304                TitleType::ContainerTitle => "container-title",
305                TitleType::ParentMonograph => "parent-monograph.title",
306                TitleType::ParentSerial => "parent-serial.title",
307                TitleType::CollectionTitle => "collection-title",
308                _ => "title",
309            };
310
311            effective_field_language(reference, scope, title.as_ref())
312        }
313        _ => effective_item_language(reference),
314    }
315}
316
317/// Select a structured name from transliteration maps using priority-list then script-match rules.
318fn select_by_transliteration<'a>(
319    m: &'a citum_schema::reference::contributor::MultilingualName,
320    preferred_transliteration: Option<&[String]>,
321    preferred_script: Option<&String>,
322) -> &'a citum_schema::reference::contributor::StructuredName {
323    // 1. Priority list: exact match
324    if let Some(tags) = preferred_transliteration {
325        for tag in tags {
326            if let Some(name) = m.transliterations.get(tag) {
327                return name;
328            }
329        }
330        // 2. Priority list: substring match
331        for tag in tags {
332            if let Some((_, name)) = m
333                .transliterations
334                .iter()
335                .find(|(k, _)| k.contains(tag.as_str()))
336            {
337                return name;
338            }
339        }
340    }
341    // 3. Preferred script: exact match
342    if let Some(script) = preferred_script {
343        if let Some(name) = m.transliterations.get(script) {
344            return name;
345        }
346        // 4. Preferred script: substring match
347        if let Some((_, name)) = m
348            .transliterations
349            .iter()
350            .find(|(tag, _)| tag.contains(script))
351        {
352            return name;
353        }
354    }
355    // Fallback: any available transliteration before falling back to original
356    m.transliterations.values().next().unwrap_or(&m.original)
357}
358
359/// Render the original-script display form of a structured name.
360///
361/// CJK names display family-first with no separator (`华林甫`); other scripts
362/// display given-first with a space.
363fn original_script_display(name: &citum_schema::reference::contributor::StructuredName) -> String {
364    use unicode_script::{Script, UnicodeScript};
365
366    let family = name.family.to_string();
367    let given = name.given.to_string();
368    let is_cjk = family.chars().chain(given.chars()).any(|ch| {
369        matches!(
370            ch.script(),
371            Script::Han | Script::Hiragana | Script::Katakana | Script::Hangul
372        )
373    });
374    if is_cjk || family.is_empty() || given.is_empty() {
375        format!("{family}{given}")
376    } else {
377        format!("{given} {family}")
378    }
379}
380
381/// Resolve a multilingual contributor name based on style configuration.
382///
383/// Uses holistic name matching - selects the entire name variant (original/transliterated/translated)
384/// as a unit rather than mixing fields from different variants.
385///
386/// # Arguments
387/// * `contributor` - The contributor to resolve
388/// * `mode` - The rendering mode from style config
389/// * `preferred_transliteration` - Optional ordered list of BCP 47 transliteration tags
390/// * `preferred_script` - Optional preferred script (e.g., "Latn")
391/// * `style_locale` - The style's locale for translation matching
392#[must_use]
393pub fn resolve_multilingual_name(
394    contributor: &citum_schema::reference::contributor::Contributor,
395    mode: Option<&citum_schema::options::MultilingualMode>,
396    preferred_transliteration: Option<&[String]>,
397    preferred_script: Option<&String>,
398    style_locale: &str,
399) -> Vec<crate::reference::FlatName> {
400    use citum_schema::options::MultilingualMode;
401    use citum_schema::reference::contributor::Contributor;
402
403    match contributor {
404        // Simple and structured names have no multilingual data
405        Contributor::SimpleName(_) | Contributor::StructuredName(_) => contributor.to_names_vec(),
406
407        // Multilingual names: select variant holistically
408        Contributor::Multilingual(m) => {
409            let mode = mode.unwrap_or(&MultilingualMode::Primary);
410
411            let selected_name = match mode {
412                MultilingualMode::Primary => &m.original,
413                MultilingualMode::Transliterated => {
414                    select_by_transliteration(m, preferred_transliteration, preferred_script)
415                }
416                MultilingualMode::Translated => {
417                    m.translations.get(style_locale).unwrap_or(&m.original)
418                }
419                // Combined mode for names defaults to transliterated (parenthetical combo not common for names)
420                MultilingualMode::Combined => {
421                    select_by_transliteration(m, preferred_transliteration, preferred_script)
422                }
423                // Pattern mode for names: render the romanized view, carrying the
424                // original-script form along when the pattern requests it
425                // (e.g. "Hua Linfu 华林甫").
426                MultilingualMode::Pattern(_) => {
427                    select_by_transliteration(m, preferred_transliteration, preferred_script)
428                }
429            };
430
431            // When a name pattern includes an `original-script` view alongside
432            // the selected transliteration, carry the original-script display
433            // form (with the segment's wrap applied) so formatting can append
434            // it after the romanized name.
435            let original_script = match mode {
436                MultilingualMode::Pattern(segments) if selected_name != &m.original => segments
437                    .iter()
438                    .find(|segment| {
439                        segment.view == citum_schema::options::MultilingualView::OriginalScript
440                    })
441                    .map(|segment| segment.wrap.apply(&original_script_display(&m.original))),
442                _ => None,
443            };
444
445            // Convert selected name to FlatName
446            vec![crate::reference::FlatName {
447                given: Some(selected_name.given.to_string()),
448                family: Some(selected_name.family.to_string()),
449                suffix: selected_name.suffix.clone(),
450                dropping_particle: selected_name.dropping_particle.clone(),
451                non_dropping_particle: selected_name.non_dropping_particle.clone(),
452                literal: None,
453                short_name: None,
454                original_script,
455            }]
456        }
457
458        Contributor::ContributorList(l) => {
459            l.0.iter()
460                .flat_map(|c| {
461                    resolve_multilingual_name(
462                        c,
463                        mode,
464                        preferred_transliteration,
465                        preferred_script,
466                        style_locale,
467                    )
468                })
469                .collect()
470        }
471    }
472}
473
474/// Resolve the URL for a component based on its links configuration and the reference data.
475#[must_use]
476pub fn resolve_url(
477    links: &citum_schema::options::LinksConfig,
478    reference: &Reference,
479) -> Option<String> {
480    use citum_schema::options::LinkTarget;
481
482    let target = links.target.as_ref().unwrap_or(&LinkTarget::UrlOrDoi);
483
484    match target {
485        LinkTarget::Url => reference.url().map(|u| u.to_string()),
486        LinkTarget::Doi => reference.doi().map(|d| format!("https://doi.org/{d}")),
487        LinkTarget::UrlOrDoi => reference
488            .url()
489            .map(|u| u.to_string())
490            .or_else(|| reference.doi().map(|d| format!("https://doi.org/{d}"))),
491        LinkTarget::Pubmed => reference
492            .id()
493            .filter(|id| id.starts_with("pmid:"))
494            .map(|id| {
495                #[allow(clippy::string_slice, reason = "known ASCII prefix")]
496                let result = format!("https://pubmed.ncbi.nlm.nih.gov/{}/", &id[5..]);
497                result
498            }),
499        LinkTarget::Pmcid => reference
500            .id()
501            .filter(|id| id.starts_with("pmc:"))
502            .map(|id| {
503                #[allow(clippy::string_slice, reason = "known ASCII prefix")]
504                let result = format!("https://www.ncbi.nlm.nih.gov/pmc/articles/{}/", &id[4..]);
505                result
506            }),
507    }
508}
509
510/// Resolve the effective URL for a component, checking local links then falling back to global config.
511#[must_use]
512pub fn resolve_effective_url(
513    local_links: Option<&citum_schema::options::LinksConfig>,
514    global_links: Option<&citum_schema::options::LinksConfig>,
515    reference: &Reference,
516    component_anchor: citum_schema::options::LinkAnchor,
517) -> Option<String> {
518    use citum_schema::options::LinkAnchor;
519
520    // 1. Check local links first
521    if let Some(links) = local_links {
522        let anchor = links.anchor.as_ref().unwrap_or(&LinkAnchor::Component);
523        if matches!(anchor, LinkAnchor::Component) || *anchor == component_anchor {
524            return resolve_url(links, reference);
525        }
526    }
527
528    // 2. Fall back to global links if anchor matches this component type
529    if let Some(links) = global_links
530        && let Some(anchor) = &links.anchor
531        && *anchor == component_anchor
532    {
533        return resolve_url(links, reference);
534    }
535
536    None
537}
538
/// Processed values ready for rendering.
///
/// `T` is the type of the formatted value and defaults to `String`. The
/// derived `Default` yields an empty value with no prefix, suffix, URL, or
/// substituted key, and `pre_formatted` set to `false`.
#[derive(Debug, Clone, Default)]
pub struct ProcValues<T = String> {
    /// The primary formatted value.
    pub value: T,
    /// Optional prefix to prepend.
    pub prefix: Option<String>,
    /// Optional suffix to append.
    pub suffix: Option<String>,
    /// Optional URL for hyperlinking.
    pub url: Option<String>,
    /// Variable key that was substituted (e.g., "title:Primary" when title replaces author).
    /// Used to prevent duplicate rendering per CSL variable-once rule.
    pub substituted_key: Option<String>,
    /// Whether the value is already pre-formatted.
    pub pre_formatted: bool,
}
556
/// Processing hints computed before rendering a reference or citation item.
///
/// The derived `Default` leaves every flag `false`, every count `0`, the group
/// key empty, and every optional hint `None`.
#[derive(Debug, Clone, Default)]
pub struct ProcHints {
    /// Whether disambiguation is active (triggers year-suffix).
    pub disamb_condition: bool,
    /// Index in the disambiguation group (1-based).
    pub group_index: usize,
    /// Total size of the disambiguation group.
    pub group_length: usize,
    /// The grouping key used.
    pub group_key: String,
    /// Whether to expand given names for disambiguation.
    pub expand_given_names: bool,
    /// Whether to expand given names for primary author only.
    pub expand_given_names_primary_only: bool,
    /// Minimum number of names to show to resolve ambiguity (overrides et-al-use-first).
    pub min_names_to_show: Option<usize>,
    /// Citation number for numeric citation styles (1-based).
    pub citation_number: Option<usize>,
    /// Optional sub-label for compound numeric citation addressing (e.g., "a" in "1a").
    pub citation_sub_label: Option<String>,
    /// Citation position (first, subsequent, ibid, etc.).
    pub position: Option<citum_schema::citation::Position>,
    /// Explicit integral citation name-memory state for this rendered item.
    pub integral_name_state: Option<citum_schema::citation::IntegralNameState>,
    /// Explicit org-abbreviation state for this rendered item.
    /// Shares the `IntegralNameState` type with `integral_name_state`.
    pub org_abbreviation_state: Option<citum_schema::citation::IntegralNameState>,
    /// First note number in which this reference was cited (note styles only).
    /// Set for subsequent-position citations; `None` otherwise.
    pub first_reference_note_number: Option<u32>,
    /// When true, suppress a `disambiguate_only` title component.
    /// Set when `first_reference_note_number` is present — the note number
    /// already identifies the work; the disambiguating short title is redundant.
    pub suppress_disambiguation_title: bool,
}
592
/// Context for rendering (citation vs bibliography).
///
/// [`RenderContext::Citation`] is the default.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum RenderContext {
    /// Render values for citation output.
    #[default]
    Citation,
    /// Render values for bibliography output.
    Bibliography,
}
602
/// Options for rendering.
///
/// The configuration, locale, raw locator, and abbreviation map are borrowed
/// for `'a`; the remaining fields are owned, so cloning copies only those.
#[derive(Clone)]
pub struct RenderOptions<'a> {
    /// Effective configuration after style and default resolution.
    pub config: &'a Config,
    /// Effective bibliography-only configuration when rendering bibliography behavior.
    pub bibliography_config: Option<BibliographyConfig>,
    /// Locale used for term lookup and locale-sensitive formatting.
    pub locale: &'a Locale,
    /// Whether the current render target is a citation or bibliography.
    pub context: RenderContext,
    /// Citation mode for the current render operation.
    pub mode: citum_schema::citation::CitationMode,
    /// Whether to suppress the author name for this citation.
    /// Set from the citation-level `suppress_author` flag.
    pub suppress_author: bool,
    /// Optional raw citation locator for rendering via locator config.
    pub locator_raw: Option<&'a citum_schema::citation::CitationLocator>,
    /// Reference type for optional type-class gating in locator patterns.
    pub ref_type: Option<String>,
    /// Whether to output semantic markup (HTML spans, Djot attributes).
    pub show_semantics: bool,
    /// The current top-level template index, when propagating preview annotations.
    pub current_template_index: Option<usize>,
    /// Document-level abbreviation map for post-render substitution.
    pub abbreviation_map: Option<&'a crate::api::AbbreviationMap>,
}
630
/// Trait for extracting values from template components.
pub trait ComponentValues {
    /// Resolve the component into processed render values for one reference.
    ///
    /// `F` selects the output format; it is constrained to formats whose
    /// output type is `String`. `hints` carries the per-item processing hints
    /// and `options` the render-wide settings.
    ///
    /// Returns `None` when the component yields no value for this reference
    /// (presumably so callers can skip it — confirm against call sites).
    fn values<F: crate::render::format::OutputFormat<Output = String>>(
        &self,
        reference: &Reference,
        hints: &ProcHints,
        options: &RenderOptions<'_>,
    ) -> Option<ProcValues<F::Output>>;
}
641
642impl ComponentValues for TemplateComponent {
643    fn values<F: crate::render::format::OutputFormat<Output = String>>(
644        &self,
645        reference: &Reference,
646        hints: &ProcHints,
647        options: &RenderOptions<'_>,
648    ) -> Option<ProcValues<F::Output>> {
649        match self {
650            TemplateComponent::Contributor(c) => c.values::<F>(reference, hints, options),
651            TemplateComponent::Date(d) => d.values::<F>(reference, hints, options),
652            TemplateComponent::Title(t) => t.values::<F>(reference, hints, options),
653            TemplateComponent::Number(n) => n.values::<F>(reference, hints, options),
654            TemplateComponent::Variable(v) => v.values::<F>(reference, hints, options),
655            TemplateComponent::Group(l) => l.values::<F>(reference, hints, options),
656            TemplateComponent::Term(t) => t.values::<F>(reference, hints, options),
657            _ => None,
658        }
659    }
660}
661
662/// Check if periods should be stripped based on three-tier precedence.
663///
664/// Resolution order:
665/// 1. Component-level `strip_periods`
666/// 2. Global config `strip_periods`
667/// 3. Defaults to false
668#[must_use]
669pub fn should_strip_periods(
670    rendering: &citum_schema::template::Rendering,
671    options: &RenderOptions<'_>,
672) -> bool {
673    rendering
674        .strip_periods
675        .or(options.config.strip_periods)
676        .unwrap_or(false)
677}
678
/// Strip trailing periods from a string.
///
/// Every period at the very end of the string is removed (`"etc..."` becomes
/// `"etc"`, `"Ph.D."` becomes `"Ph.D"`). Periods elsewhere are preserved, so
/// `"U.S.A"` is returned unchanged.
#[must_use]
pub fn strip_trailing_periods(s: &str) -> String {
    let mut remaining = s;
    while let Some(shorter) = remaining.strip_suffix('.') {
        remaining = shorter;
    }
    remaining.to_owned()
}
687
688/// Apply abbreviation substitution if the map contains an entry for `value`.
689///
690/// Returns the abbreviation if found, otherwise returns the original value unchanged.
691#[must_use]
692pub fn apply_abbreviation(value: String, map: Option<&crate::api::AbbreviationMap>) -> String {
693    if let Some(abbr) = map.and_then(|m| m.0.get(&value)) {
694        return abbr.clone();
695    }
696    value
697}