citum-engine 0.63.0

Citum citation and bibliography processor
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
/*
SPDX-License-Identifier: MIT OR Apache-2.0
SPDX-FileCopyrightText: © 2023-2026 Bruce D'Arcus and Citum contributors
*/

//! Value extraction for template components.
//!
//! This module provides the logic to extract formatted values from references
//! based on template component specifications.

/// Contributor extraction and name-formatting helpers.
pub mod contributor;
/// Date extraction and date-formatting helpers.
pub mod date;
/// List-component value extraction helpers.
pub mod list;
/// Locator rendering logic.
pub mod locator;
/// Numeric variable extraction and page-range helpers.
pub mod number;
/// Shared helpers for collapsing consecutive numeric or ordinal numbering.
pub mod range;
/// Locale term resolution helpers.
pub mod term;
/// Title text-case transform functions.
pub mod text_case;
/// Title extraction and title-formatting helpers.
pub mod title;
/// Generic variable extraction helpers.
pub mod variable;

#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::panic,
    clippy::indexing_slicing,
    clippy::todo,
    clippy::unimplemented,
    clippy::unreachable,
    clippy::get_unwrap,
    reason = "Panicking is acceptable and often desired in tests."
)]
mod tests;

use crate::reference::Reference;
use citum_schema::locale::Locale;
use citum_schema::options::{Config, bibliography::BibliographyConfig};
use citum_schema::reference::types::Title;
use citum_schema::template::{TemplateComponent, TitleType};

pub use contributor::format_contributors_short;
pub use date::int_to_letter;

/// Resolve the preferred transliteration from a map of transliterations.
///
/// Candidates are tried in priority order and the first hit wins:
/// 1. Preferred transliteration list: exact key match, in list order
/// 2. Preferred transliteration list: key contains the tag, in list order
/// 3. Preferred script: exact key match
/// 4. Preferred script: key contains the script
///
/// `HashMap` iteration order is unspecified, so when several keys contain the
/// same tag or script the lexicographically smallest key is chosen. This keeps
/// the result stable between runs.
///
/// Returns `None` when neither preference matches any key (including when both
/// preferences are `None`).
fn resolve_transliteration<'a>(
    transliterations: &'a std::collections::HashMap<String, String>,
    preferred_transliteration: Option<&[String]>,
    preferred_script: Option<&String>,
) -> Option<&'a str> {
    /// Value stored under the smallest key that contains `needle`.
    fn smallest_key_containing<'m>(
        map: &'m std::collections::HashMap<String, String>,
        needle: &str,
    ) -> Option<&'m str> {
        map.iter()
            .filter(|(key, _)| key.contains(needle))
            .min_by(|a, b| a.0.cmp(b.0))
            .map(|(_, value)| value.as_str())
    }

    if let Some(tags) = preferred_transliteration {
        // 1. Priority list: exact match. The whole list is checked for exact
        //    hits before any substring matching is attempted.
        if let Some(value) = tags.iter().find_map(|tag| transliterations.get(tag)) {
            return Some(value.as_str());
        }
        // 2. Priority list: substring match
        if let Some(value) = tags
            .iter()
            .find_map(|tag| smallest_key_containing(transliterations, tag.as_str()))
        {
            return Some(value);
        }
    }

    let script = preferred_script?;
    // 3. Preferred script: exact match, then 4. substring match
    transliterations
        .get(script)
        .map(String::as_str)
        .or_else(|| smallest_key_containing(transliterations, script.as_str()))
}

/// Look up the translation that matches the style locale.
///
/// The full `style_locale` is tried first. If it has no entry, the part of the
/// locale before the first `-` or `_` is tried instead.
/// Returns `None` when neither key is present.
fn resolve_translation<'a>(
    translations: &'a std::collections::HashMap<citum_schema::reference::LangID, String>,
    style_locale: &str,
) -> Option<&'a str> {
    if let Some(exact) = translations.get(style_locale) {
        return Some(exact.as_str());
    }

    // `split` always yields at least one piece, so this is the leading subtag.
    let base = style_locale.split(['-', '_']).next()?;
    translations.get(base).map(String::as_str)
}

/// Resolve a multilingual string based on style configuration.
///
/// Simple strings are returned unchanged. For complex strings the result
/// depends on `mode` (treated as `Primary` when `None`):
///
/// * `Primary` - the original text.
/// * `Transliterated` - the transliteration chosen by `resolve_transliteration`
///   (exact tag, then tag substring, then script). If no preference matches,
///   the transliteration stored under the smallest key is used, and only when
///   there are no transliterations at all does it fall back to the original.
/// * `Translated` - the translation chosen by `resolve_translation`, otherwise
///   the original.
/// * `Combined` - `"transliterated [translated]"`, degrading to whichever
///   parts are available, with the original standing in for a missing
///   transliteration.
/// * `Pattern` - rendered by `resolve_multilingual_pattern`.
///
/// # Arguments
/// * `string` - The multilingual string to resolve
/// * `mode` - The rendering mode from style config
/// * `preferred_transliteration` - Optional ordered list of BCP 47 transliteration tags
/// * `preferred_script` - Optional preferred script (e.g., "Latn")
/// * `style_locale` - The style's locale for translation matching
#[must_use]
pub fn resolve_multilingual_string(
    string: &citum_schema::reference::types::MultilingualString,
    mode: Option<&citum_schema::options::MultilingualMode>,
    preferred_transliteration: Option<&[String]>,
    preferred_script: Option<&String>,
    style_locale: &str,
) -> String {
    use citum_schema::options::MultilingualMode;
    use citum_schema::reference::types::MultilingualString;

    match string {
        MultilingualString::Simple(s) => s.clone(),
        MultilingualString::Complex(complex) => {
            match mode.unwrap_or(&MultilingualMode::Primary) {
                MultilingualMode::Primary => complex.original.clone(),

                MultilingualMode::Transliterated => resolve_transliteration(
                    &complex.transliterations,
                    preferred_transliteration,
                    preferred_script,
                )
                .or_else(|| {
                    // No preference matched: use any available transliteration.
                    // Pick the smallest key so the choice does not depend on
                    // the map's unspecified iteration order.
                    complex
                        .transliterations
                        .iter()
                        .min_by(|a, b| a.0.cmp(b.0))
                        .map(|(_, text)| text.as_str())
                })
                .map_or_else(|| complex.original.clone(), ToString::to_string),

                MultilingualMode::Translated => {
                    // Try to match style locale
                    resolve_translation(&complex.translations, style_locale)
                        .map_or_else(|| complex.original.clone(), ToString::to_string)
                }

                MultilingualMode::Combined => {
                    // Format: "transliterated [translated]" or fallback variants
                    let trans = resolve_transliteration(
                        &complex.transliterations,
                        preferred_transliteration,
                        preferred_script,
                    );

                    let translation = resolve_translation(&complex.translations, style_locale);

                    match (trans, translation) {
                        (Some(t), Some(tr)) => format!("{t} [{tr}]"),
                        (Some(t), None) => t.to_string(),
                        (None, Some(tr)) => format!("{} [{}]", complex.original, tr),
                        (None, None) => complex.original.clone(),
                    }
                }

                MultilingualMode::Pattern(segments) => resolve_multilingual_pattern(
                    segments,
                    &complex.original,
                    &complex.transliterations,
                    &complex.translations,
                    preferred_transliteration,
                    preferred_script,
                    style_locale,
                ),
            }
        }
    }
}

/// Render a `MultilingualMode::Pattern` against a complex multilingual string.
///
/// Every segment is resolved to the text of its view. A segment is dropped
/// when its view has no value, when the text is empty, or when the text equals
/// the text of the most recently kept segment. Each kept segment is wrapped
/// according to its `wrap` setting and the results are joined with one space.
fn resolve_multilingual_pattern(
    segments: &[citum_schema::options::MultilingualSegment],
    original: &str,
    transliterations: &std::collections::HashMap<String, String>,
    translations: &std::collections::HashMap<citum_schema::reference::types::LangID, String>,
    preferred_transliteration: Option<&[String]>,
    preferred_script: Option<&String>,
    style_locale: &str,
) -> String {
    use citum_schema::options::{MultilingualView, SegmentWrap};

    let mut rendered: Vec<String> = Vec::with_capacity(segments.len());
    let mut previous: Option<String> = None;

    for segment in segments {
        let resolved: Option<&str> = match &segment.view {
            MultilingualView::Original => Some(original),
            MultilingualView::Transliterated => resolve_transliteration(
                transliterations,
                preferred_transliteration,
                preferred_script,
            ),
            MultilingualView::Translated => resolve_translation(translations, style_locale),
        };

        // Nothing to show for this view.
        let text: String = match resolved {
            Some(found) if !found.is_empty() => found.to_string(),
            _ => continue,
        };

        // Same text as the segment kept just before this one: avoid repeating it.
        if previous.as_deref() == Some(text.as_str()) {
            continue;
        }

        let wrap = &segment.wrap;
        let piece = if matches!(wrap, SegmentWrap::None) {
            text.clone()
        } else {
            wrap.apply(&text)
        };

        rendered.push(piece);
        previous = Some(text);
    }

    rendered.join(" ")
}

/// Determine which language applies to one logical field scope of a reference.
///
/// Sources are consulted in this order, and the first one that yields a value
/// is returned:
/// 1. the reference's `field_languages` entry for `scope`,
/// 2. the `lang` of `title` when it is a multilingual title,
/// 3. the reference-level language.
#[must_use]
pub fn effective_field_language(
    reference: &Reference,
    scope: &str,
    title: Option<&Title>,
) -> Option<String> {
    let explicit = reference
        .field_languages()
        .get(scope)
        .map(ToString::to_string);
    if explicit.is_some() {
        return explicit;
    }

    if let Some(Title::Multilingual(multilingual)) = title {
        if let Some(lang) = multilingual.lang.as_ref() {
            return Some(lang.to_string());
        }
    }

    reference.language().map(|lang| lang.to_string())
}

/// Determine the language that applies to a reference's primary title.
///
/// Delegates to [`effective_field_language`] with the `"title"` scope and the
/// reference's own title.
#[must_use]
pub fn effective_item_language(reference: &Reference) -> Option<String> {
    let primary_title = reference.title();
    effective_field_language(reference, "title", primary_title.as_ref())
}

/// Determine the language that applies to the template component being rendered.
///
/// Title components are resolved against the scope and title value that match
/// their title type. Every other component uses the item language.
#[must_use]
pub fn effective_component_language(
    reference: &Reference,
    component: &TemplateComponent,
) -> Option<String> {
    let TemplateComponent::Title(title_component) = component else {
        return effective_item_language(reference);
    };

    // Scope key and title value are chosen together so they cannot drift apart.
    let (scope, title) = match title_component.title {
        TitleType::ParentMonograph => ("parent-monograph.title", reference.container_title()),
        TitleType::ParentSerial => ("parent-serial.title", reference.container_title()),
        // `Primary` and any other title type use the reference's own title.
        _ => ("title", reference.title()),
    };

    effective_field_language(reference, scope, title.as_ref())
}

/// Select a structured name from transliteration maps using priority-list then script-match rules.
///
/// Candidates are tried in this order and the first hit wins:
/// 1. Preferred transliteration list: exact key match, in list order
/// 2. Preferred transliteration list: key contains the tag, in list order
/// 3. Preferred script: exact key match
/// 4. Preferred script: key contains the script
/// 5. Any available transliteration
/// 6. The original name
///
/// Whenever more than one key qualifies (steps 2, 4 and 5) the smallest key is
/// chosen, so the selected name does not depend on map iteration order.
fn select_by_transliteration<'a>(
    m: &'a citum_schema::reference::contributor::MultilingualName,
    preferred_transliteration: Option<&[String]>,
    preferred_script: Option<&String>,
) -> &'a citum_schema::reference::contributor::StructuredName {
    let variants = &m.transliterations;

    // Name stored under the smallest key that contains `needle`.
    let smallest_containing = |needle: &str| {
        variants
            .iter()
            .filter(|(key, _)| key.contains(needle))
            .min_by(|a, b| a.0.cmp(b.0))
            .map(|(_, name)| name)
    };

    if let Some(tags) = preferred_transliteration {
        // 1. Priority list: exact match (whole list first)
        if let Some(name) = tags.iter().find_map(|tag| variants.get(tag)) {
            return name;
        }
        // 2. Priority list: substring match
        if let Some(name) = tags
            .iter()
            .find_map(|tag| smallest_containing(tag.as_str()))
        {
            return name;
        }
    }

    if let Some(script) = preferred_script {
        // 3. Preferred script: exact match, then 4. substring match
        if let Some(name) = variants
            .get(script)
            .or_else(|| smallest_containing(script.as_str()))
        {
            return name;
        }
    }

    // Fallback: any available transliteration before falling back to original
    variants
        .iter()
        .min_by(|a, b| a.0.cmp(b.0))
        .map_or(&m.original, |(_, name)| name)
}

/// Resolve a multilingual contributor name based on style configuration.
///
/// Uses holistic name matching - selects the entire name variant (original/transliterated/translated)
/// as a unit rather than mixing fields from different variants.
///
/// Mode handling for multilingual names (`mode` is treated as `Primary` when `None`):
/// * `Primary` - the original name.
/// * `Translated` - the translation for `style_locale`, then for the part of
///   the locale before the first `-` or `_` (the same fallback used for
///   multilingual strings), then the original name.
/// * `Transliterated`, `Combined`, `Pattern` - the variant chosen by
///   `select_by_transliteration`; names are rendered in a single script.
///
/// Simple and structured contributors carry no multilingual data and are
/// converted with `to_names_vec`. Contributor lists are resolved member by
/// member and flattened.
///
/// # Arguments
/// * `contributor` - The contributor to resolve
/// * `mode` - The rendering mode from style config
/// * `preferred_transliteration` - Optional ordered list of BCP 47 transliteration tags
/// * `preferred_script` - Optional preferred script (e.g., "Latn")
/// * `style_locale` - The style's locale for translation matching
#[must_use]
pub fn resolve_multilingual_name(
    contributor: &citum_schema::reference::contributor::Contributor,
    mode: Option<&citum_schema::options::MultilingualMode>,
    preferred_transliteration: Option<&[String]>,
    preferred_script: Option<&String>,
    style_locale: &str,
) -> Vec<crate::reference::FlatName> {
    use citum_schema::options::MultilingualMode;
    use citum_schema::reference::contributor::Contributor;

    match contributor {
        // Simple and structured names have no multilingual data
        Contributor::SimpleName(_) | Contributor::StructuredName(_) => contributor.to_names_vec(),

        // Multilingual names: select variant holistically
        Contributor::Multilingual(m) => {
            let mode = mode.unwrap_or(&MultilingualMode::Primary);

            let selected_name = match mode {
                MultilingualMode::Primary => &m.original,
                MultilingualMode::Translated => m
                    .translations
                    .get(style_locale)
                    .or_else(|| {
                        // Regional locale without its own entry: retry with
                        // the leading subtag, matching how multilingual
                        // strings resolve translations.
                        style_locale
                            .split(['-', '_'])
                            .next()
                            .and_then(|base| m.translations.get(base))
                    })
                    .unwrap_or(&m.original),
                // Combined and Pattern modes default to the transliterated
                // variant for names: parenthetical or multi-view name strings
                // are not idiomatic, so names are shown in a single script.
                MultilingualMode::Transliterated
                | MultilingualMode::Combined
                | MultilingualMode::Pattern(_) => {
                    select_by_transliteration(m, preferred_transliteration, preferred_script)
                }
            };

            // Convert selected name to FlatName
            vec![crate::reference::FlatName {
                given: Some(selected_name.given.to_string()),
                family: Some(selected_name.family.to_string()),
                suffix: selected_name.suffix.clone(),
                dropping_particle: selected_name.dropping_particle.clone(),
                non_dropping_particle: selected_name.non_dropping_particle.clone(),
                literal: None,
                short_name: None,
            }]
        }

        Contributor::ContributorList(l) => {
            l.0.iter()
                .flat_map(|c| {
                    resolve_multilingual_name(
                        c,
                        mode,
                        preferred_transliteration,
                        preferred_script,
                        style_locale,
                    )
                })
                .collect()
        }
    }
}

/// Resolve the URL for a component based on its links configuration and the reference data.
///
/// The link target defaults to `UrlOrDoi` when `links.target` is unset:
/// * `Url` - the reference URL.
/// * `Doi` - `https://doi.org/` followed by the reference DOI.
/// * `UrlOrDoi` - the reference URL, otherwise the DOI link.
/// * `Pubmed` - a PubMed link, only when the reference id starts with `pmid:`.
/// * `Pmcid` - a PMC article link, only when the reference id starts with `pmc:`.
///
/// Returns `None` when the data required by the target is missing.
#[must_use]
pub fn resolve_url(
    links: &citum_schema::options::LinksConfig,
    reference: &Reference,
) -> Option<String> {
    use citum_schema::options::LinkTarget;

    let target = links.target.as_ref().unwrap_or(&LinkTarget::UrlOrDoi);
    let doi_url = || reference.doi().map(|d| format!("https://doi.org/{d}"));

    match target {
        LinkTarget::Url => reference.url().map(|u| u.to_string()),
        LinkTarget::Doi => doi_url(),
        LinkTarget::UrlOrDoi => reference.url().map(|u| u.to_string()).or_else(doi_url),
        // `strip_prefix` both checks for the prefix and yields the remainder,
        // so no byte-index slicing is needed.
        LinkTarget::Pubmed => reference.id().and_then(|id| {
            id.strip_prefix("pmid:")
                .map(|pmid| format!("https://pubmed.ncbi.nlm.nih.gov/{pmid}/"))
        }),
        LinkTarget::Pmcid => reference.id().and_then(|id| {
            id.strip_prefix("pmc:")
                .map(|pmcid| format!("https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/"))
        }),
    }
}

/// Resolve the URL a component should link to, preferring component-local links.
///
/// Local links apply when their anchor is unset, is `Component`, or equals
/// `component_anchor`. In that case the local configuration decides the result
/// (even if it resolves to `None`). Otherwise global links are used, but only
/// when they declare an anchor equal to `component_anchor`.
#[must_use]
pub fn resolve_effective_url(
    local_links: Option<&citum_schema::options::LinksConfig>,
    global_links: Option<&citum_schema::options::LinksConfig>,
    reference: &Reference,
    component_anchor: citum_schema::options::LinkAnchor,
) -> Option<String> {
    use citum_schema::options::LinkAnchor;

    // 1. Component-local configuration takes precedence.
    if let Some(links) = local_links {
        let applies = match links.anchor.as_ref() {
            None | Some(LinkAnchor::Component) => true,
            Some(anchor) => *anchor == component_anchor,
        };
        if applies {
            return resolve_url(links, reference);
        }
    }

    // 2. Global configuration must explicitly target this component type.
    let links = global_links?;
    let anchor = links.anchor.as_ref()?;
    if *anchor == component_anchor {
        resolve_url(links, reference)
    } else {
        None
    }
}

/// Processed values ready for rendering.
///
/// `T` is the type of the formatted value; it defaults to `String`.
#[derive(Debug, Clone, Default)]
pub struct ProcValues<T = String> {
    /// The primary formatted value.
    pub value: T,
    /// Optional prefix to prepend.
    pub prefix: Option<String>,
    /// Optional suffix to append.
    pub suffix: Option<String>,
    /// Optional URL for hyperlinking.
    pub url: Option<String>,
    /// Variable key that was substituted (e.g., "title:Primary" when title replaces author).
    /// Used to prevent duplicate rendering per CSL variable-once rule.
    pub substituted_key: Option<String>,
    /// Whether the value is already pre-formatted.
    pub pre_formatted: bool,
}

/// Processing hints computed before rendering a reference or citation item.
///
/// The struct derives `Default`, so hints that are not relevant to a render
/// can be left at their default values.
#[derive(Debug, Clone, Default)]
pub struct ProcHints {
    /// Whether disambiguation is active (triggers year-suffix).
    pub disamb_condition: bool,
    /// Index in the disambiguation group (1-based).
    pub group_index: usize,
    /// Total size of the disambiguation group.
    pub group_length: usize,
    /// The grouping key used.
    pub group_key: String,
    /// Whether to expand given names for disambiguation.
    pub expand_given_names: bool,
    /// Whether to expand given names for primary author only.
    pub expand_given_names_primary_only: bool,
    /// Minimum number of names to show to resolve ambiguity (overrides et-al-use-first).
    pub min_names_to_show: Option<usize>,
    /// Citation number for numeric citation styles (1-based).
    pub citation_number: Option<usize>,
    /// Optional sub-label for compound numeric citation addressing (e.g., "a" in "1a").
    pub citation_sub_label: Option<String>,
    /// Citation position (first, subsequent, ibid, etc.).
    pub position: Option<citum_schema::citation::Position>,
    /// Explicit integral citation name-memory state for this rendered item.
    pub integral_name_state: Option<citum_schema::citation::IntegralNameState>,
    /// Explicit org-abbreviation state for this rendered item.
    /// Uses the same `IntegralNameState` type as `integral_name_state`.
    pub org_abbreviation_state: Option<citum_schema::citation::IntegralNameState>,
    /// First note number in which this reference was cited (note styles only).
    /// Set for subsequent-position citations; `None` otherwise.
    pub first_reference_note_number: Option<u32>,
    /// When true, suppress a `disambiguate_only` title component.
    /// Set when `first_reference_note_number` is present — the note number
    /// already identifies the work; the disambiguating short title is redundant.
    pub suppress_disambiguation_title: bool,
}

/// Context for rendering (citation vs bibliography).
///
/// `Citation` is the default variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum RenderContext {
    /// Render values for citation output.
    #[default]
    Citation,
    /// Render values for bibliography output.
    Bibliography,
}

/// Options for rendering.
///
/// The lifetime `'a` ties the options to the borrowed configuration, locale,
/// citation locator and abbreviation map; the remaining fields are owned.
#[derive(Clone)]
pub struct RenderOptions<'a> {
    /// Effective configuration after style and default resolution.
    pub config: &'a Config,
    /// Effective bibliography-only configuration when rendering bibliography behavior.
    pub bibliography_config: Option<BibliographyConfig>,
    /// Locale used for term lookup and locale-sensitive formatting.
    pub locale: &'a Locale,
    /// Whether the current render target is a citation or bibliography.
    pub context: RenderContext,
    /// Citation mode for the current render operation.
    pub mode: citum_schema::citation::CitationMode,
    /// Whether to suppress the author name for this citation.
    /// Set from the citation-level `suppress_author` flag.
    pub suppress_author: bool,
    /// Optional raw citation locator for rendering via locator config.
    pub locator_raw: Option<&'a citum_schema::citation::CitationLocator>,
    /// Reference type for optional type-class gating in locator patterns.
    pub ref_type: Option<String>,
    /// Whether to output semantic markup (HTML spans, Djot attributes).
    pub show_semantics: bool,
    /// The current top-level template index, when propagating preview annotations.
    pub current_template_index: Option<usize>,
    /// Document-level abbreviation map for post-render substitution.
    pub abbreviation_map: Option<&'a crate::api::AbbreviationMap>,
}

/// Trait for extracting values from template components.
pub trait ComponentValues {
    /// Resolve the component into processed render values for one reference.
    ///
    /// `F` selects the output format; its `Output` type is constrained to
    /// `String`.
    ///
    /// # Arguments
    /// * `reference` - The reference whose data is being rendered
    /// * `hints` - Processing hints computed before rendering
    /// * `options` - Rendering options (configuration, locale, context)
    ///
    /// # Returns
    /// `Some` with the processed values, or `None` when the component yields
    /// nothing. The `TemplateComponent` implementation in this module returns
    /// `None` for variants it does not handle; other implementations presumably
    /// return `None` when the reference has no data for the component (confirm
    /// against each implementation).
    fn values<F: crate::render::format::OutputFormat<Output = String>>(
        &self,
        reference: &Reference,
        hints: &ProcHints,
        options: &RenderOptions<'_>,
    ) -> Option<ProcValues<F::Output>>;
}

impl ComponentValues for TemplateComponent {
    /// Forward to the `values` implementation of the wrapped component.
    ///
    /// Variants without a dedicated arm produce `None`.
    fn values<F: crate::render::format::OutputFormat<Output = String>>(
        &self,
        reference: &Reference,
        hints: &ProcHints,
        options: &RenderOptions<'_>,
    ) -> Option<ProcValues<F::Output>> {
        match self {
            Self::Contributor(contributor) => contributor.values::<F>(reference, hints, options),
            Self::Date(date) => date.values::<F>(reference, hints, options),
            Self::Title(title) => title.values::<F>(reference, hints, options),
            Self::Number(number) => number.values::<F>(reference, hints, options),
            Self::Variable(variable) => variable.values::<F>(reference, hints, options),
            Self::Group(group) => group.values::<F>(reference, hints, options),
            Self::Term(term) => term.values::<F>(reference, hints, options),
            _ => None,
        }
    }
}

/// Decide whether periods should be stripped for a component.
///
/// Precedence, highest first:
/// 1. the component's own `strip_periods` setting,
/// 2. the global config's `strip_periods` setting,
/// 3. `false` when neither is set.
#[must_use]
pub fn should_strip_periods(
    rendering: &citum_schema::template::Rendering,
    options: &RenderOptions<'_>,
) -> bool {
    match (rendering.strip_periods, options.config.strip_periods) {
        (Some(component_level), _) => component_level,
        (None, Some(global)) => global,
        (None, None) => false,
    }
}

/// Remove every period at the end of a string.
///
/// All consecutive trailing periods are removed; periods elsewhere in the
/// string are kept. For example `"Ph.D."` becomes `"Ph.D"`, `"etc..."` becomes
/// `"etc"`, and `"a.b"` is returned unchanged.
#[must_use]
pub fn strip_trailing_periods(s: &str) -> String {
    let mut remaining = s;
    while let Some(shorter) = remaining.strip_suffix('.') {
        remaining = shorter;
    }
    remaining.to_owned()
}

/// Substitute `value` with its abbreviation when the map has an entry for it.
///
/// When `map` is `None` or has no entry keyed by `value`, the original value
/// is returned unchanged.
#[must_use]
pub fn apply_abbreviation(value: String, map: Option<&crate::api::AbbreviationMap>) -> String {
    let abbreviation = map.and_then(|m| m.0.get(&value)).cloned();
    abbreviation.unwrap_or(value)
}