// commonmeta 0.9.6
//
// Library for conversions to/from the Commonmeta scholarly metadata format.
// Documentation
//! Date and datetime utilities.
//!
//! EDTF level 0 (ISO 8601 subset used by DataCite and InvenioRDM) is the
//! primary parser. A manual fallback handles malformed dates from upstream
//! sources (e.g. single-digit months like "2023-8-25").
//!
//! Mirrors the Go `dateutils` package in front-matter/commonmeta.

use chrono::{NaiveDateTime, Timelike};
use edtf::level_0::Edtf;

// ── Building dates from parts ─────────────────────────────────────────────────

/// Assemble an ISO 8601 partial date from numeric components.
///
/// A `0` month produces a year-only string (the day is then ignored), a `0` day
/// produces `"YYYY-MM"`, and a `0` year produces the empty string.
///
/// `date_from_parts(2023, 8, 5)` → `"2023-08-05"`.
pub fn date_from_parts(year: i32, month: u32, day: u32) -> String {
    if year == 0 {
        return String::new();
    }

    // Start with the year and append finer components only while they are present.
    let mut iso = format!("{:04}", year);
    if month != 0 {
        iso.push_str(&format!("-{:02}", month));
        if day != 0 {
            iso.push_str(&format!("-{:02}", day));
        }
    }
    iso
}

/// Assemble an ISO 8601 partial date from textual components.
///
/// Each component is trimmed and parsed as a number; anything that fails to parse
/// (including the empty string) is treated as `0`, i.e. "omit this component".
/// Used for Crossref XML and RIS date fields.
pub fn date_from_str_parts(year: &str, month: &str, day: &str) -> String {
    let year_number = year.trim().parse::<i32>().unwrap_or_default();
    // Month and day share the same lenient "unparseable means absent" rule.
    let component = |text: &str| text.trim().parse::<u32>().unwrap_or_default();
    date_from_parts(year_number, component(month), component(day))
}

// ── Parsing ───────────────────────────────────────────────────────────────────

/// Reduce a date or datetime string to an ISO 8601 **date**
/// (`"YYYY"`, `"YYYY-MM"`, or `"YYYY-MM-DD"`), discarding any time component.
///
/// EDTF level 0 is tried first (covers DataCite and InvenioRDM); for an interval
/// the start date is used. Strings EDTF rejects go through a lenient manual split
/// that tolerates e.g. single-digit months.
/// Returns `""` on failure.
pub fn parse_date(s: &str) -> String {
    if s.is_empty() {
        return String::new();
    }

    // Primary path: whichever shape EDTF recognised, emit its (start) date.
    if let Ok(parsed) = Edtf::parse(s) {
        if let Some(plain) = parsed.as_date() {
            return date_from_parts(plain.year(), plain.month(), plain.day());
        }
        if let Some(stamp) = parsed.as_datetime() {
            let calendar = stamp.date();
            return date_from_parts(calendar.year(), calendar.month(), calendar.day());
        }
        if let Some((first, _)) = parsed.as_interval() {
            return date_from_parts(first.year(), first.month(), first.day());
        }
    }

    // Fallback path: keep only the text before the first 'T' or space, then split on '-'.
    let calendar_text = s
        .split(|c: char| c == 'T' || c == ' ')
        .next()
        .unwrap_or(s);
    let mut fields = calendar_text.splitn(3, '-').map(str::trim);

    // A missing, unparseable or zero year means there is no usable date at all.
    let year = match fields.next().and_then(|field| field.parse::<i32>().ok()) {
        None | Some(0) => return String::new(),
        Some(value) => value,
    };

    // Month and day are optional; anything unparseable counts as absent (0).
    let mut next_component = || {
        fields
            .next()
            .and_then(|field| field.parse::<u32>().ok())
            .unwrap_or(0)
    };
    let month = next_component();
    let day = next_component();

    date_from_parts(year, month, day)
}

/// Normalise a date or datetime string to an ISO 8601 datetime
/// (`"YYYY-MM-DDTHH:MM:SSZ"`), collapsing to a date-only string when the time is midnight.
/// Returns `""` on failure.
///
/// Sub-second fractions (e.g. from InvenioRDM timestamps) are stripped first, then
/// EDTF level 0 is tried, then a list of chrono formats, and finally `parse_date`.
///
/// NOTE(review): the EDTF branch appends `Z` to the parsed hour/minute/second with no
/// visible offset conversion — confirm how inputs with a non-UTC offset should be rendered.
pub fn parse_datetime(s: &str) -> String {
    if s.is_empty() {
        return String::new();
    }

    // EDTF does not accept fractional seconds, so remove them up front. If stripping
    // leaves nothing, keep working with the original text.
    let without_fraction = strip_milliseconds(s);
    let candidate: &str = if without_fraction.is_empty() {
        s
    } else {
        &without_fraction
    };

    // Primary path: EDTF level 0.
    if let Ok(parsed) = Edtf::parse(candidate) {
        if let Some(stamp) = parsed.as_datetime() {
            let (calendar, clock) = (stamp.date(), stamp.time());
            let at_midnight = clock.hour() == 0 && clock.minute() == 0 && clock.second() == 0;
            return if at_midnight {
                date_from_parts(calendar.year(), calendar.month(), calendar.day())
            } else {
                format!(
                    "{:04}-{:02}-{:02}T{:02}:{:02}:{:02}Z",
                    calendar.year(),
                    calendar.month(),
                    calendar.day(),
                    clock.hour(),
                    clock.minute(),
                    clock.second()
                )
            };
        }
        if let Some(plain) = parsed.as_date() {
            return date_from_parts(plain.year(), plain.month(), plain.day());
        }
    }

    // Fallback path: try each chrono layout in order and take the first that parses.
    const LAYOUTS: [&str; 4] = [
        "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%dT%H:%M:%S%.f",
        "%Y-%m-%d %H:%M:%S",
        "%Y%m%d%H%M%S",
    ];
    let zoneless = candidate.trim_end_matches('Z');
    let naive = LAYOUTS
        .iter()
        .find_map(|layout| NaiveDateTime::parse_from_str(zoneless, layout).ok());

    match naive {
        // Midnight collapses to a plain date.
        Some(moment) if moment.hour() == 0 && moment.minute() == 0 && moment.second() == 0 => {
            parse_date(candidate)
        }
        Some(moment) => moment.format("%Y-%m-%dT%H:%M:%SZ").to_string(),
        // Nothing matched: last resort is date-only extraction from the original input.
        None => parse_date(s),
    }
}

/// Zero-pad single-digit month/day components so the date is valid EDTF / ISO 8601.
/// `"2023-8-5"` → `"2023-08-05"`, `"2023-8-5T12:00:00"` → `"2023-08-05T12:00:00"`.
///
/// Everything from the first `T` onwards is carried over untouched. Date parts with
/// more than three `-`-separated fields are returned unchanged.
///
/// Intended as a pre-processing step for malformed upstream data before EDTF parsing.
pub fn normalize_date(date: &str) -> String {
    // Split into the calendar portion and the (possibly empty) `T…` tail.
    let boundary = date.find('T').unwrap_or(date.len());
    let (calendar, tail) = date.split_at(boundary);

    let fields: Vec<&str> = calendar.split('-').collect();
    let mut normalized = if fields.len() <= 3 {
        // The year is kept verbatim; month and day are left-padded with '0' to width 2.
        fields
            .iter()
            .enumerate()
            .map(|(index, field)| {
                if index == 0 {
                    field.to_string()
                } else {
                    format!("{:0>2}", field)
                }
            })
            .collect::<Vec<String>>()
            .join("-")
    } else {
        calendar.to_string()
    };

    normalized.push_str(tail);
    normalized
}

/// Coerce a date string from an upstream source into valid EDTF / ISO 8601.
///
/// The following interpretations are attempted in order; the first that succeeds wins:
///
/// 1. The input already parses as EDTF level 0. Dates are rendered from the parsed
///    value, datetimes are returned unchanged, and intervals yield their start date.
/// 2. The input parses as EDTF after zero-padding month/day with `normalize_date`
///    (for datetimes the padded text is returned).
/// 3. The input is a European `DD.MM.YYYY` date, rewritten as `"YYYY-MM-DD"`.
///
/// Returns `""` if the input cannot be recognized.
pub fn sanitize_date(s: &str) -> String {
    if s.is_empty() {
        return String::new();
    }

    // Steps 1 and 2 render a successful EDTF parse the same way; only the text differs.
    let render = |candidate: &str| -> Option<String> {
        let rendered = match Edtf::parse(candidate).ok()? {
            Edtf::Date(date) => date.to_string(),
            Edtf::DateTime(_) => candidate.to_string(),
            Edtf::Interval(first, _) => first.to_string(),
        };
        Some(rendered)
    };

    // 1. The input as given.
    if let Some(clean) = render(s) {
        return clean;
    }

    // 2. The input with single-digit month/day zero-padded.
    let zero_padded = normalize_date(s);
    if let Some(clean) = render(&zero_padded) {
        return clean;
    }

    // 3. European DD.MM.YYYY (e.g. "11.03.2016"): exactly ten bytes with dots at 2 and 5.
    let bytes = s.as_bytes();
    if bytes.len() == 10 && bytes[2] == b'.' && bytes[5] == b'.' {
        // An unparseable year makes this return "", which is also the overall failure value.
        date_from_str_parts(&s[6..10], &s[3..5], &s[0..2])
    } else {
        String::new()
    }
}

// ── Unix timestamps ───────────────────────────────────────────────────────────

/// Render a Unix timestamp (seconds) as an ISO 8601 date string (`"YYYY-MM-DD"`) in UTC.
///
/// A timestamp of `0` is treated as "no value" and yields `""`, as does a timestamp
/// chrono cannot represent.
pub fn unix_to_date(ts: i64) -> String {
    if ts == 0 {
        return String::new();
    }
    match chrono::DateTime::<chrono::Utc>::from_timestamp(ts, 0) {
        Some(moment) => moment.format("%Y-%m-%d").to_string(),
        None => String::new(),
    }
}

/// Render a Unix timestamp (seconds) as an ISO 8601 datetime string
/// (`"YYYY-MM-DDTHH:MM:SSZ"`) in UTC.
///
/// A timestamp of `0` is treated as "no value" and yields `""`, as does a timestamp
/// chrono cannot represent.
pub fn unix_to_datetime(ts: i64) -> String {
    if ts == 0 {
        return String::new();
    }
    match chrono::DateTime::<chrono::Utc>::from_timestamp(ts, 0) {
        Some(moment) => moment.format("%Y-%m-%dT%H:%M:%SZ").to_string(),
        None => String::new(),
    }
}

// ── Stripping / truncating ────────────────────────────────────────────────────

/// Remove sub-second fractions from an ISO 8601 datetime string.
///
/// Rules, applied in order:
///
/// 1. An empty input yields an empty string.
/// 2. If the string contains a midnight time (`T00:00:00`), everything from the first
///    `T` onwards is dropped, leaving only the date.
/// 3. The first `.` and the ASCII digits directly after it are removed. If nothing
///    follows the fraction, `Z` is appended.
/// 4. A `+00:00` offset is rewritten to `Z`. This also applies after a fraction was
///    removed in step 3, so the result is the same with or without sub-second digits.
///
/// `"2024-01-15T12:34:56.789Z"` → `"2024-01-15T12:34:56Z"`.
/// `"2024-01-15T12:34:56.789012+00:00"` → `"2024-01-15T12:34:56Z"`.
/// `"2024-01-15T00:00:00"` → `"2024-01-15"`.
pub fn strip_milliseconds(s: &str) -> String {
    if s.is_empty() {
        return String::new();
    }
    if s.contains("T00:00:00") {
        return s.split('T').next().unwrap_or(s).to_string();
    }

    // Step 3: cut out ".<digits>", keeping whatever follows (typically the timezone).
    let without_fraction = match s.find('.') {
        Some(dot) => {
            let after_dot = &s[dot + 1..];
            let digit_count = after_dot
                .find(|c: char| !c.is_ascii_digit())
                .unwrap_or(after_dot.len());
            let suffix = &after_dot[digit_count..];
            let suffix = if suffix.is_empty() { "Z" } else { suffix };
            format!("{}{}", &s[..dot], suffix)
        }
        None => s.to_string(),
    };

    // Step 4: runs on the fraction-free text so "…56.789+00:00" is normalised as well
    // (previously the fraction branch returned early and left "+00:00" in place).
    match without_fraction.rfind("+00:00") {
        Some(plus) => format!("{}Z", &without_fraction[..plus]),
        None => without_fraction,
    }
}

/// Return only the date portion of a datetime string (first 10 characters).
/// `"2024-01-15T12:34:56Z"` → `"2024-01-15"`. Pass-through for shorter strings.
///
/// The cut is made on a character boundary, so input containing multi-byte UTF-8
/// characters never panics. For ASCII input this is identical to taking the first
/// 10 bytes.
pub fn date_only(s: &str) -> String {
    // The byte offset of the 11th character is where the first 10 characters end.
    // Slicing at a fixed byte index (`s[..10]`) would panic if that index fell inside
    // a multi-byte character.
    match s.char_indices().nth(10) {
        Some((end, _)) => s[..end].to_string(),
        None => s.to_string(),
    }
}

// ── Tests ─────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `convert` maps every input in `cases` to its paired expected output.
    fn check(convert: fn(&str) -> String, cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(convert(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_date_from_parts() {
        let cases: [((i32, u32, u32), &str); 4] = [
            ((2023, 8, 5), "2023-08-05"),
            ((2023, 8, 0), "2023-08"),
            ((2023, 0, 0), "2023"),
            ((0, 0, 0), ""),
        ];
        for &((year, month, day), expected) in &cases {
            assert_eq!(date_from_parts(year, month, day), expected);
        }
    }

    #[test]
    fn test_date_from_str_parts() {
        let cases: [((&str, &str, &str), &str); 3] = [
            (("2023", "8", "5"), "2023-08-05"),
            (("2023", "8", ""), "2023-08"),
            (("2023", "", ""), "2023"),
        ];
        for &((year, month, day), expected) in &cases {
            assert_eq!(date_from_str_parts(year, month, day), expected);
        }
    }

    #[test]
    fn test_parse_date() {
        check(
            parse_date,
            &[
                // Well-formed EDTF as produced by DataCite / InvenioRDM.
                ("2023-08-25", "2023-08-25"),
                ("2023-08-25T00:00:00Z", "2023-08-25"),
                ("2023-08-25T12:30:00+05:00", "2023-08-25"),
                ("2023-08", "2023-08"),
                ("2023", "2023"),
                // Intervals collapse to their start date.
                ("2019-04-04/2021-06-06", "2019-04-04"),
                // Malformed input handled by the manual fallback.
                ("2023-8-25", "2023-08-25"),
                ("2023-08-25T00:00:00", "2023-08-25"),
                ("", ""),
            ],
        );
    }

    #[test]
    fn test_sanitize_date() {
        check(
            sanitize_date,
            &[
                ("2023-08-25", "2023-08-25"),
                ("2023-8-25", "2023-08-25"),
                ("1996-04-04T00:00:00", "1996-04-04T00:00:00"),
                ("11.03.2016", "2016-03-11"),
                ("2020/2020", "2020"),
                ("2023-01-01/2023-12-31", "2023-01-01"),
                ("", ""),
                ("not-a-date", ""),
            ],
        );
    }

    #[test]
    fn test_normalize_date() {
        check(
            normalize_date,
            &[
                ("2023-8-25", "2023-08-25"),
                ("2023-8-5T12:00:00", "2023-08-05T12:00:00"),
                ("2023-08-25", "2023-08-25"),
                ("2023", "2023"),
            ],
        );
    }

    #[test]
    fn test_parse_datetime() {
        check(
            parse_datetime,
            &[
                ("2023-08-25T12:30:00Z", "2023-08-25T12:30:00Z"),
                ("2023-08-25T00:00:00Z", "2023-08-25"),
                ("2023-08-25T00:00:00", "2023-08-25"),
                // Microsecond precision as emitted by InvenioRDM.
                ("2024-01-15T12:34:56.789012+00:00", "2024-01-15T12:34:56Z"),
            ],
        );
    }

    #[test]
    fn test_strip_milliseconds() {
        check(
            strip_milliseconds,
            &[
                ("2024-01-15T12:34:56.789Z", "2024-01-15T12:34:56Z"),
                ("2024-01-15T00:00:00", "2024-01-15"),
                ("2024-01-15T00:00:00.000Z", "2024-01-15"),
                ("2024-01-15T12:34:56+00:00", "2024-01-15T12:34:56Z"),
                ("", ""),
            ],
        );
    }

    #[test]
    fn test_unix_to_date() {
        for &(timestamp, expected) in &[(1711238400_i64, "2024-03-24"), (0, "")] {
            assert_eq!(unix_to_date(timestamp), expected);
        }
    }

    #[test]
    fn test_unix_to_datetime() {
        for &(timestamp, expected) in &[(1711238400_i64, "2024-03-24T00:00:00Z"), (0, "")] {
            assert_eq!(unix_to_datetime(timestamp), expected);
        }
    }
}