Skip to main content

zer_compare/similarity/
date.rs

1use zer_core::{record::FieldValue, schema::FieldKind};
2
3use crate::similarity::SimilarityFn;
4
/// Similarity function for Date and Timestamp fields.
///
/// Inputs are text values holding either an ISO-8601 date (`YYYY-MM-DD`,
/// optionally followed by a `T…` time component, which is ignored) or a Unix
/// timestamp in integer seconds. Two values are scored by calendar distance;
/// the rules are checked top to bottom and the first match wins:
///
/// | score | condition                                               |
/// |-------|---------------------------------------------------------|
/// | 1.0   | identical strings, or both parse to the same date       |
/// | 0.9   | off by one day (transposition / single-digit error)     |
/// | 0.75  | same year and month, different day                      |
/// | 0.5   | same year, different month                              |
/// | 0.3   | year ± 1 (age-compatible / estimated DOB range)         |
/// | 0.0   | anything else, including unparseable or non-text values |
///
/// The type carries no state, so it is cheap to copy and can be created with
/// `DateSimilarity` or `DateSimilarity::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct DateSimilarity;
16
/// Parse a date string into a `(year, month, day)` triple.
///
/// Two formats are recognised, tried in this order:
///
/// 1. ISO-8601 `YYYY-MM-DD`, optionally followed by `T` and a time component
///    (the time is discarded). The month must be in `1..=12` and the day in
///    `1..=31`; the day is *not* checked against the length of the month.
/// 2. Unix timestamp: a signed integer number of seconds since
///    1970-01-01T00:00:00 UTC, converted to the exact UTC calendar date.
///
/// Leading and trailing whitespace is ignored. Returns `None` when neither
/// format matches, or when a timestamp lies so far from the epoch that its
/// year does not fit in an `i32`.
fn parse_date(s: &str) -> Option<(i32, u32, u32)> {
    let s = s.trim();

    // ISO-8601: only the text before an optional `T` carries the date.
    let date_part = s.split('T').next().unwrap_or(s);
    let mut fields = date_part.split('-');
    if let (Some(y), Some(m), Some(d)) = (fields.next(), fields.next(), fields.next()) {
        if let (Ok(y), Ok(m), Ok(d)) = (y.parse::<i32>(), m.parse::<u32>(), d.parse::<u32>()) {
            if (1..=12).contains(&m) && (1..=31).contains(&d) {
                return Some((y, m, d));
            }
        }
    }

    // Unix timestamp (integer seconds). Euclidean division floors towards
    // negative infinity, so instants before 1970 land on the correct day
    // instead of being pulled forward to the epoch.
    let ts = s.parse::<i64>().ok()?;
    civil_from_days(ts.div_euclid(86_400))
}

/// Convert a day count relative to 1970-01-01 into a proleptic Gregorian
/// `(year, month, day)`; `None` if the year does not fit in an `i32`.
fn civil_from_days(days: i64) -> Option<(i32, u32, u32)> {
    // Re-base onto 0000-03-01 so that each "year" starts in March and the
    // leap day, when present, is the last day of that shifted year.
    let z = days.checked_add(719_468)?;
    let era = z.div_euclid(146_097); // 400-year Gregorian cycle index
    let doe = z.rem_euclid(146_097); // day of era, 0..=146_096
    let yoe = (doe - doe / 1_460 + doe / 36_524 - doe / 146_096) / 365; // 0..=399
    let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); // day of shifted year, 0..=365
    let mp = (5 * doy + 2) / 153; // shifted month, 0 = March .. 11 = February
    let day = doy - (153 * mp + 2) / 5 + 1; // 1..=31
    let month = if mp < 10 { mp + 3 } else { mp - 9 }; // 1..=12
    // January and February belong to the following civil year.
    let year = yoe + era * 400 + i64::from(month <= 2);
    if year < i64::from(i32::MIN) || year > i64::from(i32::MAX) {
        return None;
    }
    // `month` and `day` are within the small positive ranges noted above,
    // so these narrowing casts cannot truncate.
    Some((year as i32, month as u32, day as u32))
}
43
/// Convert a proleptic Gregorian calendar date to its Julian Day Number.
///
/// The arithmetic is carried out in `i64` with floor division, so no `i32`
/// year can overflow an intermediate term and years before -4800 are handled
/// correctly. A result outside the `i32` range is saturated to
/// `i32::MIN` / `i32::MAX` rather than wrapping or panicking.
fn to_julian(y: i32, m: u32, d: u32) -> i32 {
    let (y, m, d) = (i64::from(y), i64::from(m), i64::from(d));
    // `a` is 1 for January/February: they are counted as the last two months
    // of the previous year so that the leap day falls at the end of the year.
    let a = (14 - m) / 12;
    let y2 = y + 4800 - a;
    let m2 = m + 12 * a - 3;
    let jdn = d + (153 * m2 + 2) / 5 + 365 * y2 + y2.div_euclid(4) - y2.div_euclid(100)
        + y2.div_euclid(400)
        - 32_045;
    jdn.max(i64::from(i32::MIN)).min(i64::from(i32::MAX)) as i32
}

/// Absolute number of days between two `(year, month, day)` dates.
///
/// The difference is taken in `i64` and saturated at `i32::MAX`, so even two
/// saturated Julian Day Numbers at opposite extremes cannot overflow.
fn days_between(a: (i32, u32, u32), b: (i32, u32, u32)) -> i32 {
    let gap = (i64::from(to_julian(a.0, a.1, a.2)) - i64::from(to_julian(b.0, b.1, b.2))).abs();
    gap.min(i64::from(i32::MAX)) as i32
}
55
56fn date_score(sa: &str, sb: &str) -> f32 {
57    if sa == sb { return 1.0; }
58    let (da, db) = match (parse_date(sa), parse_date(sb)) {
59        (Some(a), Some(b)) => (a, b),
60        _ => return 0.0,
61    };
62    let diff = days_between(da, db);
63    if diff == 0 { 1.0 }
64    else if diff <= 1 { 0.9 }
65    else if da.0 == db.0 && da.1 == db.1 { 0.75 }
66    else if da.0 == db.0 { 0.5 }
67    else if (da.0 - db.0).abs() <= 1 { 0.3 }
68    else { 0.0 }
69}
70
71impl SimilarityFn for DateSimilarity {
72    fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
73        let (sa, sb) = match (a, b) {
74            (FieldValue::Text(a), FieldValue::Text(b)) => (a.as_str(), b.as_str()),
75            _ => return 0.0,
76        };
77        date_score(sa, sb)
78    }
79    fn similarity_str(&self, a: &str, b: &str) -> f32 { date_score(a, b) }
80    fn field_kind(&self) -> FieldKind { FieldKind::Date }
81}
82
#[cfg(test)]
mod tests {
    use super::*;

    /// Wrap a string in a text field value.
    fn text(raw: &str) -> FieldValue {
        FieldValue::Text(raw.into())
    }

    /// Score two date strings through the public `similarity` entry point.
    fn score(lhs: &str, rhs: &str) -> f32 {
        DateSimilarity.similarity(&text(lhs), &text(rhs))
    }

    #[test]
    fn exact_date_match() {
        assert_eq!(score("1990-06-15", "1990-06-15"), 1.0);
    }

    #[test]
    fn off_by_one_day() {
        // One day later and one day earlier score the same.
        assert_eq!(score("1990-06-15", "1990-06-16"), 0.9);
        assert_eq!(score("1990-06-15", "1990-06-14"), 0.9);
    }

    #[test]
    fn same_year_month_different_day() {
        assert_eq!(score("1990-06-15", "1990-06-20"), 0.75);
    }

    #[test]
    fn same_year_different_month() {
        assert_eq!(score("1990-06-15", "1990-09-01"), 0.5);
    }

    #[test]
    fn age_compatible_within_one_year() {
        // Adjacent years in either direction.
        assert_eq!(score("1990-06-15", "1991-01-01"), 0.3);
        assert_eq!(score("1990-01-01", "1989-07-20"), 0.3);
    }

    #[test]
    fn completely_different_dates() {
        assert_eq!(score("1990-06-15", "1975-03-22"), 0.0);
    }

    #[test]
    fn missing_field_returns_zero() {
        let scorer = DateSimilarity;
        let present = text("1990-06-15");
        // A null value on either side never matches.
        assert_eq!(scorer.similarity(&FieldValue::Null, &present), 0.0);
        assert_eq!(scorer.similarity(&present, &FieldValue::Null), 0.0);
    }

    #[test]
    fn timestamp_date_part_comparison() {
        // ISO-8601 datetimes: only the part before `T` is compared, so the
        // differing times of day do not matter.
        assert_eq!(score("1990-06-15T08:30:00", "1990-06-15T14:00:00"), 1.0);
    }
}