Skip to main content

shifty_engine/
value.rs

1//! Evaluation of value types `test(τ)` and the term ordering used by
2//! `sh:lessThan`, ranges, etc. Naive reference semantics.
3
4use bigdecimal::BigDecimal;
5use oxrdf::vocab::xsd;
6use oxrdf::{Literal, NamedNodeRef, Term};
7use oxsdatatypes::{
8    Boolean, Date, DateTime, DayTimeDuration, Double, Duration, Float, Time, YearMonthDuration,
9};
10use regex::Regex;
11use shifty_algebra::value_type::{Bound, ValueType};
12use std::cmp::Ordering;
13use std::str::FromStr;
14
15const XSD: &str = "http://www.w3.org/2001/XMLSchema#";
16
17/// Does `term` satisfy value type `τ`?
18pub fn value_type_holds(vt: &ValueType, term: &Term) -> bool {
19    match vt {
20        ValueType::Any => true,
21        ValueType::Datatype(dt) => {
22            matches!(term, Term::Literal(l) if l.datatype() == dt.as_ref() && valid_lexical_form(l))
23        }
24        ValueType::NumericRange { lo, hi } => {
25            lo.as_ref().is_none_or(|b| satisfies_lower(term, b))
26                && hi.as_ref().is_none_or(|b| satisfies_upper(term, b))
27        }
28        ValueType::Length { min, max } => match lexical(term) {
29            Some(s) => {
30                let len = s.chars().count() as u64;
31                min.is_none_or(|m| len >= m) && max.is_none_or(|m| len <= m)
32            }
33            None => false,
34        },
35        ValueType::Pattern { regex, flags } => match lexical(term) {
36            Some(s) => compile(regex, flags).is_some_and(|re| re.is_match(&s)),
37            None => false,
38        },
39        ValueType::LangIn(langs) => match term {
40            Term::Literal(l) => l
41                .language()
42                .is_some_and(|tag| langs.iter().any(|range| lang_matches(tag, range))),
43            _ => false,
44        },
45        ValueType::And(parts) => parts.iter().all(|p| value_type_holds(p, term)),
46    }
47}
48
49fn satisfies_lower(term: &Term, b: &Bound) -> bool {
50    match compare_terms(term, &Term::Literal(b.value.clone())) {
51        Some(Ordering::Greater) => true,
52        Some(Ordering::Equal) => b.inclusive,
53        _ => false,
54    }
55}
56
57fn satisfies_upper(term: &Term, b: &Bound) -> bool {
58    match compare_terms(term, &Term::Literal(b.value.clone())) {
59        Some(Ordering::Less) => true,
60        Some(Ordering::Equal) => b.inclusive,
61        _ => false,
62    }
63}
64
65/// Partial ordering over terms: numeric literals compare numerically,
66/// string-typed literals lexicographically, and the XSD date/time/duration
67/// family via their `oxsdatatypes` implementations (which correctly model the
68/// partial order for timezone-unaware comparisons and incomparable durations).
69pub fn compare_terms(a: &Term, b: &Term) -> Option<Ordering> {
70    if let (Term::Literal(la), Term::Literal(lb)) = (a, b) {
71        // When both operands are exact (integer-family or xsd:decimal), compare
72        // via BigDecimal so high-precision decimals and large integers (beyond
73        // f64's 2^53 of integer precision) are not collapsed by a lossy float
74        // conversion. Any pair involving xsd:float/xsd:double falls to the f64
75        // path, matching XPath/SPARQL promotion-to-floating semantics.
76        if let (Some(da), Some(db)) = (exact_decimal(la), exact_decimal(lb)) {
77            return Some(da.cmp(&db));
78        }
79        if let (Some(na), Some(nb)) = (numeric(la), numeric(lb)) {
80            return na.partial_cmp(&nb);
81        }
82        if is_string(la) && is_string(lb) {
83            return Some(la.value().cmp(lb.value()));
84        }
85        let (dta, dtb) = (la.datatype(), lb.datatype());
86        if dta == xsd::DATE_TIME && dtb == xsd::DATE_TIME {
87            let left = DateTime::from_str(la.value()).ok()?;
88            let right = DateTime::from_str(lb.value()).ok()?;
89            return left.partial_cmp(&right);
90        }
91        if dta == xsd::DATE && dtb == xsd::DATE {
92            let left = Date::from_str(la.value()).ok()?;
93            let right = Date::from_str(lb.value()).ok()?;
94            return left.partial_cmp(&right);
95        }
96        if dta == xsd::TIME && dtb == xsd::TIME {
97            let left = Time::from_str(la.value()).ok()?;
98            let right = Time::from_str(lb.value()).ok()?;
99            return left.partial_cmp(&right);
100        }
101        if is_duration_datatype(dta) && is_duration_datatype(dtb) {
102            // Parse via Duration for all three subtypes; Duration::partial_cmp
103            // correctly returns None for incomparable year-month vs day-time pairs.
104            let left = Duration::from_str(la.value()).ok()?;
105            let right = Duration::from_str(lb.value()).ok()?;
106            return left.partial_cmp(&right);
107        }
108    }
109    None
110}
111
112fn is_duration_datatype(dt: NamedNodeRef<'_>) -> bool {
113    dt == xsd::DURATION || dt == xsd::YEAR_MONTH_DURATION || dt == xsd::DAY_TIME_DURATION
114}
115
116fn valid_lexical_form(literal: &Literal) -> bool {
117    let datatype = literal.datatype();
118    let value = literal.value();
119    if datatype == xsd::BOOLEAN {
120        Boolean::from_str(value).is_ok()
121    } else if datatype == xsd::DECIMAL {
122        // xsd:decimal is arbitrary-precision, so validate the lexical form
123        // against the XSD grammar rather than parsing into oxsdatatypes::Decimal
124        // (a fixed-point i128 with ~18 fractional digits, which rejects valid
125        // high-precision values such as QUDT conversion multipliers).
126        is_decimal_lexical(value)
127    } else if datatype == xsd::FLOAT {
128        Float::from_str(value).is_ok()
129    } else if datatype == xsd::DOUBLE {
130        Double::from_str(value).is_ok()
131    } else if datatype == xsd::DATE_TIME {
132        DateTime::from_str(value).is_ok()
133    } else if datatype == xsd::DATE {
134        Date::from_str(value).is_ok()
135    } else if datatype == xsd::TIME {
136        Time::from_str(value).is_ok()
137    } else if datatype == xsd::DURATION {
138        Duration::from_str(value).is_ok()
139    } else if datatype == xsd::YEAR_MONTH_DURATION {
140        YearMonthDuration::from_str(value).is_ok()
141    } else if datatype == xsd::DAY_TIME_DURATION {
142        DayTimeDuration::from_str(value).is_ok()
143    } else if is_integer_datatype(datatype) {
144        integer_in_datatype_range(value, datatype)
145    } else {
146        true
147    }
148}
149
150/// Lexical validity for `xsd:decimal`: `[+-]? ( \d+ (\.\d*)? | \.\d+ )`, i.e.
151/// an optional sign and at least one digit, with an optional single decimal
152/// point. Unlike `oxsdatatypes::Decimal::from_str` this imposes no precision
153/// bound, and unlike a numeric parse it rejects exponents (`1e5` is a valid
154/// `xsd:double` but not a valid `xsd:decimal`). XSD `whiteSpace=collapse`
155/// allows surrounding whitespace, which we trim first.
156fn is_decimal_lexical(value: &str) -> bool {
157    let value = value.trim();
158    let digits = value.strip_prefix(['+', '-']).unwrap_or(value);
159    let mut seen_digit = false;
160    let mut seen_dot = false;
161    for byte in digits.bytes() {
162        match byte {
163            b'0'..=b'9' => seen_digit = true,
164            b'.' if !seen_dot => seen_dot = true,
165            _ => return false,
166        }
167    }
168    seen_digit
169}
170
171fn is_integer_datatype(datatype: NamedNodeRef<'_>) -> bool {
172    matches!(
173        datatype,
174        xsd::INTEGER
175            | xsd::LONG
176            | xsd::INT
177            | xsd::SHORT
178            | xsd::BYTE
179            | xsd::NON_NEGATIVE_INTEGER
180            | xsd::NON_POSITIVE_INTEGER
181            | xsd::NEGATIVE_INTEGER
182            | xsd::POSITIVE_INTEGER
183            | xsd::UNSIGNED_LONG
184            | xsd::UNSIGNED_INT
185            | xsd::UNSIGNED_SHORT
186            | xsd::UNSIGNED_BYTE
187    )
188}
189
190fn integer_in_datatype_range(value: &str, datatype: NamedNodeRef<'_>) -> bool {
191    let value = value.trim();
192    let digits = value.strip_prefix(['+', '-']).unwrap_or(value);
193    if digits.is_empty() || !digits.bytes().all(|byte| byte.is_ascii_digit()) {
194        return false;
195    }
196    if datatype == xsd::INTEGER {
197        return true;
198    }
199    let Ok(number) = value.parse::<i128>() else {
200        return false;
201    };
202    match datatype {
203        xsd::LONG => number >= i64::MIN.into() && number <= i64::MAX.into(),
204        xsd::INT => number >= i32::MIN.into() && number <= i32::MAX.into(),
205        xsd::SHORT => number >= i16::MIN.into() && number <= i16::MAX.into(),
206        xsd::BYTE => number >= i8::MIN.into() && number <= i8::MAX.into(),
207        xsd::NON_NEGATIVE_INTEGER => number >= 0,
208        xsd::NON_POSITIVE_INTEGER => number <= 0,
209        xsd::NEGATIVE_INTEGER => number < 0,
210        xsd::POSITIVE_INTEGER => number > 0,
211        xsd::UNSIGNED_LONG => number >= 0 && number <= u64::MAX.into(),
212        xsd::UNSIGNED_INT => number >= 0 && number <= u32::MAX.into(),
213        xsd::UNSIGNED_SHORT => number >= 0 && number <= u16::MAX.into(),
214        xsd::UNSIGNED_BYTE => number >= 0 && number <= u8::MAX.into(),
215        _ => false,
216    }
217}
218
219/// Exact value of an integer-family or `xsd:decimal` literal as a `BigDecimal`,
220/// or `None` for `float`/`double` and non-numeric literals. This is only used
221/// when *both* operands are exact: per XPath/SPARQL numeric semantics, a
222/// comparison involving an `xsd:float`/`xsd:double` promotes the other operand
223/// to that floating type, so any pair touching a float stays on the lossy `f64`
224/// path rather than being compared exactly.
225fn exact_decimal(l: &Literal) -> Option<BigDecimal> {
226    let dt = l.datatype();
227    if dt == xsd::DECIMAL || is_integer_datatype(dt) {
228        BigDecimal::from_str(l.value().trim()).ok()
229    } else {
230        None
231    }
232}
233
234fn numeric(l: &Literal) -> Option<f64> {
235    is_numeric_datatype(l)
236        .then(|| l.value().parse::<f64>().ok())
237        .flatten()
238}
239
240fn is_numeric_datatype(l: &Literal) -> bool {
241    let dt = l.datatype().as_str();
242    let Some(local) = dt.strip_prefix(XSD) else {
243        return false;
244    };
245    matches!(
246        local,
247        "integer"
248            | "decimal"
249            | "float"
250            | "double"
251            | "long"
252            | "int"
253            | "short"
254            | "byte"
255            | "nonNegativeInteger"
256            | "nonPositiveInteger"
257            | "negativeInteger"
258            | "positiveInteger"
259            | "unsignedLong"
260            | "unsignedInt"
261            | "unsignedShort"
262            | "unsignedByte"
263    )
264}
265
266fn is_string(l: &Literal) -> bool {
267    l.datatype() == xsd::STRING
268}
269
270/// The lexical form to which string facets apply: a literal's value or an IRI;
271/// blank nodes have none.
272fn lexical(term: &Term) -> Option<String> {
273    match term {
274        Term::Literal(l) => Some(l.value().to_string()),
275        Term::NamedNode(n) => Some(n.as_str().to_string()),
276        Term::BlankNode(_) => None,
277    }
278}
279
280/// BCP47 basic language-range match (case-insensitive prefix), plus `*`.
281fn lang_matches(tag: &str, range: &str) -> bool {
282    range == "*"
283        || tag.eq_ignore_ascii_case(range)
284        || tag
285            .to_ascii_lowercase()
286            .starts_with(&format!("{}-", range.to_ascii_lowercase()))
287}
288
289/// Compile a SHACL `sh:pattern` with the supported `sh:flags` subset (i,s,m,x).
290fn compile(regex: &str, flags: &str) -> Option<Regex> {
291    let active: String = flags.chars().filter(|c| "ismx".contains(*c)).collect();
292    let pattern = if active.is_empty() {
293        regex.to_string()
294    } else {
295        format!("(?{active}){regex}")
296    };
297    Regex::new(&pattern).ok()
298}
299
300#[cfg(test)]
301mod tests {
302    use super::*;
303    use oxrdf::Literal;
304    use std::cmp::Ordering;
305
306    fn lit(value: &str, datatype: &str) -> Term {
307        Term::Literal(Literal::new_typed_literal(
308            value,
309            oxrdf::NamedNode::new(datatype).unwrap(),
310        ))
311    }
312    fn date(v: &str) -> Term {
313        lit(v, "http://www.w3.org/2001/XMLSchema#date")
314    }
315    fn time(v: &str) -> Term {
316        lit(v, "http://www.w3.org/2001/XMLSchema#time")
317    }
318    fn dur(v: &str) -> Term {
319        lit(v, "http://www.w3.org/2001/XMLSchema#duration")
320    }
321    fn ymd(v: &str) -> Term {
322        lit(v, "http://www.w3.org/2001/XMLSchema#yearMonthDuration")
323    }
324    fn dtd(v: &str) -> Term {
325        lit(v, "http://www.w3.org/2001/XMLSchema#dayTimeDuration")
326    }
327
328    #[test]
329    fn date_ordering() {
330        assert_eq!(
331            compare_terms(&date("2020-01-01"), &date("2020-06-01")),
332            Some(Ordering::Less)
333        );
334        assert_eq!(
335            compare_terms(&date("2020-06-01"), &date("2020-01-01")),
336            Some(Ordering::Greater)
337        );
338        assert_eq!(
339            compare_terms(&date("2020-03-15"), &date("2020-03-15")),
340            Some(Ordering::Equal)
341        );
342    }
343
344    #[test]
345    fn time_ordering() {
346        assert_eq!(
347            compare_terms(&time("08:00:00"), &time("17:30:00")),
348            Some(Ordering::Less)
349        );
350        assert_eq!(
351            compare_terms(&time("23:59:59"), &time("00:00:00")),
352            Some(Ordering::Greater)
353        );
354        assert_eq!(
355            compare_terms(&time("12:00:00"), &time("12:00:00")),
356            Some(Ordering::Equal)
357        );
358    }
359
360    #[test]
361    fn duration_ordering() {
362        assert_eq!(
363            compare_terms(&dur("P1Y"), &dur("P2Y")),
364            Some(Ordering::Less)
365        );
366        assert_eq!(
367            compare_terms(&dur("P30D"), &dur("P1D")),
368            Some(Ordering::Greater)
369        );
370        assert_eq!(
371            compare_terms(&dur("P1Y"), &dur("P1Y")),
372            Some(Ordering::Equal)
373        );
374    }
375
376    #[test]
377    fn year_month_duration_ordering() {
378        assert_eq!(
379            compare_terms(&ymd("P1Y"), &ymd("P13M")),
380            Some(Ordering::Less)
381        );
382        assert_eq!(
383            compare_terms(&ymd("P2Y"), &ymd("P1Y")),
384            Some(Ordering::Greater)
385        );
386    }
387
388    #[test]
389    fn day_time_duration_ordering() {
390        assert_eq!(
391            compare_terms(&dtd("PT1H"), &dtd("PT2H")),
392            Some(Ordering::Less)
393        );
394        assert_eq!(
395            compare_terms(&dtd("P2D"), &dtd("P1D")),
396            Some(Ordering::Greater)
397        );
398    }
399
400    #[test]
401    fn genuinely_incomparable_durations() {
402        // P1M and P30D are incomparable: a month is 28–31 days so no consistent ordering
403        assert_eq!(compare_terms(&dur("P1M"), &dur("P30D")), None);
404    }
405
406    #[test]
407    fn cross_type_incomparable() {
408        assert_eq!(compare_terms(&date("2020-01-01"), &time("12:00:00")), None);
409        assert_eq!(compare_terms(&date("2020-01-01"), &dur("P1Y")), None);
410    }
411
412    fn decimal(v: &str) -> Term {
413        lit(v, "http://www.w3.org/2001/XMLSchema#decimal")
414    }
415
416    #[test]
417    fn high_precision_decimal_is_a_valid_lexical_form() {
418        // QUDT conversion multipliers carry more fractional digits than the
419        // fixed-point oxsdatatypes::Decimal can hold; xsd:decimal is
420        // arbitrary-precision, so the value type must still hold.
421        let dt = ValueType::Datatype(
422            oxrdf::NamedNode::new("http://www.w3.org/2001/XMLSchema#decimal").unwrap(),
423        );
424        let term = decimal("0.0004719474432000000000000000000000001");
425        assert!(value_type_holds(&dt, &term));
426    }
427
428    #[test]
429    fn decimal_lexical_forms() {
430        assert!(is_decimal_lexical(
431            "0.0004719474432000000000000000000000001"
432        ));
433        assert!(is_decimal_lexical("123"));
434        assert!(is_decimal_lexical("-1.5"));
435        assert!(is_decimal_lexical("+.5"));
436        assert!(is_decimal_lexical("1."));
437        assert!(is_decimal_lexical("  42  "));
438        // Exponents are xsd:double, not xsd:decimal.
439        assert!(!is_decimal_lexical("1.5e10"));
440        assert!(!is_decimal_lexical("."));
441        assert!(!is_decimal_lexical("+"));
442        assert!(!is_decimal_lexical("1.2.3"));
443        assert!(!is_decimal_lexical("abc"));
444    }
445
446    #[test]
447    fn high_precision_decimals_compare_exactly() {
448        // These two differ only in the 37th fractional digit; an f64 round-trip
449        // would collapse them to equal.
450        let a = decimal("0.0004719474432000000000000000000000001");
451        let b = decimal("0.0004719474432000000000000000000000002");
452        assert_eq!(compare_terms(&a, &b), Some(Ordering::Less));
453        assert_eq!(compare_terms(&a, &a.clone()), Some(Ordering::Equal));
454    }
455
456    #[test]
457    fn decimal_vs_double_promotes_to_double() {
458        // XPath/SPARQL semantics: a decimal compared with a double is promoted
459        // to double, so decimal 0.1 and double 0.1 round to the same f64 and
460        // compare equal rather than on their exact (differing) values.
461        let double = lit("0.1", "http://www.w3.org/2001/XMLSchema#double");
462        assert_eq!(
463            compare_terms(&decimal("0.1"), &double),
464            Some(Ordering::Equal)
465        );
466    }
467
468    #[test]
469    fn double_comparisons_still_order_and_handle_nan() {
470        let inf = lit("INF", "http://www.w3.org/2001/XMLSchema#double");
471        assert_eq!(
472            compare_terms(&decimal("1000000000"), &inf),
473            Some(Ordering::Less)
474        );
475        let nan = lit("NaN", "http://www.w3.org/2001/XMLSchema#double");
476        assert_eq!(compare_terms(&decimal("1"), &nan), None);
477    }
478
479    #[test]
480    fn large_integers_compare_exactly() {
481        // Both round to the same f64 (> 2^53) but are distinct integers.
482        let a = lit(
483            "9007199254740993",
484            "http://www.w3.org/2001/XMLSchema#integer",
485        );
486        let b = lit(
487            "9007199254740992",
488            "http://www.w3.org/2001/XMLSchema#integer",
489        );
490        assert_eq!(compare_terms(&a, &b), Some(Ordering::Greater));
491    }
492
493    #[test]
494    fn valid_lexical_form_date() {
495        let good = Literal::new_typed_literal(
496            "2020-01-01",
497            oxrdf::NamedNode::new("http://www.w3.org/2001/XMLSchema#date").unwrap(),
498        );
499        let bad = Literal::new_typed_literal(
500            "not-a-date",
501            oxrdf::NamedNode::new("http://www.w3.org/2001/XMLSchema#date").unwrap(),
502        );
503        assert!(valid_lexical_form(&good));
504        assert!(!valid_lexical_form(&bad));
505    }
506
507    #[test]
508    fn valid_lexical_form_duration() {
509        let good = Literal::new_typed_literal(
510            "P1Y2M3DT4H",
511            oxrdf::NamedNode::new("http://www.w3.org/2001/XMLSchema#duration").unwrap(),
512        );
513        let bad = Literal::new_typed_literal(
514            "not-a-duration",
515            oxrdf::NamedNode::new("http://www.w3.org/2001/XMLSchema#duration").unwrap(),
516        );
517        assert!(valid_lexical_form(&good));
518        assert!(!valid_lexical_form(&bad));
519    }
520}