Skip to main content

schemaorg_rs/validation/
mod.rs

1//! Schema.org vocabulary validation engine.
2//!
3//! Validates a [`StructuredDataGraph`]
4//! against the official Schema.org vocabulary definitions. Produces a
5//! [`ValidationResult`] containing typed diagnostics with severity levels,
6//! machine-readable codes, and human-readable messages.
7//!
8//! # Architecture
9//!
10//! ```text
11//! StructuredDataGraph
12//!     +---- for each SchemaNode:
13//!         +---- Type checker:     unknown, deprecated, pending
14//!         +---- Property checker: unknown, wrong domain, superseded
15//!         +---- Value checker:    type mismatch, coercion, enum validation
16//! ```
17//!
18//! All Schema.org knowledge is compiled into static lookup functions at
19//! build time -- see [`vocabulary`] for details.
20//!
21//! # Examples
22//!
23//! ```no_run
24//! # #[cfg(feature = "validation")]
25//! # {
26//! use schemaorg_rs::{extract_all, validation};
27//!
28//! let html = r#"<script type="application/ld+json">{
29//!   "@context": "https://schema.org",
30//!   "@type": "Product",
31//!   "name": "Widget"
32//! }</script>"#;
33//!
34//! let graph = extract_all(html).unwrap();
35//! let result = validation::validate(&graph);
36//!
37//! if result.has_errors() {
38//!     for diag in result.errors() {
39//!         eprintln!("{}: {}", diag.path, diag.message);
40//!     }
41//! }
42//! # }
43//! ```
44
45pub mod diagnostics;
46mod property_checker;
47mod type_checker;
48mod value_checker;
49
50pub use diagnostics::{DiagnosticCode, Severity, ValidationDiagnostic};
51
52use crate::graph::StructuredDataGraph;
53use crate::types::{SchemaNode, SchemaValue};
54use crate::vocabulary;
55
56/// Result of validating a [`StructuredDataGraph`] against Schema.org.
57///
58/// Contains all diagnostics found during validation, accessible via
59/// convenience methods for filtering by severity.
60#[derive(Debug, Clone, Default)]
61#[must_use]
62pub struct ValidationResult {
63    /// All diagnostics produced during validation.
64    pub diagnostics: Vec<ValidationDiagnostic>,
65}
66
67impl ValidationResult {
68    /// Returns an iterator over error-level diagnostics.
69    pub fn errors(&self) -> impl Iterator<Item = &ValidationDiagnostic> {
70        self.diagnostics
71            .iter()
72            .filter(|d| d.severity == Severity::Error)
73    }
74
75    /// Returns an iterator over warning-level diagnostics.
76    pub fn warnings(&self) -> impl Iterator<Item = &ValidationDiagnostic> {
77        self.diagnostics
78            .iter()
79            .filter(|d| d.severity == Severity::Warning)
80    }
81
82    /// Returns an iterator over info-level diagnostics.
83    pub fn infos(&self) -> impl Iterator<Item = &ValidationDiagnostic> {
84        self.diagnostics
85            .iter()
86            .filter(|d| d.severity == Severity::Info)
87    }
88
89    /// Returns `true` if any error-level diagnostics exist.
90    #[must_use]
91    pub fn has_errors(&self) -> bool {
92        self.diagnostics
93            .iter()
94            .any(|d| d.severity == Severity::Error)
95    }
96
97    /// Returns `true` if any warning-level diagnostics exist.
98    #[must_use]
99    pub fn has_warnings(&self) -> bool {
100        self.diagnostics
101            .iter()
102            .any(|d| d.severity == Severity::Warning)
103    }
104
105    /// Returns the total number of diagnostics.
106    #[must_use]
107    pub fn len(&self) -> usize {
108        self.diagnostics.len()
109    }
110
111    /// Returns `true` if no diagnostics were produced.
112    #[must_use]
113    pub fn is_empty(&self) -> bool {
114        self.diagnostics.is_empty()
115    }
116}
117
118/// Validates a [`StructuredDataGraph`] against the Schema.org vocabulary.
119///
120/// Checks all nodes for:
121/// - Unknown or deprecated types
122/// - Unknown or misplaced properties
123/// - Value type mismatches
124/// - Deprecated/superseded properties
125///
126/// # Examples
127///
128/// ```no_run
129/// # #[cfg(feature = "validation")]
130/// # {
131/// use schemaorg_rs::{extract_all, validation};
132///
133/// let html = r#"<script type="application/ld+json">{
134///   "@context": "https://schema.org",
135///   "@type": "Produc",
136///   "name": "Widget"
137/// }</script>"#;
138///
139/// let graph = extract_all(html).unwrap();
140/// let result = validation::validate(&graph);
141/// assert!(result.has_errors());
142/// # }
143/// ```
144pub fn validate(graph: &StructuredDataGraph) -> ValidationResult {
145    let mut diagnostics = Vec::new();
146    for node in &graph.nodes {
147        let type_label = if node.types.is_empty() {
148            "(unknown)".to_string()
149        } else {
150            node.types.join(", ")
151        };
152        validate_node(node, &type_label, &mut diagnostics);
153    }
154    ValidationResult { diagnostics }
155}
156
157/// Recursively validates a single [`SchemaNode`].
158fn validate_node(node: &SchemaNode, path: &str, diagnostics: &mut Vec<ValidationDiagnostic>) {
159    // 1. Check types
160    for type_name in &node.types {
161        type_checker::check_type(type_name, path, diagnostics);
162    }
163
164    // 2. Check properties
165    for (prop_name, values) in &node.properties {
166        let prop_path = format!("{path}.{prop_name}");
167        property_checker::check_property(prop_name, &node.types, &prop_path, diagnostics);
168
169        // 3. Check values
170        if let Some(prop_def) = vocabulary::lookup_property(prop_name) {
171            for (i, value) in values.iter().enumerate() {
172                let value_path = if values.len() > 1 {
173                    format!("{prop_path}[{i}]")
174                } else {
175                    prop_path.clone()
176                };
177
178                value_checker::check_value(
179                    value,
180                    prop_name,
181                    prop_def.expected_types,
182                    &value_path,
183                    diagnostics,
184                );
185
186                // 4. Recurse into nested nodes
187                if let SchemaValue::Node(nested) = value {
188                    let nested_type_label = if nested.types.is_empty() {
189                        format!("{value_path}.(unknown)")
190                    } else {
191                        format!("{value_path}.{}", nested.types.join(", "))
192                    };
193                    validate_node(nested, &nested_type_label, diagnostics);
194                }
195            }
196        }
197    }
198}
199
// "Did you mean?" suggestions
/// Maximum Levenshtein distance (in characters) for suggestions.
const MAX_DISTANCE: usize = 3;

/// Maximum length difference (in characters) for suggestions (filters noise).
const MAX_LENGTH_DIFF: usize = 3;

/// Suggests the closest match from a list of candidates using Levenshtein distance.
///
/// Candidates are first pre-filtered by length (at most [`MAX_LENGTH_DIFF`]
/// characters longer or shorter than `input`) and then ranked by
/// [`levenshtein`] distance; the first candidate with the smallest distance
/// wins.
///
/// A candidate that is byte-for-byte identical to `input` is never suggested.
/// A candidate that differs from `input` only in ASCII letter case *is*
/// suggested: its distance is 0 because the distance ignores ASCII case, but
/// it is still a different name and the most useful correction to offer.
///
/// Returns `None` if no candidate is within [`MAX_DISTANCE`].
pub(crate) fn suggest_similar<'a>(input: &str, candidates: &'a [&str]) -> Option<&'a str> {
    // Lengths are measured in characters so that they agree with the unit
    // used by `levenshtein`.
    let input_len = input.chars().count();

    candidates
        .iter()
        .copied()
        // Exclude exact matches only; case-only differences stay eligible.
        .filter(|candidate| *candidate != input)
        .filter(|candidate| {
            let candidate_len = candidate.chars().count();
            candidate_len.max(input_len) - candidate_len.min(input_len) <= MAX_LENGTH_DIFF
        })
        .map(|candidate| (candidate, levenshtein(input, candidate)))
        .filter(|&(_, distance)| distance <= MAX_DISTANCE)
        .min_by_key(|&(_, distance)| distance)
        .map(|(candidate, _)| candidate)
}

/// Computes the Levenshtein edit distance between two strings.
///
/// The distance is counted in Unicode scalar values (`char`s), not bytes, so
/// a single non-ASCII character counts as one edit. ASCII letters are compared
/// case-insensitively; all other characters must match exactly.
///
/// O(n*m) time, O(min(n,m)) space using a single-row optimization.
fn levenshtein(a: &str, b: &str) -> usize {
    let a_len = a.chars().count();
    let b_len = b.chars().count();

    if a_len == 0 {
        return b_len;
    }
    if b_len == 0 {
        return a_len;
    }

    // Buffer only the shorter string; the longer one is streamed.
    let (long, short) = if a_len < b_len { (b, a) } else { (a, b) };
    let short_chars: Vec<char> = short.chars().collect();
    let short_len = short_chars.len();

    // row[j] holds the distance between the processed prefix of `long` and
    // the first j characters of `short`.
    let mut row: Vec<usize> = (0..=short_len).collect();

    for (i, long_char) in long.chars().enumerate() {
        // `diagonal` carries the value of the previous row at column j.
        let mut diagonal = i;
        row[0] = i + 1;

        for (j, short_char) in short_chars.iter().enumerate() {
            let cost = usize::from(!long_char.eq_ignore_ascii_case(short_char));
            let best = (row[j + 1] + 1).min(row[j] + 1).min(diagonal + cost);
            diagonal = row[j + 1];
            row[j + 1] = best;
        }
    }

    row[short_len]
}
267
#[cfg(test)]
mod tests {
    use super::{levenshtein, suggest_similar};

    /// Spot-checks the edit distance, including empty inputs and the
    /// case-insensitive comparison.
    #[test]
    fn levenshtein_basic() {
        let cases = [
            ("", "", 0),
            ("a", "", 1),
            ("", "a", 1),
            ("kitten", "sitting", 3),
            ("Product", "Produc", 1),
            ("Product", "product", 0), // case-insensitive
            ("name", "namee", 1),
        ];

        for (left, right, expected) in cases {
            assert_eq!(levenshtein(left, right), expected);
        }
    }

    /// Small typos resolve to the nearest candidate.
    #[test]
    fn suggest_similar_finds_close_match() {
        let candidates = &["Product", "Person", "Place", "Event"];
        let cases = [
            ("Produc", "Product"),
            ("Prduct", "Product"),
            ("Perso", "Person"),
        ];

        for (typo, expected) in cases {
            assert_eq!(suggest_similar(typo, candidates), Some(expected));
        }
    }

    /// Inputs far from every candidate produce no suggestion.
    #[test]
    fn suggest_similar_none_for_distant() {
        let candidates = &["Product", "Person", "Place"];
        let suggestion = suggest_similar("XYZ123", candidates);
        assert_eq!(suggestion, None);
    }

    /// An input that already equals a candidate produces no suggestion.
    #[test]
    fn suggest_similar_none_for_exact() {
        let candidates = &["Product", "Person"];
        let suggestion = suggest_similar("Product", candidates);
        assert_eq!(suggestion, None);
    }
}