Skip to main content

schemaorg_rs/extraction/
mod.rs

1//! Structured data extractors for JSON-LD, Microdata, and `RDFa` Lite.
2//!
3//! This module provides the [`Extractor`] trait and concrete implementations
4//! for each structured data format:
5//!
6//! - [`JsonLdExtractor`] -- `<script type="application/ld+json">` tags
7//! - [`MicrodataExtractor`] -- `itemscope`/`itemprop` attributes
8//! - [`RdfaLiteExtractor`] -- `vocab`/`typeof`/`property` attributes
9//!
10//! Each extractor produces an [`ExtractionOutput`] containing extracted
11//! [`SchemaNode`]s and any non-fatal warnings. For most use cases, prefer
12//! [`extract_all`](crate::graph::extract_all) which runs all extractors
13//! and merges results.
14//!
15//! # Examples
16//!
17//! ```
18//! use schemaorg_rs::extraction::{Extractor, JsonLdExtractor};
19//!
20//! let html = r#"<html><head>
21//! <script type="application/ld+json">{
22//!   "@context": "https://schema.org",
23//!   "@type": "Product",
24//!   "name": "Widget"
25//! }</script>
26//! </head></html>"#;
27//!
28//! let output = JsonLdExtractor.extract(html).unwrap();
29//! assert_eq!(output.nodes.len(), 1);
30//! ```
31
32mod jsonld;
33mod microdata;
34mod rdfa;
35
36pub use jsonld::JsonLdExtractor;
37pub use microdata::MicrodataExtractor;
38pub use rdfa::RdfaLiteExtractor;
39
40use std::borrow::Cow;
41
42use serde::{Deserialize, Serialize};
43
44use crate::error::{ExtractionError, ExtractionWarning};
45use crate::types::{SchemaNode, SchemaValue};
46
47/// Output from a single extractor run.
48///
49/// Contains the extracted nodes and any non-fatal warnings encountered
50/// during extraction.
51///
52/// # Examples
53///
54/// ```
55/// use schemaorg_rs::extraction::{Extractor, JsonLdExtractor};
56///
57/// let output = JsonLdExtractor.extract("<html></html>").unwrap();
58/// assert!(output.nodes.is_empty());
59/// ```
60#[must_use]
61#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
62pub struct ExtractionOutput {
63    /// Extracted structured data nodes.
64    pub nodes: Vec<SchemaNode>,
65    /// Non-fatal warnings encountered during extraction.
66    pub warnings: Vec<ExtractionWarning>,
67}
68
69/// Trait implemented by each extraction format (JSON-LD, Microdata, `RDFa`).
70///
71/// Provides a unified interface for extracting structured data from raw HTML.
72/// Each implementation parses the HTML internally using `scraper`.
73///
74/// For better performance when running multiple extractors, use the
75/// format-specific `extract_from_document()` methods which accept a
76/// pre-parsed `scraper::Html` document.
77///
78/// # Examples
79///
80/// ```
81/// use schemaorg_rs::extraction::{Extractor, MicrodataExtractor};
82///
83/// let html = r#"<html><body>
84/// <div itemscope itemtype="https://schema.org/Product">
85/// <span itemprop="name">Widget</span>
86/// </div>
87/// </body></html>"#;
88///
89/// let output = MicrodataExtractor.extract(html).unwrap();
90/// assert_eq!(output.nodes[0].types, vec!["Product"]);
91/// ```
92pub trait Extractor: Send + Sync {
93    /// Extracts structured data nodes from an HTML document.
94    ///
95    /// # Errors
96    ///
97    /// Returns [`ExtractionError`] if a fatal error prevents extraction.
98    /// Most issues are captured as warnings in the returned
99    /// [`ExtractionOutput`] instead.
100    fn extract(&self, html: &str) -> Result<ExtractionOutput, ExtractionError>;
101}
102
103// Shared helpers used by all three extractors
/// Prefixes that identify a Schema.org term: the HTTPS and HTTP vocabulary
/// URLs and the compact `schema:` form. They are tried in this order.
const SCHEMA_PREFIXES: &[&str] = &["https://schema.org/", "http://schema.org/", "schema:"];

/// Reduces a Schema.org type or property identifier to its local name.
///
/// The first entry of [`SCHEMA_PREFIXES`] that `name` starts with is removed
/// (at most one prefix is stripped). If none matches, `name` is returned
/// unchanged. The result is always a `Cow::Borrowed` slice of the input, so
/// this function never allocates.
///
/// `https://schema.org/Product` -> `Product`,
/// `http://schema.org/Product` -> `Product`, `schema:Product` -> `Product`,
/// `Product` -> `Product`.
pub(crate) fn strip_schema_prefix(name: &str) -> Cow<'_, str> {
    let local_name = SCHEMA_PREFIXES
        .iter()
        .find_map(|prefix| name.strip_prefix(*prefix))
        .unwrap_or(name);
    Cow::Borrowed(local_name)
}
123
124/// Classifies a text string as [`SchemaValue::Url`], [`SchemaValue::DateTime`],
125/// or [`SchemaValue::Text`].
126///
127/// Uses heuristics:
128/// - Starts with `http://`, `https://`, or `mailto:` -> `Url`
129/// - Matches ISO 8601 date pattern (`YYYY-MM-DD`) -> `DateTime`
130/// - Everything else -> `Text`
131pub(crate) fn classify_text_value(s: &str) -> SchemaValue {
132    if s.starts_with("http://") || s.starts_with("https://") || s.starts_with("mailto:") {
133        return SchemaValue::Url(s.to_string());
134    }
135    // Dates always start with a digit; skip the full check for plain text
136    if s.as_bytes().first().is_some_and(u8::is_ascii_digit) && is_iso_datetime(s) {
137        return SchemaValue::DateTime(s.to_string());
138    }
139    SchemaValue::Text(s.to_string())
140}
141
/// Reports whether a string has the shape of an ISO 8601 date or datetime
/// (`YYYY-MM-DD...`).
///
/// The first ten bytes must be `DDDD-DD-DD` (ASCII digits and hyphens) with a
/// month in 01-12 and a day in 01-31. Calendar-aware validation (days per
/// month, leap years, etc.) is deferred to M2.
///
/// The string must either end right after the date or continue with one of
/// `T`, `t`, `Z`, `+`, `-`. Only that single following byte is inspected;
/// whatever comes after it is not validated.
pub(crate) fn is_iso_datetime(s: &str) -> bool {
    /// Reads two ASCII digits as a number, or `None` if either is not a digit.
    fn two_digits(pair: &[u8]) -> Option<u8> {
        match *pair {
            [tens, ones] if tens.is_ascii_digit() && ones.is_ascii_digit() => {
                Some((tens - b'0') * 10 + (ones - b'0'))
            }
            _ => None,
        }
    }

    let raw = s.as_bytes();

    // Anything shorter than `YYYY-MM-DD` cannot be a date.
    let Some(date) = raw.get(..10) else {
        return false;
    };

    let year_is_numeric = date[..4].iter().all(u8::is_ascii_digit);
    let hyphens_in_place = date[4] == b'-' && date[7] == b'-';
    if !(year_is_numeric && hyphens_in_place) {
        return false;
    }

    let (Some(month), Some(day)) = (two_digits(&date[5..7]), two_digits(&date[8..10])) else {
        return false;
    };

    // Coarse range checks only: month 01-12, day 01-31.
    if month == 0 || month > 12 || day == 0 || day > 31 {
        return false;
    }

    // A bare date is fine; otherwise the next byte must start a time part or
    // a timezone designator.
    raw.get(10)
        .map_or(true, |&next| matches!(next, b'T' | b't' | b'Z' | b'+' | b'-'))
}
176
#[cfg(test)]
mod common_tests {
    use super::*;

    /// Known prefixes are removed, plain names pass through, and the result
    /// is always borrowed.
    #[test]
    fn strip_schema_prefixes() {
        let cases = [
            ("Product", "Product"),
            ("https://schema.org/Product", "Product"),
            ("http://schema.org/Product", "Product"),
            ("schema:Product", "Product"),
        ];
        for (input, expected) in cases {
            assert_eq!(strip_schema_prefix(input).as_ref(), expected);
        }

        // Both paths return Cow::Borrowed (zero allocation)
        for input in ["Product", "https://schema.org/Product"] {
            assert!(matches!(strip_schema_prefix(input), Cow::Borrowed(_)));
        }
    }

    /// Each heuristic branch yields the matching `SchemaValue` variant.
    #[test]
    fn classify_text_values() {
        let cases = [
            ("hello", SchemaValue::Text("hello".into())),
            (
                "https://example.com",
                SchemaValue::Url("https://example.com".into()),
            ),
            ("2024-01-15", SchemaValue::DateTime("2024-01-15".into())),
            (
                "2024-01-15T10:30:00Z",
                SchemaValue::DateTime("2024-01-15T10:30:00Z".into()),
            ),
        ];
        for (input, expected) in cases {
            assert_eq!(classify_text_value(input), expected);
        }
    }

    /// Covers the accepted and rejected shapes of `is_iso_datetime`.
    #[test]
    fn iso_datetime_detection() {
        let accepted = [
            "2024-01-15",
            "2024-01-15T10:30:00",
            "2024-01-15T10:30:00Z",
            // Valid edge cases
            "2024-01-01",
            "2024-12-31",
            // Timezone offsets
            "2024-01-15+02:00",
            "2024-01-15-05:00",
            "2024-01-15Z",
            // Trailing T with no time part is accepted (valid ISO 8601 date indicator)
            "2024-01-15T",
        ];
        for input in accepted {
            assert!(is_iso_datetime(input));
        }

        let rejected = [
            "hello",
            "2024",
            "not-a-date",
            // Invalid ranges
            "2024-13-15",
            "2024-00-15",
            "2024-01-00",
            "2024-01-32",
            // Space is a valid ISO 8601 separator but we reject it to avoid
            // false positives on sentences that start with a date.
            "2024-01-15 10:30:00",
            "2024-01-15 is the deadline",
            "2024-01-15abc",
            "2024-01-15.",
        ];
        for input in rejected {
            assert!(!is_iso_datetime(input));
        }
    }
}