schemaorg_rs/extraction/mod.rs
1//! Structured data extractors for JSON-LD, Microdata, and `RDFa` Lite.
2//!
3//! This module provides the [`Extractor`] trait and concrete implementations
4//! for each structured data format:
5//!
6//! - [`JsonLdExtractor`] -- `<script type="application/ld+json">` tags
7//! - [`MicrodataExtractor`] -- `itemscope`/`itemprop` attributes
8//! - [`RdfaLiteExtractor`] -- `vocab`/`typeof`/`property` attributes
9//!
10//! Each extractor produces an [`ExtractionOutput`] containing extracted
11//! [`SchemaNode`]s and any non-fatal warnings. For most use cases, prefer
12//! [`extract_all`](crate::graph::extract_all) which runs all extractors
13//! and merges results.
14//!
15//! # Examples
16//!
17//! ```
18//! use schemaorg_rs::extraction::{Extractor, JsonLdExtractor};
19//!
20//! let html = r#"<html><head>
21//! <script type="application/ld+json">{
22//! "@context": "https://schema.org",
23//! "@type": "Product",
24//! "name": "Widget"
25//! }</script>
26//! </head></html>"#;
27//!
28//! let output = JsonLdExtractor.extract(html).unwrap();
29//! assert_eq!(output.nodes.len(), 1);
30//! ```
31
32mod jsonld;
33mod microdata;
34mod rdfa;
35
36pub use jsonld::JsonLdExtractor;
37pub use microdata::MicrodataExtractor;
38pub use rdfa::RdfaLiteExtractor;
39
40use std::borrow::Cow;
41
42use serde::{Deserialize, Serialize};
43
44use crate::error::{ExtractionError, ExtractionWarning};
45use crate::types::{SchemaNode, SchemaValue};
46
47/// Output from a single extractor run.
48///
49/// Contains the extracted nodes and any non-fatal warnings encountered
50/// during extraction.
51///
52/// # Examples
53///
54/// ```
55/// use schemaorg_rs::extraction::{Extractor, JsonLdExtractor};
56///
57/// let output = JsonLdExtractor.extract("<html></html>").unwrap();
58/// assert!(output.nodes.is_empty());
59/// ```
60#[must_use]
61#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
62pub struct ExtractionOutput {
63 /// Extracted structured data nodes.
64 pub nodes: Vec<SchemaNode>,
65 /// Non-fatal warnings encountered during extraction.
66 pub warnings: Vec<ExtractionWarning>,
67}
68
69/// Trait implemented by each extraction format (JSON-LD, Microdata, `RDFa`).
70///
71/// Provides a unified interface for extracting structured data from raw HTML.
72/// Each implementation parses the HTML internally using `scraper`.
73///
74/// For better performance when running multiple extractors, use the
75/// format-specific `extract_from_document()` methods which accept a
76/// pre-parsed `scraper::Html` document.
77///
78/// # Examples
79///
80/// ```
81/// use schemaorg_rs::extraction::{Extractor, MicrodataExtractor};
82///
83/// let html = r#"<html><body>
84/// <div itemscope itemtype="https://schema.org/Product">
85/// <span itemprop="name">Widget</span>
86/// </div>
87/// </body></html>"#;
88///
89/// let output = MicrodataExtractor.extract(html).unwrap();
90/// assert_eq!(output.nodes[0].types, vec!["Product"]);
91/// ```
92pub trait Extractor: Send + Sync {
93 /// Extracts structured data nodes from an HTML document.
94 ///
95 /// # Errors
96 ///
97 /// Returns [`ExtractionError`] if a fatal error prevents extraction.
98 /// Most issues are captured as warnings in the returned
99 /// [`ExtractionOutput`] instead.
100 fn extract(&self, html: &str) -> Result<ExtractionOutput, ExtractionError>;
101}
102
103// Shared helpers used by all three extractors
/// Prefixes that mark a Schema.org type name or property URI; tried in order.
const SCHEMA_PREFIXES: &[&str] = &["https://schema.org/", "http://schema.org/", "schema:"];

/// Reduces a Schema.org type or property identifier to its local name.
///
/// The first entry of [`SCHEMA_PREFIXES`] that `name` starts with is cut
/// off; when none applies, `name` is handed back untouched. Either way the
/// result borrows from `name`, so no allocation takes place.
///
/// `https://schema.org/Product` -> `Product`,
/// `http://schema.org/Product` -> `Product`, `schema:Product` -> `Product`,
/// `Product` -> `Product`.
pub(crate) fn strip_schema_prefix(name: &str) -> Cow<'_, str> {
    let local_name = SCHEMA_PREFIXES
        .iter()
        .find_map(|prefix| name.strip_prefix(*prefix))
        .unwrap_or(name);
    Cow::Borrowed(local_name)
}
123
124/// Classifies a text string as [`SchemaValue::Url`], [`SchemaValue::DateTime`],
125/// or [`SchemaValue::Text`].
126///
127/// Uses heuristics:
128/// - Starts with `http://`, `https://`, or `mailto:` -> `Url`
129/// - Matches ISO 8601 date pattern (`YYYY-MM-DD`) -> `DateTime`
130/// - Everything else -> `Text`
131pub(crate) fn classify_text_value(s: &str) -> SchemaValue {
132 if s.starts_with("http://") || s.starts_with("https://") || s.starts_with("mailto:") {
133 return SchemaValue::Url(s.to_string());
134 }
135 // Dates always start with a digit; skip the full check for plain text
136 if s.as_bytes().first().is_some_and(u8::is_ascii_digit) && is_iso_datetime(s) {
137 return SchemaValue::DateTime(s.to_string());
138 }
139 SchemaValue::Text(s.to_string())
140}
141
/// Checks if a string matches an ISO 8601 date/datetime pattern (`YYYY-MM-DD...`).
///
/// The first ten bytes must have the shape `YYYY-MM-DD` with month in
/// `01..=12` and day in `01..=31`. Calendar validity (month lengths, leap
/// years) is not checked.
///
/// Whatever follows the date must look like the start of a time or zone
/// designator:
/// - nothing at all (a plain date),
/// - `T`/`t`, either alone or followed by a digit (start of the time part),
/// - `Z` and nothing else,
/// - `+`/`-` followed by a digit (start of a UTC offset).
///
/// Only the first byte after the separator is inspected; the remainder of
/// the time/offset is not validated. Requiring that digit keeps date-prefixed
/// identifiers such as `2024-01-15-release-notes` or `2024-01-15Thursday`
/// from being mistaken for datetimes. A space separator is rejected on
/// purpose to avoid matching sentences that merely start with a date.
pub(crate) fn is_iso_datetime(s: &str) -> bool {
    let bytes = s.as_bytes();
    if bytes.len() < 10 {
        return false;
    }
    // Work on bytes throughout so non-ASCII input can never cause a
    // char-boundary panic.
    let (date, rest) = bytes.split_at(10);

    let well_formed = date[0..4].iter().all(u8::is_ascii_digit)
        && date[4] == b'-'
        && date[5..7].iter().all(u8::is_ascii_digit)
        && date[7] == b'-'
        && date[8..10].iter().all(u8::is_ascii_digit);
    if !well_formed {
        return false;
    }

    // Range checks: month 01-12, day 01-31. Both values are at most 99,
    // so the u8 arithmetic cannot overflow.
    let month = (date[5] - b'0') * 10 + (date[6] - b'0');
    let day = (date[8] - b'0') * 10 + (date[9] - b'0');
    if !(1..=12).contains(&month) || !(1..=31).contains(&day) {
        return false;
    }

    match rest {
        // Plain date, bare trailing `T`, or bare UTC designator.
        [] | [b'T' | b't'] | [b'Z'] => true,
        // A time part or UTC offset has to start with a digit.
        [b'T' | b't' | b'+' | b'-', next, ..] => next.is_ascii_digit(),
        _ => false,
    }
}
176
#[cfg(test)]
mod common_tests {
    use super::*;

    /// Every supported spelling of a type collapses to its local name, and
    /// the result is always borrowed.
    #[test]
    fn strip_schema_prefixes() {
        let spellings = [
            "Product",
            "https://schema.org/Product",
            "http://schema.org/Product",
            "schema:Product",
        ];
        for spelling in spellings {
            assert_eq!(strip_schema_prefix(spelling).as_ref(), "Product");
        }

        // Both paths return Cow::Borrowed (zero allocation)
        for spelling in ["Product", "https://schema.org/Product"] {
            assert!(matches!(strip_schema_prefix(spelling), Cow::Borrowed(_)));
        }
    }

    /// One representative input per `SchemaValue` classification.
    #[test]
    fn classify_text_values() {
        let cases = [
            ("hello", SchemaValue::Text("hello".into())),
            (
                "https://example.com",
                SchemaValue::Url("https://example.com".into()),
            ),
            ("2024-01-15", SchemaValue::DateTime("2024-01-15".into())),
            (
                "2024-01-15T10:30:00Z",
                SchemaValue::DateTime("2024-01-15T10:30:00Z".into()),
            ),
        ];
        for (input, expected) in cases {
            assert_eq!(classify_text_value(input), expected);
        }
    }

    /// Accepted and rejected inputs for the ISO 8601 heuristic.
    #[test]
    fn iso_datetime_detection() {
        let accepted = [
            "2024-01-15",
            "2024-01-15T10:30:00",
            "2024-01-15T10:30:00Z",
            // Valid edge cases
            "2024-01-01",
            "2024-12-31",
            // Timezone offsets
            "2024-01-15+02:00",
            "2024-01-15-05:00",
            "2024-01-15Z",
            // Trailing T with no time part is accepted (valid ISO 8601 date indicator)
            "2024-01-15T",
        ];
        for input in accepted {
            assert!(is_iso_datetime(input));
        }

        let rejected = [
            "hello",
            "2024",
            "not-a-date",
            // Invalid ranges
            "2024-13-15",
            "2024-00-15",
            "2024-01-00",
            "2024-01-32",
            // Space is a valid ISO 8601 separator but we reject it to avoid
            // false positives on sentences that start with a date.
            "2024-01-15 10:30:00",
            "2024-01-15 is the deadline",
            "2024-01-15abc",
            "2024-01-15.",
        ];
        for input in rejected {
            assert!(!is_iso_datetime(input));
        }
    }
}