Skip to main content

schemaorg_rs/
graph.rs

1//! Unified structured data graph combining all extraction formats.
2//!
3//! This module provides the primary entry point [`extract_all`] which runs
4//! all three extractors (JSON-LD, Microdata, `RDFa` Lite) against an HTML
5//! document and merges the results into a single [`StructuredDataGraph`].
6//!
7//! # Pipeline
8//!
9//! 1. Parse the HTML once using `scraper::Html`
10//! 2. Run each extractor against the parsed DOM
11//! 3. Merge all nodes and warnings into a single graph
12//! 4. Individual extractor failures are captured as warnings (not errors)
13//!
14//! # Examples
15//!
16//! ```
17//! use schemaorg_rs::extract_all;
18//!
19//! let html = r#"<html><head>
20//! <script type="application/ld+json">{
21//!   "@context": "https://schema.org",
22//!   "@type": "Product",
23//!   "name": "Widget"
24//! }</script>
25//! </head></html>"#;
26//!
27//! let graph = extract_all(html).unwrap();
28//! assert_eq!(graph.nodes.len(), 1);
29//! assert_eq!(graph.nodes[0].types, vec!["Product"]);
30//! assert!(graph.warnings.is_empty());
31//! ```
32
33use serde::{Deserialize, Serialize};
34
35use crate::error::{ExtractionError, ExtractionWarning, WarningCode};
36use crate::extraction::{ExtractionOutput, JsonLdExtractor, MicrodataExtractor, RdfaLiteExtractor};
37use crate::types::SchemaNode;
38
39/// A unified graph of all structured data extracted from an HTML document.
40///
41/// Combines results from JSON-LD, Microdata, and `RDFa` Lite extractors.
42/// Each node retains its [`SourceFormat`](crate::types::SourceFormat) so
43/// callers can distinguish which markup produced it.
44///
45/// # Examples
46///
47/// ```
48/// use schemaorg_rs::extract_all;
49///
50/// let graph = extract_all("<html></html>").unwrap();
51/// assert!(graph.nodes.is_empty());
52/// assert!(graph.warnings.is_empty());
53/// ```
54#[must_use]
55#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
56pub struct StructuredDataGraph {
57    /// All extracted structured data nodes.
58    pub nodes: Vec<SchemaNode>,
59    /// Non-fatal warnings from all extractors.
60    pub warnings: Vec<ExtractionWarning>,
61}
62
63/// Extracts all structured data from an HTML document.
64///
65/// Runs JSON-LD, Microdata, and `RDFa` Lite extractors and merges the
66/// results into a single [`StructuredDataGraph`].
67///
68/// Individual extractor failures are captured as warnings; only truly
69/// fatal errors (e.g. inability to parse HTML) propagate as errors.
70///
71/// # Errors
72///
73/// Returns [`ExtractionError::Internal`] if a fatal, unrecoverable error
74/// occurs during HTML parsing. In practice this function is infallible:
75/// individual format failures are captured as
76/// [`WarningCode::ExtractorFailed`] warnings.
77///
78/// # Examples
79///
80/// ```
81/// use schemaorg_rs::extract_all;
82///
83/// let html = r#"<html><head>
84/// <script type="application/ld+json">{
85/// "@context": "https://schema.org",
86/// "@type": "Product",
87/// "name": "Widget"
88/// }</script>
89/// </head></html>"#;
90///
91/// let graph = extract_all(html).unwrap();
92/// assert_eq!(graph.nodes.len(), 1);
93/// assert_eq!(graph.nodes[0].types, vec!["Product"]);
94/// ```
95pub fn extract_all(html: &str) -> Result<StructuredDataGraph, ExtractionError> {
96    let document = scraper::Html::parse_document(html);
97
98    let mut nodes = Vec::new();
99    let mut warnings = Vec::new();
100
101    // JSON-LD needs both the parsed document and the raw HTML string
102    // for source-location computation (byte offsets of <script> tags).
103    collect_or_warn(
104        JsonLdExtractor.extract_from_document(&document, html),
105        &mut nodes,
106        &mut warnings,
107    );
108
109    // Microdata and RDFa only need the parsed document.
110    collect_or_warn(
111        MicrodataExtractor.extract_from_document(&document),
112        &mut nodes,
113        &mut warnings,
114    );
115    collect_or_warn(
116        RdfaLiteExtractor.extract_from_document(&document),
117        &mut nodes,
118        &mut warnings,
119    );
120
121    Ok(StructuredDataGraph { nodes, warnings })
122}
123
124/// Merges extractor output or captures failures as warnings.
125fn collect_or_warn(
126    result: Result<ExtractionOutput, ExtractionError>,
127    nodes: &mut Vec<SchemaNode>,
128    warnings: &mut Vec<ExtractionWarning>,
129) {
130    match result {
131        Ok(output) => {
132            nodes.extend(output.nodes);
133            warnings.extend(output.warnings);
134        }
135        Err(e) => {
136            warnings.push(ExtractionWarning {
137                message: format!("extractor failed: {e}"),
138                source_location: None,
139                code: WarningCode::ExtractorFailed,
140            });
141        }
142    }
143}
144
#[cfg(test)]
mod tests {
    use pretty_assertions::assert_eq;

    use super::*;
    use crate::types::{SchemaValue, SourceFormat};

    /// A lone JSON-LD `Product` script yields exactly one node tagged as
    /// JSON-LD, with its `name` property preserved as text.
    #[test]
    fn extract_all_jsonld() {
        let html = r#"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@type": "Product",
  "name": "Test"
}</script></head></html>"#;

        let graph = extract_all(html).expect("extraction failed");
        assert_eq!(graph.nodes.len(), 1);

        let product = &graph.nodes[0];
        assert_eq!(product.types, vec!["Product"]);
        assert_eq!(product.source_format, SourceFormat::JsonLd);

        let expected_name = vec![SchemaValue::Text("Test".into())];
        assert_eq!(product.properties["name"], expected_name);
    }

    /// A document with no structured data produces an empty graph and no
    /// warnings.
    #[test]
    fn extract_all_empty_html() {
        let graph = extract_all("<html></html>").expect("extraction failed");
        assert!(graph.nodes.is_empty());
        assert!(graph.warnings.is_empty());
    }
}
176}