schemaorg_rs/graph.rs
1//! Unified structured data graph combining all extraction formats.
2//!
3//! This module provides the primary entry point [`extract_all`] which runs
4//! all three extractors (JSON-LD, Microdata, `RDFa` Lite) against an HTML
5//! document and merges the results into a single [`StructuredDataGraph`].
6//!
7//! # Pipeline
8//!
9//! 1. Parse the HTML once using `scraper::Html`
10//! 2. Run each extractor against the parsed DOM
11//! 3. Merge all nodes and warnings into a single graph
12//! 4. Individual extractor failures are captured as warnings (not errors)
13//!
14//! # Examples
15//!
16//! ```
17//! use schemaorg_rs::extract_all;
18//!
19//! let html = r#"<html><head>
20//! <script type="application/ld+json">{
21//! "@context": "https://schema.org",
22//! "@type": "Product",
23//! "name": "Widget"
24//! }</script>
25//! </head></html>"#;
26//!
27//! let graph = extract_all(html).unwrap();
28//! assert_eq!(graph.nodes.len(), 1);
29//! assert_eq!(graph.nodes[0].types, vec!["Product"]);
30//! assert!(graph.warnings.is_empty());
31//! ```
32
33use serde::{Deserialize, Serialize};
34
35use crate::error::{ExtractionError, ExtractionWarning, WarningCode};
36use crate::extraction::{ExtractionOutput, JsonLdExtractor, MicrodataExtractor, RdfaLiteExtractor};
37use crate::types::SchemaNode;
38
39/// A unified graph of all structured data extracted from an HTML document.
40///
41/// Combines results from JSON-LD, Microdata, and `RDFa` Lite extractors.
42/// Each node retains its [`SourceFormat`](crate::types::SourceFormat) so
43/// callers can distinguish which markup produced it.
44///
45/// # Examples
46///
47/// ```
48/// use schemaorg_rs::extract_all;
49///
50/// let graph = extract_all("<html></html>").unwrap();
51/// assert!(graph.nodes.is_empty());
52/// assert!(graph.warnings.is_empty());
53/// ```
54#[must_use]
55#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
56pub struct StructuredDataGraph {
57 /// All extracted structured data nodes.
58 pub nodes: Vec<SchemaNode>,
59 /// Non-fatal warnings from all extractors.
60 pub warnings: Vec<ExtractionWarning>,
61}
62
63/// Extracts all structured data from an HTML document.
64///
65/// Runs JSON-LD, Microdata, and `RDFa` Lite extractors and merges the
66/// results into a single [`StructuredDataGraph`].
67///
68/// Individual extractor failures are captured as warnings; only truly
69/// fatal errors (e.g. inability to parse HTML) propagate as errors.
70///
71/// # Errors
72///
73/// Returns [`ExtractionError::Internal`] if a fatal, unrecoverable error
74/// occurs during HTML parsing. In practice this function is infallible:
75/// individual format failures are captured as
76/// [`WarningCode::ExtractorFailed`] warnings.
77///
78/// # Examples
79///
80/// ```
81/// use schemaorg_rs::extract_all;
82///
83/// let html = r#"<html><head>
84/// <script type="application/ld+json">{
85/// "@context": "https://schema.org",
86/// "@type": "Product",
87/// "name": "Widget"
88/// }</script>
89/// </head></html>"#;
90///
91/// let graph = extract_all(html).unwrap();
92/// assert_eq!(graph.nodes.len(), 1);
93/// assert_eq!(graph.nodes[0].types, vec!["Product"]);
94/// ```
95pub fn extract_all(html: &str) -> Result<StructuredDataGraph, ExtractionError> {
96 let document = scraper::Html::parse_document(html);
97
98 let mut nodes = Vec::new();
99 let mut warnings = Vec::new();
100
101 // JSON-LD needs both the parsed document and the raw HTML string
102 // for source-location computation (byte offsets of <script> tags).
103 collect_or_warn(
104 JsonLdExtractor.extract_from_document(&document, html),
105 &mut nodes,
106 &mut warnings,
107 );
108
109 // Microdata and RDFa only need the parsed document.
110 collect_or_warn(
111 MicrodataExtractor.extract_from_document(&document),
112 &mut nodes,
113 &mut warnings,
114 );
115 collect_or_warn(
116 RdfaLiteExtractor.extract_from_document(&document),
117 &mut nodes,
118 &mut warnings,
119 );
120
121 Ok(StructuredDataGraph { nodes, warnings })
122}
123
124/// Merges extractor output or captures failures as warnings.
125fn collect_or_warn(
126 result: Result<ExtractionOutput, ExtractionError>,
127 nodes: &mut Vec<SchemaNode>,
128 warnings: &mut Vec<ExtractionWarning>,
129) {
130 match result {
131 Ok(output) => {
132 nodes.extend(output.nodes);
133 warnings.extend(output.warnings);
134 }
135 Err(e) => {
136 warnings.push(ExtractionWarning {
137 message: format!("extractor failed: {e}"),
138 source_location: None,
139 code: WarningCode::ExtractorFailed,
140 });
141 }
142 }
143}
144
#[cfg(test)]
mod tests {
    use pretty_assertions::assert_eq;

    use super::*;
    use crate::types::{SchemaValue, SourceFormat};

    /// A single JSON-LD `Product` block yields exactly one node whose type,
    /// source format, and `name` property match the markup.
    #[test]
    fn extract_all_jsonld() {
        let html = r#"<html><head><script type="application/ld+json">{
 "@context": "https://schema.org",
 "@type": "Product",
 "name": "Test"
}</script></head></html>"#;

        let StructuredDataGraph { nodes, .. } = extract_all(html).expect("extraction failed");
        assert_eq!(nodes.len(), 1);

        let product = &nodes[0];
        assert_eq!(product.types, vec!["Product"]);
        assert_eq!(product.source_format, SourceFormat::JsonLd);
        assert_eq!(
            product.properties["name"],
            vec![SchemaValue::Text("Test".into())]
        );
    }

    /// A document with no structured data produces neither nodes nor
    /// warnings.
    #[test]
    fn extract_all_empty_html() {
        let StructuredDataGraph { nodes, warnings } =
            extract_all("<html></html>").expect("extraction failed");
        assert!(nodes.is_empty());
        assert!(warnings.is_empty());
    }
}