use serde::{Deserialize, Serialize};
use crate::error::{ExtractionError, ExtractionWarning, WarningCode};
use crate::extraction::{ExtractionOutput, JsonLdExtractor, MicrodataExtractor, RdfaLiteExtractor};
use crate::types::SchemaNode;
/// Combined result of running the structured-data extractors over one HTML
/// document.
///
/// Values of this type are produced by `extract_all`, which fills both
/// vectors by appending the output of each extractor in turn.
#[must_use]
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct StructuredDataGraph {
/// Schema nodes gathered from the extractors, in the order they were
/// appended.
pub nodes: Vec<SchemaNode>,
/// Warnings reported by the extractors, plus one entry for each extractor
/// whose run returned an error.
pub warnings: Vec<ExtractionWarning>,
}
/// Extracts structured data from `html` using every supported format.
///
/// The markup is parsed once into a document, and the JSON-LD, Microdata and
/// RDFa Lite extractors are then run against it in that order. Nodes and
/// warnings from each extractor are appended to the returned graph in the
/// same order.
///
/// # Errors
///
/// As written, this function always returns `Ok`: an extractor that fails is
/// recorded as a warning in the graph instead of aborting the whole run. The
/// `Result` return type is part of the public signature and is kept as is.
pub fn extract_all(html: &str) -> Result<StructuredDataGraph, ExtractionError> {
    let document = scraper::Html::parse_document(html);
    let mut graph = StructuredDataGraph {
        nodes: Vec::new(),
        warnings: Vec::new(),
    };

    // The JSON-LD extractor is the only one that also receives the raw source
    // text alongside the parsed document.
    let json_ld = JsonLdExtractor.extract_from_document(&document, html);
    collect_or_warn(json_ld, &mut graph.nodes, &mut graph.warnings);

    let microdata = MicrodataExtractor.extract_from_document(&document);
    collect_or_warn(microdata, &mut graph.nodes, &mut graph.warnings);

    let rdfa_lite = RdfaLiteExtractor.extract_from_document(&document);
    collect_or_warn(rdfa_lite, &mut graph.nodes, &mut graph.warnings);

    Ok(graph)
}
/// Folds the outcome of one extractor run into the shared accumulators.
///
/// On success, the extractor's nodes and warnings are appended to `nodes` and
/// `warnings`. On failure, nothing is added to `nodes`; instead a single
/// `WarningCode::ExtractorFailed` warning carrying the error's display text
/// (and no source location) is pushed onto `warnings`.
fn collect_or_warn(
    result: Result<ExtractionOutput, ExtractionError>,
    nodes: &mut Vec<SchemaNode>,
    warnings: &mut Vec<ExtractionWarning>,
) {
    // Deal with the failure case first so the success path stays unnested.
    let output = match result {
        Ok(output) => output,
        Err(e) => {
            let failure = ExtractionWarning {
                message: format!("extractor failed: {e}"),
                source_location: None,
                code: WarningCode::ExtractorFailed,
            };
            warnings.push(failure);
            return;
        }
    };

    nodes.extend(output.nodes);
    warnings.extend(output.warnings);
}
#[cfg(test)]
mod tests {
    use pretty_assertions::assert_eq;

    use super::*;
    use crate::types::{SchemaValue, SourceFormat};

    /// A page with one JSON-LD `Product` script yields exactly one node that
    /// carries the expected type, source format and `name` property.
    #[test]
    fn extract_all_jsonld() {
        // The fixture's inner lines stay flush left: they are part of the raw
        // string, so indenting them would change the input.
        let html = r#"<html><head><script type="application/ld+json">{
"@context": "https://schema.org",
"@type": "Product",
"name": "Test"
}</script></head></html>"#;

        let graph = extract_all(html).expect("extraction failed");
        assert_eq!(graph.nodes.len(), 1);

        let product = &graph.nodes[0];
        assert_eq!(product.types, vec!["Product"]);
        assert_eq!(product.source_format, SourceFormat::JsonLd);
        assert_eq!(
            product.properties["name"],
            vec![SchemaValue::Text("Test".into())]
        );
    }

    /// A document with no structured data produces neither nodes nor warnings.
    #[test]
    fn extract_all_empty_html() {
        let StructuredDataGraph { nodes, warnings } =
            extract_all("<html></html>").expect("extraction failed");

        assert!(nodes.is_empty());
        assert!(warnings.is_empty());
    }
}