webpage_info/
schema_org.rs

1//! Schema.org structured data extraction
2//!
3//! Parses [Schema.org](https://schema.org/) JSON-LD structured data from HTML documents.
4
5use serde::{Deserialize, Serialize};
6use serde_json::Value;
7
8/// Schema.org structured data item.
9///
10/// Schema.org provides a collection of shared vocabularies that webmasters can use
11/// to mark up their pages in ways that can be understood by major search engines.
12#[derive(Debug, Clone, Serialize, Deserialize)]
13pub struct SchemaOrg {
14    /// The @type of the schema (e.g., "Article", "Product", "Organization")
15    pub schema_type: String,
16
17    /// The full JSON-LD value containing all properties
18    pub value: Value,
19}
20
21impl SchemaOrg {
22    /// Parse Schema.org data from a JSON-LD string.
23    ///
24    /// Returns a vector of SchemaOrg items found in the JSON-LD content.
25    /// Handles both single objects and arrays, as well as @graph structures.
26    pub fn parse(content: &str) -> Vec<Self> {
27        let Ok(node) = serde_json::from_str::<Value>(content) else {
28            return Vec::new();
29        };
30
31        Self::extract_from_value(node)
32    }
33
34    /// Extract Schema.org items from a parsed JSON value.
35    fn extract_from_value(node: Value) -> Vec<Self> {
36        // Convert single object to array for uniform handling, taking ownership
37        let values = match node {
38            Value::Array(arr) => arr,
39            Value::Object(mut obj) => {
40                // Check for @graph structure - take ownership instead of cloning
41                if let Some(Value::Array(graph)) = obj.remove("@graph") {
42                    graph
43                } else {
44                    vec![Value::Object(obj)]
45                }
46            }
47            _ => return Vec::new(),
48        };
49
50        values
51            .into_iter()
52            .filter_map(|v| {
53                let schema_type = match &v["@type"] {
54                    Value::String(s) => s.clone(),
55                    Value::Array(arr) => {
56                        // Handle multiple types - take the first one
57                        arr.first()
58                            .and_then(|v| v.as_str())
59                            .map(|s| s.to_string())?
60                    }
61                    _ => return None,
62                };
63
64                Some(SchemaOrg {
65                    schema_type,
66                    value: v,
67                })
68            })
69            .collect()
70    }
71
72    /// Get a property value from the schema as a string.
73    pub fn get_str(&self, key: &str) -> Option<&str> {
74        self.value.get(key).and_then(|v| v.as_str())
75    }
76
77    /// Get a property value from the schema as an i64.
78    pub fn get_i64(&self, key: &str) -> Option<i64> {
79        self.value.get(key).and_then(|v| v.as_i64())
80    }
81
82    /// Get a property value from the schema as a nested object.
83    pub fn get_object(&self, key: &str) -> Option<&Value> {
84        self.value.get(key).filter(|v| v.is_object())
85    }
86
87    /// Get a property value from the schema as an array.
88    pub fn get_array(&self, key: &str) -> Option<&Vec<Value>> {
89        self.value.get(key).and_then(|v| v.as_array())
90    }
91}
92
/// Unit tests for JSON-LD parsing and the typed property accessors.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_empty_object() {
        let schema = SchemaOrg::parse("{}");
        assert!(schema.is_empty());
    }

    #[test]
    fn test_invalid_json() {
        let schema = SchemaOrg::parse("not json");
        assert!(schema.is_empty());
    }

    // Valid JSON that is neither an object nor an array holds no items.
    #[test]
    fn test_scalar_json() {
        assert!(SchemaOrg::parse("42").is_empty());
        assert!(SchemaOrg::parse("null").is_empty());
        assert!(SchemaOrg::parse(r#""Article""#).is_empty());
    }

    #[test]
    fn test_single_type() {
        let schema = SchemaOrg::parse(r#"{"@type": "NewsArticle", "headline": "Test"}"#);
        assert_eq!(schema.len(), 1);
        assert_eq!(schema[0].schema_type, "NewsArticle");
        assert_eq!(schema[0].get_str("headline"), Some("Test"));
        // The stored value keeps the `@type` key alongside the other properties.
        assert_eq!(schema[0].get_str("@type"), Some("NewsArticle"));
    }

    #[test]
    fn test_array_of_types() {
        let schema = SchemaOrg::parse(r#"[{"@type": "Article"}, {"@type": "WebPage"}]"#);
        assert_eq!(schema.len(), 2);
        assert_eq!(schema[0].schema_type, "Article");
        assert_eq!(schema[1].schema_type, "WebPage");
    }

    // Nodes without `@type` are dropped; typed siblings are still returned.
    #[test]
    fn test_untyped_items_are_skipped() {
        let schema = SchemaOrg::parse(r#"[{"name": "Untyped"}, {"@type": "Thing"}]"#);
        assert_eq!(schema.len(), 1);
        assert_eq!(schema[0].schema_type, "Thing");
    }

    #[test]
    fn test_graph_structure() {
        let json = r#"{
            "@context": "https://schema.org",
            "@graph": [
                {"@type": "Organization", "name": "Example"},
                {"@type": "WebSite", "url": "https://example.org"}
            ]
        }"#;
        let schema = SchemaOrg::parse(json);
        assert_eq!(schema.len(), 2);
        assert_eq!(schema[0].schema_type, "Organization");
        assert_eq!(schema[0].get_str("name"), Some("Example"));
        assert_eq!(schema[1].schema_type, "WebSite");
    }

    #[test]
    fn test_graph_skips_untyped_nodes() {
        let schema =
            SchemaOrg::parse(r#"{"@graph": [{"name": "Untyped"}, {"@type": "Person"}]}"#);
        assert_eq!(schema.len(), 1);
        assert_eq!(schema[0].schema_type, "Person");
    }

    #[test]
    fn test_multiple_types() {
        let schema = SchemaOrg::parse(r#"{"@type": ["Article", "BlogPosting"]}"#);
        assert_eq!(schema.len(), 1);
        assert_eq!(schema[0].schema_type, "Article");
    }

    // `@type` must be a string, or an array whose first element is a string.
    #[test]
    fn test_unusable_type_values() {
        assert!(SchemaOrg::parse(r#"{"@type": 7}"#).is_empty());
        assert!(SchemaOrg::parse(r#"{"@type": []}"#).is_empty());
        assert!(SchemaOrg::parse(r#"{"@type": [7, "Article"]}"#).is_empty());
    }

    #[test]
    fn test_helper_methods() {
        let schema = SchemaOrg::parse(
            r#"{
            "@type": "Product",
            "name": "Widget",
            "price": 99,
            "offers": {"@type": "Offer"},
            "images": ["a.jpg", "b.jpg"]
        }"#,
        );

        let product = &schema[0];
        assert_eq!(product.get_str("name"), Some("Widget"));
        assert_eq!(product.get_i64("price"), Some(99));
        assert!(product.get_object("offers").is_some());
        assert_eq!(product.get_array("images").map(|a| a.len()), Some(2));
    }

    // Each accessor yields `None` for a missing key or a value of another JSON type.
    #[test]
    fn test_helper_type_mismatch() {
        let schema = SchemaOrg::parse(
            r#"{
            "@type": "Product",
            "name": "Widget",
            "price": 99,
            "rating": 4.5,
            "offers": {"@type": "Offer"},
            "images": ["a.jpg", "b.jpg"]
        }"#,
        );

        let product = &schema[0];
        assert_eq!(product.get_str("missing"), None);
        assert_eq!(product.get_str("price"), None);
        assert_eq!(product.get_i64("name"), None);
        assert_eq!(product.get_i64("rating"), None);
        assert!(product.get_object("images").is_none());
        assert!(product.get_array("offers").is_none());
    }
}