Skip to main content

schemaorg_rs/extraction/
jsonld.rs

1//! JSON-LD extractor: parses `<script type="application/ld+json">` tags.
2//!
3//! Implements a purpose-built Schema.org JSON-LD parser using `serde_json`
4//! instead of the full `json-ld` crate. This avoids 300+ transitive dependencies
5//! and async requirements while covering >99% of real-world Schema.org usage.
6//!
7//! ## Supported features
8//!
9//! - `@context: "https://schema.org"` (string or array)
10//! - `@type` as string or array
11//! - `@graph` arrays
12//! - `@id` cross-reference resolution (within-document)
13//! - Nested objects
14//!
15//! ## Not supported
16//!
17//! - Remote `@context` fetching
18//! - `@context` term definitions (e.g. `{"cat": "schema:category"}`)
19//! - JSON-LD framing, `@reverse`
20
21use std::collections::{HashMap, HashSet};
22use std::sync::OnceLock;
23
24use indexmap::IndexMap;
25use scraper::{Html, Selector};
26use serde_json::Value;
27
28use crate::error::{ExtractionError, ExtractionWarning, WarningCode};
29use crate::types::{SchemaNode, SchemaValue, SourceFormat, SourceLocation};
30
31use super::{classify_text_value, strip_schema_prefix, ExtractionOutput, Extractor};
32
/// Deepest level of JSON-LD object nesting that is converted.
///
/// Objects nested beyond this level are skipped with a warning.
const MAX_DEPTH: usize = 20;

/// Deepest level reached while resolving `@id` cross-references.
///
/// Deliberately smaller than `MAX_DEPTH`: a single `@id` may be
/// referenced from many places in the tree, so a tighter limit keeps
/// the amplification bounded.
const MAX_REF_DEPTH: usize = 10;

/// Upper bound on `@id` reference resolutions performed per document.
///
/// Every resolution clones the referenced node, so this caps the total
/// memory amplification when many references target one large node.
const MAX_REF_RESOLUTIONS: usize = 50;
47
/// Extractor for Schema.org data embedded as JSON-LD.
///
/// Reads every `<script type="application/ld+json">` element of an HTML
/// document and converts its JSON payload into schema nodes. The type is
/// a stateless unit struct, so it can be used directly as a value.
///
/// # Examples
///
/// ```
/// use schemaorg_rs::extraction::{Extractor, JsonLdExtractor};
///
/// let html = r#"<html><head>
/// <script type="application/ld+json">{
/// "@context": "https://schema.org",
/// "@type": "Product",
/// "name": "Widget"
/// }</script>
/// </head></html>"#;
///
/// let output = JsonLdExtractor.extract(html).unwrap();
/// assert_eq!(output.nodes[0].types, vec!["Product"]);
/// ```
pub struct JsonLdExtractor;
67
68impl Extractor for JsonLdExtractor {
69    fn extract(&self, html: &str) -> Result<ExtractionOutput, ExtractionError> {
70        let document = Html::parse_document(html);
71        self.extract_from_document(&document, html)
72    }
73}
74
75impl JsonLdExtractor {
76    /// Extracts from an already-parsed document.
77    ///
78    /// The raw `html` string is needed for source-location computation
79    /// (finding byte offsets of `<script>` tags).
80    ///
81    /// # Errors
82    ///
83    /// Returns [`ExtractionError`] if a fatal error prevents extraction.
84    /// JSON parse failures are captured as warnings, not errors.
85    ///
86    /// # Panics
87    ///
88    /// Panics if the internal CSS selector constant fails to parse.
89    /// This is a compile-time-verified string and will never fail.
90    pub fn extract_from_document(
91        &self,
92        document: &Html,
93        html: &str,
94    ) -> Result<ExtractionOutput, ExtractionError> {
95        static SELECTOR: OnceLock<Selector> = OnceLock::new();
96        let selector = SELECTOR.get_or_init(|| {
97            Selector::parse("script[type=\"application/ld+json\"]")
98                .expect("static JSON-LD selector must parse")
99        });
100
101        let line_index = LineIndex::new(html);
102        let script_offsets = find_script_byte_offsets(html);
103
104        let mut all_nodes = Vec::new();
105        let mut warnings = Vec::new();
106
107        for (idx, element) in document.select(selector).enumerate() {
108            let json_text = element.inner_html();
109            let trimmed = json_text.trim();
110            let source_location = script_offsets
111                .get(idx)
112                .map(|&offset| line_index.location(offset));
113
114            if trimmed.is_empty() {
115                warnings.push(ExtractionWarning {
116                    message: "empty JSON-LD script tag".into(),
117                    source_location,
118                    code: WarningCode::MalformedJsonLd,
119                });
120                continue;
121            }
122
123            let value: Value = match serde_json::from_str(trimmed) {
124                Ok(v) => v,
125                Err(e) => {
126                    warnings.push(ExtractionWarning {
127                        message: format!("failed to parse JSON-LD: {e}"),
128                        source_location,
129                        code: WarningCode::MalformedJsonLd,
130                    });
131                    continue;
132                }
133            };
134
135            let items = extract_json_items(&value, source_location.as_ref(), &mut warnings);
136            all_nodes.extend(items);
137        }
138
139        // Build @id -> index map (lightweight, no node cloning).
140        // First definition wins: later duplicates emit a warning but
141        // do not overwrite the original entry.
142        let mut id_to_index: HashMap<String, usize> = HashMap::new();
143        for (i, node) in all_nodes.iter().enumerate() {
144            if let Some(id) = node.id() {
145                match id_to_index.entry(id.to_owned()) {
146                    std::collections::hash_map::Entry::Occupied(_) => {
147                        warnings.push(ExtractionWarning {
148                            message: format!("duplicate @id: {id}"),
149                            source_location: node.source_location.clone(),
150                            code: WarningCode::DuplicateId,
151                        });
152                    }
153                    std::collections::hash_map::Entry::Vacant(entry) => {
154                        entry.insert(i);
155                    }
156                }
157            }
158        }
159
160        // Clone only nodes that are actually referenced (lazy)
161        let referenced = collect_referenced_ids(&all_nodes);
162        let id_map: HashMap<String, SchemaNode> = referenced
163            .iter()
164            .filter_map(|id| {
165                let &idx = id_to_index.get(id.as_str())?;
166                Some((id.clone(), all_nodes[idx].clone()))
167            })
168            .collect();
169
170        // Resolve @id cross-references
171        resolve_references(&mut all_nodes, &id_map, &mut warnings);
172
173        Ok(ExtractionOutput {
174            nodes: all_nodes,
175            warnings,
176        })
177    }
178}
179
180// JSON -> SchemaNode conversion
181/// Extracts top-level Schema.org items from a parsed JSON value.
182fn extract_json_items(
183    value: &Value,
184    source_location: Option<&SourceLocation>,
185    warnings: &mut Vec<ExtractionWarning>,
186) -> Vec<SchemaNode> {
187    match value {
188        Value::Array(items) => items
189            .iter()
190            .filter_map(|item| json_to_node(item, None, source_location, warnings, 0))
191            .collect(),
192
193        Value::Object(map) => {
194            if let Some(Value::Array(graph_items)) = map.get("@graph") {
195                let context = map.get("@context");
196                graph_items
197                    .iter()
198                    .filter_map(|item| json_to_node(item, context, source_location, warnings, 0))
199                    .collect()
200            } else {
201                json_to_node(value, None, source_location, warnings, 0)
202                    .into_iter()
203                    .collect()
204            }
205        }
206
207        _ => {
208            warnings.push(ExtractionWarning {
209                message: "JSON-LD root must be an object or array".into(),
210                source_location: source_location.cloned(),
211                code: WarningCode::MalformedJsonLd,
212            });
213            Vec::new()
214        }
215    }
216}
217
218/// Converts a JSON object to a [`SchemaNode`].
219///
220/// `parent_context` is the `@context` inherited from a `@graph` wrapper.
221fn json_to_node(
222    value: &Value,
223    parent_context: Option<&Value>,
224    source_location: Option<&SourceLocation>,
225    warnings: &mut Vec<ExtractionWarning>,
226    depth: usize,
227) -> Option<SchemaNode> {
228    if depth > MAX_DEPTH {
229        warnings.push(ExtractionWarning {
230            message: format!("JSON-LD nesting depth exceeds {MAX_DEPTH}, skipping"),
231            source_location: source_location.cloned(),
232            code: WarningCode::MalformedJsonLd,
233        });
234        return None;
235    }
236    let obj = value.as_object()?;
237
238    // Resolve @context: local overrides parent
239    let context = obj.get("@context").or(parent_context);
240
241    // Extract @type
242    let types = extract_types(obj);
243
244    // Warn if no @type and this isn't a pure @id reference
245    if types.is_empty() {
246        let non_meta_keys = obj.keys().filter(|k| !k.starts_with('@')).count();
247        let is_reference = obj.contains_key("@id") && non_meta_keys == 0;
248        if !is_reference && !obj.is_empty() {
249            warnings.push(ExtractionWarning {
250                message: "JSON-LD object has no @type".into(),
251                source_location: source_location.cloned(),
252                code: WarningCode::EmptyType,
253            });
254        }
255    }
256
257    // Build properties
258    let mut properties: IndexMap<String, Vec<SchemaValue>> = IndexMap::new();
259
260    for (key, val) in obj {
261        if key == "@context" || key == "@type" {
262            continue;
263        }
264
265        if key == "@id" {
266            if let Value::String(id) = val {
267                properties
268                    .entry(key.clone())
269                    .or_default()
270                    .push(classify_text_value(id));
271            }
272            continue;
273        }
274
275        let values = json_to_schema_values(val, context, source_location, warnings, depth);
276        if !values.is_empty() {
277            properties.entry(key.clone()).or_default().extend(values);
278        }
279    }
280
281    Some(SchemaNode {
282        types,
283        properties,
284        source_format: SourceFormat::JsonLd,
285        source_location: source_location.cloned(),
286    })
287}
288
289/// Extract `@type` from a JSON-LD object, stripping Schema.org prefixes.
290fn extract_types(obj: &serde_json::Map<String, Value>) -> Vec<String> {
291    match obj.get("@type") {
292        Some(Value::String(t)) => vec![strip_schema_prefix(t).into_owned()],
293        Some(Value::Array(arr)) => arr
294            .iter()
295            .filter_map(|v| v.as_str())
296            .map(|s| strip_schema_prefix(s).into_owned())
297            .collect(),
298        _ => Vec::new(),
299    }
300}
301
302/// Converts a JSON value into [`SchemaValue`]s.
303fn json_to_schema_values(
304    value: &Value,
305    context: Option<&Value>,
306    source_location: Option<&SourceLocation>,
307    warnings: &mut Vec<ExtractionWarning>,
308    depth: usize,
309) -> Vec<SchemaValue> {
310    match value {
311        Value::Null => Vec::new(),
312        Value::Bool(b) => vec![SchemaValue::Boolean(*b)],
313        Value::Number(n) => n
314            .as_f64()
315            .map(|f| vec![SchemaValue::Number(f)])
316            .unwrap_or_default(),
317        Value::String(s) => vec![classify_text_value(s)],
318        Value::Array(arr) => arr
319            .iter()
320            .flat_map(|v| json_to_schema_values(v, context, source_location, warnings, depth))
321            .collect(),
322        Value::Object(_) => json_to_node(value, context, source_location, warnings, depth + 1)
323            .map(|node| vec![SchemaValue::Node(Box::new(node))])
324            .unwrap_or_default(),
325    }
326}
327
328// @id cross-reference resolution
329/// Resolves `{"@id": "..."}` references throughout the node tree.
330///
331/// Total resolutions are capped at [`MAX_REF_RESOLUTIONS`] to prevent
332/// memory amplification from many references to the same large node.
333fn resolve_references(
334    nodes: &mut [SchemaNode],
335    id_map: &HashMap<String, SchemaNode>,
336    warnings: &mut Vec<ExtractionWarning>,
337) {
338    let mut resolution_count: usize = 0;
339    for node in nodes.iter_mut() {
340        resolve_node_refs(node, id_map, warnings, 0, &mut resolution_count);
341    }
342}
343
344/// Recursively resolves references within a single node.
345///
346/// Depth is limited to [`MAX_REF_DEPTH`] and total resolutions to
347/// [`MAX_REF_RESOLUTIONS`] to prevent unbounded amplification.
348fn resolve_node_refs(
349    node: &mut SchemaNode,
350    id_map: &HashMap<String, SchemaNode>,
351    warnings: &mut Vec<ExtractionWarning>,
352    depth: usize,
353    resolution_count: &mut usize,
354) {
355    if depth > MAX_REF_DEPTH {
356        return;
357    }
358
359    for values in node.properties.values_mut() {
360        for value in values.iter_mut() {
361            if let SchemaValue::Node(inner) = value {
362                // Is this a pure @id reference? (no types, only @-prefixed keys)
363                if inner.types.is_empty() {
364                    if let Some(id_values) = inner.properties.get("@id") {
365                        if let Some(SchemaValue::Text(id)) = id_values.first() {
366                            if *resolution_count >= MAX_REF_RESOLUTIONS {
367                                continue;
368                            }
369                            if let Some(resolved) = id_map.get(id.as_str()) {
370                                let has_content =
371                                    !resolved.types.is_empty() || resolved.properties.len() > 1;
372                                if has_content {
373                                    *resolution_count += 1;
374                                    *value = SchemaValue::Node(Box::new(resolved.clone()));
375                                    if let SchemaValue::Node(ref mut n) = value {
376                                        resolve_node_refs(
377                                            n,
378                                            id_map,
379                                            warnings,
380                                            depth + 1,
381                                            resolution_count,
382                                        );
383                                    }
384                                    continue;
385                                }
386                            }
387                            // Only warn for fragment references (e.g. "#foo").
388                            // External @id URIs (e.g. "https://example.com/org/1")
389                            // are valid and should not trigger warnings.
390                            if id.starts_with('#') {
391                                warnings.push(ExtractionWarning {
392                                    message: format!("unresolvable @id reference: {id}"),
393                                    source_location: inner.source_location.clone(),
394                                    code: WarningCode::UnresolvableReference,
395                                });
396                            }
397                            continue;
398                        }
399                    }
400                }
401                // Recurse into non-reference nested nodes
402                resolve_node_refs(inner, id_map, warnings, depth + 1, resolution_count);
403            }
404        }
405    }
406}
407
408// Lazy @id reference collection
409/// Collects all `@id` values that appear as references (not definitions) in the node tree.
410///
411/// A reference is a `SchemaValue::Node` with no types and only an `@id` property.
412/// This is used to determine which nodes need to be cloned for resolution.
413fn collect_referenced_ids(nodes: &[SchemaNode]) -> HashSet<String> {
414    let mut refs = HashSet::new();
415    for node in nodes {
416        collect_refs_in_node(node, &mut refs, 0);
417    }
418    refs
419}
420
421/// Recursively collects `@id` reference strings from a node's properties.
422///
423/// Depth is limited to [`MAX_DEPTH`] to prevent unbounded recursion
424/// on pathological input.
425fn collect_refs_in_node(node: &SchemaNode, refs: &mut HashSet<String>, depth: usize) {
426    if depth > MAX_DEPTH {
427        return;
428    }
429    for values in node.properties.values() {
430        for value in values {
431            if let SchemaValue::Node(inner) = value {
432                if inner.types.is_empty() {
433                    if let Some(id_values) = inner.properties.get("@id") {
434                        if let Some(SchemaValue::Text(id)) = id_values.first() {
435                            refs.insert(id.clone());
436                            continue;
437                        }
438                    }
439                }
440                collect_refs_in_node(inner, refs, depth + 1);
441            }
442        }
443    }
444}
445
446// Source-location utilities
447/// Maps byte offsets to line/column positions.
448struct LineIndex {
449    line_starts: Vec<usize>,
450}
451
452impl LineIndex {
453    fn new(text: &str) -> Self {
454        let mut line_starts = vec![0];
455        for (i, byte) in text.bytes().enumerate() {
456            if byte == b'\n' {
457                line_starts.push(i + 1);
458            }
459        }
460        Self { line_starts }
461    }
462
463    fn location(&self, byte_offset: usize) -> SourceLocation {
464        let line = self
465            .line_starts
466            .partition_point(|&start| start <= byte_offset)
467            .saturating_sub(1);
468        let column = byte_offset.saturating_sub(self.line_starts[line]);
469        SourceLocation {
470            line: line + 1,
471            column: column + 1,
472            byte_offset,
473        }
474    }
475}
476
/// Finds the byte offsets of `<script … application/ld+json …>` start tags.
///
/// This is a textual heuristic over the raw HTML, not a parse. Offsets
/// are returned in document order and each points at the `<` of a tag.
/// Callers pair the n-th offset with the n-th matching DOM element, so
/// extra or missing entries shift every later source location.
///
/// An occurrence of `application/ld+json` is counted only when:
/// - the nearest preceding `<` starts a tag named exactly `script`
///   (ASCII case-insensitive), which rules out `<noscript …>` and tags
///   that merely mention "script" in an attribute; and
/// - no unquoted `>` lies between that `<` and the occurrence, i.e. the
///   match is inside the start tag and not in the script body (for
///   example a `"encodingFormat": "application/ld+json"` value).
fn find_script_byte_offsets(html: &str) -> Vec<usize> {
    const PATTERN: &str = "application/ld+json";
    const TAG_NAME: &[u8] = b"script";

    /// Returns `true` when `attrs` contains no `>` outside a quoted value.
    fn tag_still_open(attrs: &[u8]) -> bool {
        let mut open_quote: Option<u8> = None;
        for &byte in attrs {
            match open_quote {
                Some(quote) if byte == quote => open_quote = None,
                Some(_) => {}
                None if byte == b'"' || byte == b'\'' => open_quote = Some(byte),
                None if byte == b'>' => return false,
                None => {}
            }
        }
        true
    }

    let bytes = html.as_bytes();
    let mut offsets = Vec::new();
    let mut search_from = 0;

    while let Some(pos) = html[search_from..].find(PATTERN) {
        let abs_pos = search_from + pos;
        search_from = abs_pos + PATTERN.len();

        let Some(tag_start) = html[..abs_pos].rfind('<') else {
            continue;
        };

        // Everything between the `<` and the match, as bytes so that
        // slicing can never land on a UTF-8 character boundary.
        let after_bracket = &bytes[tag_start + 1..abs_pos];
        if after_bracket.len() <= TAG_NAME.len() {
            continue;
        }
        let (name, attrs) = after_bracket.split_at(TAG_NAME.len());

        let is_script_tag = name.eq_ignore_ascii_case(TAG_NAME)
            && (attrs[0].is_ascii_whitespace() || attrs[0] == b'/');

        if is_script_tag && tag_still_open(attrs) {
            offsets.push(tag_start);
        }
    }

    offsets
}
495
496#[cfg(test)]
497mod tests {
498    use pretty_assertions::assert_eq;
499
500    use super::*;
501
502    #[test]
503    fn line_index_positions() {
504        let idx = LineIndex::new("line1\nline2\nline3");
505        let loc = idx.location(0);
506        assert_eq!((loc.line, loc.column), (1, 1));
507        let loc = idx.location(6);
508        assert_eq!((loc.line, loc.column), (2, 1));
509        let loc = idx.location(8);
510        assert_eq!((loc.line, loc.column), (2, 3));
511    }
512
513    #[test]
514    fn find_script_offsets() {
515        let html =
516            r#"<html><script type="application/ld+json">{"@type":"Product"}</script></html>"#;
517        let offsets = find_script_byte_offsets(html);
518        assert_eq!(offsets.len(), 1);
519        assert!(html[offsets[0]..].starts_with("<script"));
520    }
521
522    #[test]
523    fn basic_product() {
524        let html = r#"<html><head><script type="application/ld+json">{
525  "@context": "https://schema.org",
526  "@type": "Product",
527  "name": "Example Product",
528  "url": "https://example.com/product"
529}</script></head></html>"#;
530
531        let out = JsonLdExtractor.extract(html).expect("extraction failed");
532        assert_eq!(out.nodes.len(), 1);
533        assert_eq!(out.nodes[0].types, vec!["Product"]);
534        assert_eq!(out.nodes[0].source_format, SourceFormat::JsonLd);
535        assert_eq!(
536            out.nodes[0].properties["name"],
537            vec![SchemaValue::Text("Example Product".into())]
538        );
539        assert_eq!(
540            out.nodes[0].properties["url"],
541            vec![SchemaValue::Url("https://example.com/product".into())]
542        );
543    }
544
545    #[test]
546    fn graph_extraction() {
547        let html = r#"<html><head><script type="application/ld+json">{
548  "@context": "https://schema.org",
549  "@graph": [
550    {"@type": "Organization", "name": "Acme"},
551    {"@type": "WebSite", "name": "Acme Site"}
552  ]
553}</script></head></html>"#;
554
555        let out = JsonLdExtractor.extract(html).expect("extraction failed");
556        assert_eq!(out.nodes.len(), 2);
557        assert_eq!(out.nodes[0].types, vec!["Organization"]);
558        assert_eq!(out.nodes[1].types, vec!["WebSite"]);
559    }
560
561    #[test]
562    fn array_type() {
563        let html = r#"<html><head><script type="application/ld+json">{
564  "@context": "https://schema.org",
565  "@type": ["Product", "IndividualProduct"],
566  "name": "Widget"
567}</script></head></html>"#;
568
569        let out = JsonLdExtractor.extract(html).expect("extraction failed");
570        assert_eq!(out.nodes[0].types, vec!["Product", "IndividualProduct"]);
571    }
572
573    #[test]
574    fn nested_object() {
575        let html = r#"<html><head><script type="application/ld+json">{
576  "@context": "https://schema.org",
577  "@type": "Product",
578  "name": "Widget",
579  "offers": {
580    "@type": "Offer",
581    "price": 19.99,
582    "priceCurrency": "USD"
583  }
584}</script></head></html>"#;
585
586        let out = JsonLdExtractor.extract(html).expect("extraction failed");
587        assert_eq!(out.nodes.len(), 1);
588        let offers = &out.nodes[0].properties["offers"];
589        assert_eq!(offers.len(), 1);
590        if let SchemaValue::Node(offer) = &offers[0] {
591            assert_eq!(offer.types, vec!["Offer"]);
592            assert_eq!(offer.properties["price"], vec![SchemaValue::Number(19.99)]);
593            assert_eq!(
594                offer.properties["priceCurrency"],
595                vec![SchemaValue::Text("USD".into())]
596            );
597        } else {
598            panic!("Expected nested Node");
599        }
600    }
601
602    #[test]
603    fn id_cross_reference() {
604        let html = r##"<html><head><script type="application/ld+json">{
605  "@context": "https://schema.org",
606  "@graph": [
607    {"@type": "Product", "name": "Widget", "offers": {"@id": "#offer1"}},
608    {"@id": "#offer1", "@type": "Offer", "price": 29.99}
609  ]
610}</script></head></html>"##;
611
612        let out = JsonLdExtractor.extract(html).expect("extraction failed");
613        assert_eq!(out.nodes.len(), 2);
614        let offers = &out.nodes[0].properties["offers"];
615        if let SchemaValue::Node(offer) = &offers[0] {
616            assert_eq!(offer.types, vec!["Offer"]);
617            assert_eq!(offer.properties["price"], vec![SchemaValue::Number(29.99)]);
618        } else {
619            panic!("Expected resolved Node, got {:?}", offers[0]);
620        }
621    }
622
623    #[test]
624    fn malformed_json_is_warning() {
625        let html =
626            r#"<html><head><script type="application/ld+json">{ invalid }</script></head></html>"#;
627        let out = JsonLdExtractor.extract(html).expect("extraction failed");
628        assert!(out.nodes.is_empty());
629        assert_eq!(out.warnings.len(), 1);
630        assert_eq!(out.warnings[0].code, WarningCode::MalformedJsonLd);
631    }
632
633    #[test]
634    fn empty_script_tag() {
635        let html = r#"<html><head><script type="application/ld+json"></script></head></html>"#;
636        let out = JsonLdExtractor.extract(html).expect("extraction failed");
637        assert!(out.nodes.is_empty());
638        assert_eq!(out.warnings[0].code, WarningCode::MalformedJsonLd);
639    }
640
641    #[test]
642    fn multiple_script_tags() {
643        let html = concat!(
644            r#"<html><head>"#,
645            "\n",
646            r#"<script type="application/ld+json">"#,
647            r#"{"@context":"https://schema.org","@type":"Product","name":"A"}"#,
648            r#"</script>"#,
649            "\n",
650            r#"<script type="application/ld+json">"#,
651            r#"{"@context":"https://schema.org","@type":"Article","name":"B"}"#,
652            r#"</script>"#,
653            "\n",
654            r#"</head></html>"#,
655        );
656
657        let out = JsonLdExtractor.extract(html).expect("extraction failed");
658        assert_eq!(out.nodes.len(), 2);
659        assert_eq!(out.nodes[0].types, vec!["Product"]);
660        assert_eq!(out.nodes[1].types, vec!["Article"]);
661    }
662
663    #[test]
664    fn top_level_array() {
665        let html = r#"<html><head><script type="application/ld+json">[
666  {"@context":"https://schema.org","@type":"Product","name":"A"},
667  {"@context":"https://schema.org","@type":"Article","name":"B"}
668]</script></head></html>"#;
669
670        let out = JsonLdExtractor.extract(html).expect("extraction failed");
671        assert_eq!(out.nodes.len(), 2);
672        assert_eq!(out.nodes[0].types, vec!["Product"]);
673        assert_eq!(out.nodes[1].types, vec!["Article"]);
674    }
675
676    #[test]
677    fn boolean_and_number_values() {
678        let html = r#"<html><head><script type="application/ld+json">{
679  "@context": "https://schema.org",
680  "@type": "Product",
681  "isFamilyFriendly": true,
682  "weight": 1.5
683}</script></head></html>"#;
684
685        let out = JsonLdExtractor.extract(html).expect("extraction failed");
686        assert_eq!(
687            out.nodes[0].properties["isFamilyFriendly"],
688            vec![SchemaValue::Boolean(true)]
689        );
690        assert_eq!(
691            out.nodes[0].properties["weight"],
692            vec![SchemaValue::Number(1.5)]
693        );
694    }
695
696    #[test]
697    fn unresolvable_reference_warns() {
698        let html = r##"<html><head><script type="application/ld+json">{
699  "@context": "https://schema.org",
700  "@type": "Product",
701  "offers": {"@id": "#nonexistent"}
702}</script></head></html>"##;
703
704        let out = JsonLdExtractor.extract(html).expect("extraction failed");
705        assert!(out
706            .warnings
707            .iter()
708            .any(|w| w.code == WarningCode::UnresolvableReference));
709    }
710
711    #[test]
712    fn no_context_with_full_uri_type() {
713        let html = r#"<html><head><script type="application/ld+json">{
714  "@type": "https://schema.org/Product",
715  "name": "Widget"
716}</script></head></html>"#;
717
718        let out = JsonLdExtractor.extract(html).expect("extraction failed");
719        assert_eq!(out.nodes.len(), 1);
720        assert_eq!(out.nodes[0].types, vec!["Product"]);
721    }
722
723    #[test]
724    fn array_context() {
725        let html = r#"<html><head><script type="application/ld+json">{
726  "@context": ["https://schema.org", {"custom": "https://example.com/"}],
727  "@type": "Product",
728  "name": "Widget"
729}</script></head></html>"#;
730
731        let out = JsonLdExtractor.extract(html).expect("extraction failed");
732        assert_eq!(out.nodes[0].types, vec!["Product"]);
733    }
734
735    #[test]
736    fn array_property_values() {
737        let html = r#"<html><head><script type="application/ld+json">{
738  "@context": "https://schema.org",
739  "@type": "Product",
740  "name": "Widget",
741  "image": [
742    "https://example.com/img1.jpg",
743    "https://example.com/img2.jpg"
744  ]
745}</script></head></html>"#;
746
747        let out = JsonLdExtractor.extract(html).expect("extraction failed");
748        assert_eq!(out.nodes[0].properties["image"].len(), 2);
749        assert_eq!(
750            out.nodes[0].properties["image"][0],
751            SchemaValue::Url("https://example.com/img1.jpg".into())
752        );
753    }
754
755    #[test]
756    fn null_values_are_skipped() {
757        let html = r#"<html><head><script type="application/ld+json">{
758  "@context": "https://schema.org",
759  "@type": "Product",
760  "name": "Widget",
761  "description": null
762}</script></head></html>"#;
763
764        let out = JsonLdExtractor.extract(html).expect("extraction failed");
765        assert_eq!(out.nodes.len(), 1);
766        // null values should be skipped entirely
767        assert!(!out.nodes[0].properties.contains_key("description"));
768    }
769
770    #[test]
771    fn integer_numbers() {
772        let html = r#"<html><head><script type="application/ld+json">{
773  "@context": "https://schema.org",
774  "@type": "Product",
775  "name": "Widget",
776  "ratingCount": 42
777}</script></head></html>"#;
778
779        let out = JsonLdExtractor.extract(html).expect("extraction failed");
780        assert_eq!(
781            out.nodes[0].properties["ratingCount"],
782            vec![SchemaValue::Number(42.0)]
783        );
784    }
785
786    #[test]
787    fn graph_context_inherited_by_children() {
788        let html = r#"<html><head><script type="application/ld+json">{
789  "@context": "https://schema.org",
790  "@graph": [
791    {"@type": "Product", "name": "A"},
792    {"@type": "https://schema.org/Article", "name": "B"}
793  ]
794}</script></head></html>"#;
795
796        let out = JsonLdExtractor.extract(html).expect("extraction failed");
797        assert_eq!(out.nodes.len(), 2);
798        assert_eq!(out.nodes[0].types, vec!["Product"]);
799        assert_eq!(out.nodes[1].types, vec!["Article"]);
800    }
801
802    #[test]
803    fn duplicate_id_warns() {
804        let html = r##"<html><head><script type="application/ld+json">{
805  "@context": "https://schema.org",
806  "@graph": [
807    {"@id": "#thing", "@type": "Product", "name": "First"},
808    {"@id": "#thing", "@type": "Article", "name": "Second"}
809  ]
810}</script></head></html>"##;
811
812        let out = JsonLdExtractor.extract(html).expect("extraction failed");
813        assert!(out
814            .warnings
815            .iter()
816            .any(|w| w.code == WarningCode::DuplicateId));
817    }
818
/// Walks `offers` -> `seller` -> `address` and checks that each level was
/// extracted as a nested node, ending at a `PostalAddress` whose
/// `addressCountry` is the text `"US"`.
#[test]
fn deeply_nested_objects() {
    let page = r#"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@type": "Product",
  "name": "Widget",
  "offers": {
    "@type": "Offer",
    "seller": {
      "@type": "Organization",
      "address": {
        "@type": "PostalAddress",
        "addressCountry": "US"
      }
    }
  }
}</script></head></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    assert_eq!(extracted.nodes.len(), 1);

    // Descend one level at a time; each step must yield a nested node.
    let product = &extracted.nodes[0];
    let SchemaValue::Node(offer) = &product.properties["offers"][0] else {
        panic!("Expected Offer node");
    };
    let SchemaValue::Node(org) = &offer.properties["seller"][0] else {
        panic!("Expected Organization node");
    };
    let SchemaValue::Node(addr) = &org.properties["address"][0] else {
        panic!("Expected PostalAddress node");
    };

    assert_eq!(addr.types, vec!["PostalAddress"]);
    assert_eq!(
        addr.properties["addressCountry"],
        vec![SchemaValue::Text("US".into())]
    );
}
860
/// A JSON-LD script containing nothing but whitespace yields no nodes and
/// exactly one `MalformedJsonLd` warning.
#[test]
fn whitespace_only_script() {
    // The script body is three spaces, a newline, two spaces, a newline and
    // two more spaces. It is spelled with escapes so the whitespace stays
    // visible and survives editors that trim trailing spaces.
    let page =
        "<html><head><script type=\"application/ld+json\">   \n  \n  </script></head></html>";

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");

    assert!(extracted.nodes.is_empty());
    assert_eq!(extracted.warnings.len(), 1);
    let only_warning = &extracted.warnings[0];
    assert_eq!(only_warning.code, WarningCode::MalformedJsonLd);
}
872
/// The extracted node carries a source location whose line number points
/// at the `<script>` tag.
#[test]
fn source_location_is_set() {
    // Line 1: <html><head>, line 2: the opening <script> tag.
    let page = r#"<html><head>
<script type="application/ld+json">
{"@type":"Product","name":"A"}
</script>
</head></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    assert_eq!(extracted.nodes.len(), 1);

    let node = &extracted.nodes[0];
    let location = node
        .source_location
        .as_ref()
        .expect("missing source location");
    // The <script> tag starts on line 2
    assert_eq!(location.line, 2);
}
892
/// `@type` values given as full `https://schema.org/...` or
/// `http://schema.org/...` URIs are reported as bare type names.
#[test]
fn multiple_types_with_uri_prefix() {
    let page = r#"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@type": ["https://schema.org/Product", "http://schema.org/IndividualProduct"],
  "name": "Widget"
}</script></head></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");

    let node = &extracted.nodes[0];
    assert_eq!(node.types, vec!["Product", "IndividualProduct"]);
}
904
/// `SchemaNode::id()` returns the `@id` string declared on the node.
#[test]
fn schema_node_id_accessor() {
    let page = r##"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@id": "#product1",
  "@type": "Product",
  "name": "Widget"
}</script></head></html>"##;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");

    let node = &extracted.nodes[0];
    assert_eq!(node.id(), Some("#product1"));
}
917
/// A page without any JSON-LD script produces neither nodes nor warnings.
#[test]
fn no_structured_data() {
    let page = r#"<html><head><title>No structured data</title></head>
<body><p>Hello world</p></body></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");

    assert_eq!(extracted.nodes.len(), 0);
    assert_eq!(extracted.warnings.len(), 0);
}
927
/// A trailing comma makes the payload invalid JSON: no nodes are produced
/// and the first warning is `MalformedJsonLd`.
#[test]
fn json_ld_with_trailing_comma() {
    // Trailing commas are common on real-world sites even though they are
    // not valid JSON.
    let page = r#"<html><head><script type="application/ld+json">{
  "@type": "Product",
  "name": "Widget",
}</script></head></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");

    assert!(extracted.nodes.is_empty());
    let first_warning = &extracted.warnings[0];
    assert_eq!(first_warning.code, WarningCode::MalformedJsonLd);
}
940
/// Mutually referencing nodes (`#a` -> `#b` -> `#a`) must not make
/// extraction hang; both top-level nodes are still returned.
#[test]
fn circular_id_references_do_not_loop() {
    let page = r##"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@graph": [
    {"@id": "#a", "@type": "Product", "name": "A", "isRelatedTo": {"@id": "#b"}},
    {"@id": "#b", "@type": "Article", "name": "B", "isRelatedTo": {"@id": "#a"}}
  ]
}</script></head></html>"##;

    let extracted = JsonLdExtractor.extract(page).expect("must not hang");

    let node_count = extracted.nodes.len();
    assert_eq!(node_count, 2);
}
955
/// A node whose property references its own `@id` must not make
/// extraction hang; the single top-level node is still returned.
#[test]
fn self_referencing_id_does_not_loop() {
    let page = r##"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@graph": [
    {"@id": "#self", "@type": "Product", "name": "Me", "isRelatedTo": {"@id": "#self"}}
  ]
}</script></head></html>"##;

    let extracted = JsonLdExtractor.extract(page).expect("must not hang");

    let node_count = extracted.nodes.len();
    assert_eq!(node_count, 1);
}
968
/// An empty-string `@id` is kept as-is: the node is extracted and `id()`
/// reports `Some("")`.
#[test]
fn empty_id_string() {
    let page = r##"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@id": "",
  "@type": "Product",
  "name": "Widget"
}</script></head></html>"##;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    assert_eq!(extracted.nodes.len(), 1);

    // The empty @id is stored verbatim rather than dropped.
    let node = &extracted.nodes[0];
    assert_eq!(node.id(), Some(""));
}
983
/// A document made of exactly `MAX_DEPTH` nested objects (root included)
/// is extracted without any warning whose message mentions "depth".
#[test]
fn nesting_at_exactly_max_depth_succeeds() {
    // Root object, left unclosed so the nested levels can be appended.
    let mut json =
        String::from(r#"{"@context":"https://schema.org","@type":"Thing","name":"L0""#);
    // Levels 1..MAX_DEPTH: each opens one more nested object.
    json.extend(
        (1..MAX_DEPTH).map(|level| format!(r#","p{level}":{{"@type":"Thing","name":"L{level}""#)),
    );
    // One closing brace per opened object, root included.
    json.push_str(&"}".repeat(MAX_DEPTH));

    let html = format!(
        r#"<html><head><script type="application/ld+json">{json}</script></head></html>"#
    );

    let extracted = JsonLdExtractor.extract(&html).expect("extraction failed");
    assert_eq!(extracted.nodes.len(), 1);

    // No depth warning should fire at exactly MAX_DEPTH
    let no_depth_warning = extracted
        .warnings
        .iter()
        .all(|warning| !warning.message.contains("depth"));
    assert!(no_depth_warning, "should not warn at MAX_DEPTH");
}
1009
/// A document made of `MAX_DEPTH + 2` nested objects (root included) must
/// produce a warning whose message mentions "depth".
#[test]
fn nesting_beyond_max_depth_warns() {
    let total_levels = MAX_DEPTH + 2;

    // Root object, left unclosed so the nested levels can be appended.
    let mut json =
        String::from(r#"{"@context":"https://schema.org","@type":"Thing","name":"L0""#);
    // Levels 1..total_levels: each opens one more nested object.
    json.extend(
        (1..total_levels)
            .map(|level| format!(r#","p{level}":{{"@type":"Thing","name":"L{level}""#)),
    );
    // One closing brace per opened object, root included.
    json.push_str(&"}".repeat(total_levels));

    let html = format!(
        r#"<html><head><script type="application/ld+json">{json}</script></head></html>"#
    );

    let extracted = JsonLdExtractor.extract(&html).expect("extraction failed");

    let depth_warning_seen = extracted
        .warnings
        .iter()
        .any(|warning| warning.message.contains("depth"));
    assert!(depth_warning_seen, "should warn when exceeding MAX_DEPTH");
}
1034
/// A numeric `@type` is ignored: the node is still extracted, its type
/// list is empty, and an `EmptyType` warning is reported.
#[test]
fn type_is_number_ignored() {
    let page = r#"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@type": 42,
  "name": "Widget"
}</script></head></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    assert_eq!(extracted.nodes.len(), 1);

    let node = &extracted.nodes[0];
    assert!(node.types.is_empty());

    let warned_empty_type = extracted
        .warnings
        .iter()
        .any(|warning| warning.code == WarningCode::EmptyType);
    assert!(warned_empty_type);
}
1051
/// An object-valued `@type` is ignored: the node is still extracted and
/// its type list is empty.
#[test]
fn type_is_object_ignored() {
    let page = r#"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@type": {"invalid": true},
  "name": "Widget"
}</script></head></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    assert_eq!(extracted.nodes.len(), 1);

    let node = &extracted.nodes[0];
    assert!(node.types.is_empty());
}
1064
/// An empty `@type` array leaves the node without types and reports an
/// `EmptyType` warning.
#[test]
fn type_empty_array() {
    let page = r#"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@type": [],
  "name": "Widget"
}</script></head></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    assert_eq!(extracted.nodes.len(), 1);

    let node = &extracted.nodes[0];
    assert!(node.types.is_empty());

    let warned_empty_type = extracted
        .warnings
        .iter()
        .any(|warning| warning.code == WarningCode::EmptyType);
    assert!(warned_empty_type);
}
1081
/// Non-string entries in a `@type` array are dropped; the string entries
/// are kept in their original order.
#[test]
fn type_array_with_mixed_values() {
    let page = r#"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@type": [42, "Product", null, "IndividualProduct"],
  "name": "Widget"
}</script></head></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");

    let node = &extracted.nodes[0];
    assert_eq!(node.types, vec!["Product", "IndividualProduct"]);
}
1094
/// A document whose `@context` is not schema.org is still extracted, with
/// its type and text property intact.
#[test]
fn non_schema_org_context_still_extracts() {
    let page = r#"<html><head><script type="application/ld+json">{
  "@context": "https://w3.org/ns/activitystreams",
  "@type": "Note",
  "content": "Hello"
}</script></head></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    assert_eq!(extracted.nodes.len(), 1);

    let note = &extracted.nodes[0];
    assert_eq!(note.types, vec!["Note"]);
    assert_eq!(
        note.properties["content"],
        vec![SchemaValue::Text("Hello".into())]
    );
}
1111
/// A JSON-LD payload containing the literal text `&amp;` must still be
/// extracted as a single node.
#[test]
fn html_entities_in_script_content() {
    // Per the HTML Standard, <script> content is tokenized as raw script
    // data: character references such as `&amp;` are NOT decoded there, so a
    // spec-compliant parser passes the text through verbatim.
    let html = r#"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@type": "Product",
  "name": "Widget &amp; Gadget"
}</script></head></html>"#;

    let out = JsonLdExtractor.extract(html).expect("extraction failed");
    // Either spelling ("&amp;" or "&") is valid inside a JSON string, so
    // this test deliberately does not pin the resulting `name` value; it
    // only requires that extraction succeeds and yields the node.
    assert_eq!(out.nodes.len(), 1);
}
1127
/// Three properties on one node all point at the same `@id`; each of them
/// must be resolved to the referenced `Offer` node.
#[test]
fn multiple_references_to_same_id() {
    let page = r##"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@graph": [
    {
      "@type": "Product", "name": "Widget",
      "offers": {"@id": "#offer"},
      "makesOffer": {"@id": "#offer"},
      "hasOfferCatalog": {"@id": "#offer"}
    },
    {"@id": "#offer", "@type": "Offer", "price": 9.99}
  ]
}</script></head></html>"##;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    assert_eq!(extracted.nodes.len(), 2);

    // All three references should be resolved
    let product = &extracted.nodes[0];
    for prop in ["offers", "makesOffer", "hasOfferCatalog"] {
        let SchemaValue::Node(resolved) = &product.properties[prop][0] else {
            panic!("Expected resolved Node for {prop}");
        };
        assert_eq!(resolved.types, vec!["Offer"]);
    }
}
1156
/// When an `@id` is defined twice, a `DuplicateId` warning is reported and
/// references resolve to the first definition.
#[test]
fn duplicate_id_first_definition_wins() {
    let page = r##"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@graph": [
    {"@type": "Product", "name": "P", "offers": {"@id": "#dup"}},
    {"@id": "#dup", "@type": "Offer", "price": 10.00, "priceCurrency": "USD"},
    {"@id": "#dup", "@type": "Offer", "price": 99.99, "priceCurrency": "EUR"}
  ]
}</script></head></html>"##;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");

    // The duplicate definition must be reported.
    let saw_duplicate = extracted
        .warnings
        .iter()
        .any(|warning| warning.code == WarningCode::DuplicateId);
    assert!(saw_duplicate);

    // The FIRST definition (price=10.00, USD) should win
    let SchemaValue::Node(offer) = &extracted.nodes[0].properties["offers"][0] else {
        panic!("Expected resolved Offer node");
    };
    assert_eq!(
        offer.properties["price"],
        vec![SchemaValue::Number(10.0)],
        "first @id definition should win"
    );
    assert_eq!(
        offer.properties["priceCurrency"],
        vec![SchemaValue::Text("USD".into())],
        "first @id definition should win"
    );
}
1192
/// A JSON-LD payload whose root is a bare string yields no nodes and a
/// `MalformedJsonLd` warning as the first warning.
#[test]
fn json_root_is_string_warns() {
    let page = r#"<html><head><script type="application/ld+json">"just a string"</script></head></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");

    assert!(extracted.nodes.is_empty());
    let first_warning = &extracted.warnings[0];
    assert_eq!(first_warning.code, WarningCode::MalformedJsonLd);
}
1205
/// A JSON-LD payload whose root is a bare number yields no nodes and a
/// `MalformedJsonLd` warning as the first warning.
#[test]
fn json_root_is_number_warns() {
    let page = r#"<html><head><script type="application/ld+json">42</script></head></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");

    assert!(extracted.nodes.is_empty());
    let first_warning = &extracted.warnings[0];
    assert_eq!(first_warning.code, WarningCode::MalformedJsonLd);
}
1213
/// A reference to an external `@id` URI must not be reported as an
/// `UnresolvableReference`.
#[test]
fn external_uri_id_no_warning() {
    let page = r##"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@type": "Product",
  "name": "Widget",
  "manufacturer": {"@id": "https://example.com/org/1"}
}</script></head></html>"##;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");

    let none_unresolvable = extracted
        .warnings
        .iter()
        .all(|warning| warning.code != WarningCode::UnresolvableReference);
    assert!(
        none_unresolvable,
        "external @id URIs should not trigger warnings"
    );
}
1232}