Skip to main content

schemaorg_rs/extraction/
jsonld.rs

1//! JSON-LD extractor: parses `<script type="application/ld+json">` tags.
2//!
3//! Implements a purpose-built Schema.org JSON-LD parser using `serde_json`
4//! instead of the full `json-ld` crate. This avoids 300+ transitive dependencies
5//! and async requirements while covering >99% of real-world Schema.org usage.
6//!
7//! ## Supported features
8//!
9//! - `@context: "https://schema.org"` (string or array)
10//! - `@type` as string or array
11//! - `@graph` arrays
12//! - `@id` cross-reference resolution (within-document)
13//! - Nested objects
14//!
15//! ## Not supported
16//!
17//! - Remote `@context` fetching
18//! - `@context` term definitions (e.g. `{"cat": "schema:category"}`)
19//! - JSON-LD framing, `@reverse`
20
21use std::collections::{HashMap, HashSet};
22use std::sync::OnceLock;
23
24use indexmap::IndexMap;
25use scraper::{Html, Selector};
26use serde_json::Value;
27
28use crate::error::{ExtractionError, ExtractionWarning, WarningCode};
29use crate::types::{SchemaNode, SchemaValue, SourceFormat, SourceLocation};
30
31use super::{classify_text_value, strip_schema_prefix, ExtractionOutput, Extractor};
32
/// Maximum nesting depth for JSON-LD objects.
///
/// Objects nested deeper than this are skipped and reported as a
/// malformed-JSON-LD warning instead of being converted.
const MAX_DEPTH: usize = 20;

/// Maximum depth for `@id` cross-reference resolution.
///
/// Kept lower than `MAX_DEPTH` to bound amplification when a single
/// `@id` is referenced from multiple locations in the tree.
/// Resolution simply stops descending once this depth is exceeded.
const MAX_REF_DEPTH: usize = 10;

/// Maximum number of `@id` reference resolutions per document.
///
/// Bounds total memory amplification when many references point
/// to the same large node. Each resolution clones the target node.
/// Once the cap is reached, remaining references are left unresolved
/// and no warning is emitted for them.
const MAX_REF_RESOLUTIONS: usize = 50;
47
/// Extracts Schema.org structured data from JSON-LD `<script>` tags.
///
/// This is a stateless unit struct: it carries no configuration, so it can
/// be used directly as a value (`JsonLdExtractor.extract(html)`).
///
/// # Examples
///
/// ```
/// use schemaorg_rs::extraction::{Extractor, JsonLdExtractor};
///
/// let html = r#"<html><head>
/// <script type="application/ld+json">{
/// "@context": "https://schema.org",
/// "@type": "Product",
/// "name": "Widget"
/// }</script>
/// </head></html>"#;
///
/// let output = JsonLdExtractor.extract(html).unwrap();
/// assert_eq!(output.nodes[0].types, vec!["Product"]);
/// ```
pub struct JsonLdExtractor;
67
68impl Extractor for JsonLdExtractor {
69    fn extract(&self, html: &str) -> Result<ExtractionOutput, ExtractionError> {
70        let document = Html::parse_document(html);
71        self.extract_from_document(&document, html)
72    }
73}
74
75impl JsonLdExtractor {
76    /// Extracts from an already-parsed document.
77    ///
78    /// The raw `html` string is needed for source-location computation
79    /// (finding byte offsets of `<script>` tags).
80    ///
81    /// # Errors
82    ///
83    /// Returns [`ExtractionError`] if a fatal error prevents extraction.
84    /// JSON parse failures are captured as warnings, not errors.
85    ///
86    /// # Panics
87    ///
88    /// Panics if the internal CSS selector constant fails to parse.
89    /// This is a compile-time-verified string and will never fail.
90    pub fn extract_from_document(
91        &self,
92        document: &Html,
93        html: &str,
94    ) -> Result<ExtractionOutput, ExtractionError> {
95        static SELECTOR: OnceLock<Selector> = OnceLock::new();
96        let selector = SELECTOR.get_or_init(|| {
97            Selector::parse("script[type=\"application/ld+json\"]")
98                .expect("static JSON-LD selector must parse")
99        });
100
101        let line_index = LineIndex::new(html);
102        let script_offsets = find_script_byte_offsets(html);
103
104        let mut all_nodes = Vec::new();
105        let mut warnings = Vec::new();
106
107        for (idx, element) in document.select(selector).enumerate() {
108            let json_text = element.inner_html();
109            let trimmed = json_text.trim();
110            let source_location = script_offsets
111                .get(idx)
112                .map(|&offset| line_index.location(offset));
113
114            if trimmed.is_empty() {
115                warnings.push(ExtractionWarning {
116                    message: "empty JSON-LD script tag".into(),
117                    source_location,
118                    code: WarningCode::MalformedJsonLd,
119                });
120                continue;
121            }
122
123            let value: Value = match serde_json::from_str(trimmed) {
124                Ok(v) => v,
125                Err(e) => {
126                    warnings.push(ExtractionWarning {
127                        message: format!("failed to parse JSON-LD: {e}"),
128                        source_location,
129                        code: WarningCode::MalformedJsonLd,
130                    });
131                    continue;
132                }
133            };
134
135            let items = extract_json_items(&value, source_location.as_ref(), &mut warnings);
136            all_nodes.extend(items);
137        }
138
139        // Build @id -> index map (lightweight, no node cloning).
140        // First definition wins: later duplicates emit a warning but
141        // do not overwrite the original entry.
142        let mut id_to_index: HashMap<String, usize> = HashMap::new();
143        for (i, node) in all_nodes.iter().enumerate() {
144            if let Some(id) = node.id() {
145                match id_to_index.entry(id.to_owned()) {
146                    std::collections::hash_map::Entry::Occupied(_) => {
147                        warnings.push(ExtractionWarning {
148                            message: format!("duplicate @id: {id}"),
149                            source_location: node.source_location.clone(),
150                            code: WarningCode::DuplicateId,
151                        });
152                    }
153                    std::collections::hash_map::Entry::Vacant(entry) => {
154                        entry.insert(i);
155                    }
156                }
157            }
158        }
159
160        // Clone only nodes that are actually referenced (lazy)
161        let referenced = collect_referenced_ids(&all_nodes);
162        let id_map: HashMap<String, SchemaNode> = referenced
163            .iter()
164            .filter_map(|id| {
165                let &idx = id_to_index.get(id.as_str())?;
166                Some((id.clone(), all_nodes[idx].clone()))
167            })
168            .collect();
169
170        // Resolve @id cross-references
171        resolve_references(&mut all_nodes, &id_map, &mut warnings);
172
173        Ok(ExtractionOutput {
174            nodes: all_nodes,
175            warnings,
176        })
177    }
178}
179
180// JSON -> SchemaNode conversion
181/// Extracts top-level Schema.org items from a parsed JSON value.
182fn extract_json_items(
183    value: &Value,
184    source_location: Option<&SourceLocation>,
185    warnings: &mut Vec<ExtractionWarning>,
186) -> Vec<SchemaNode> {
187    match value {
188        Value::Array(items) => items
189            .iter()
190            .filter_map(|item| json_to_node(item, None, source_location, warnings, 0))
191            .collect(),
192
193        Value::Object(map) => {
194            if let Some(Value::Array(graph_items)) = map.get("@graph") {
195                let context = map.get("@context");
196                graph_items
197                    .iter()
198                    .filter_map(|item| json_to_node(item, context, source_location, warnings, 0))
199                    .collect()
200            } else {
201                json_to_node(value, None, source_location, warnings, 0)
202                    .into_iter()
203                    .collect()
204            }
205        }
206
207        _ => {
208            warnings.push(ExtractionWarning {
209                message: "JSON-LD root must be an object or array".into(),
210                source_location: source_location.cloned(),
211                code: WarningCode::MalformedJsonLd,
212            });
213            Vec::new()
214        }
215    }
216}
217
218/// Converts a JSON object to a [`SchemaNode`].
219///
220/// `parent_context` is the `@context` inherited from a `@graph` wrapper.
221fn json_to_node(
222    value: &Value,
223    parent_context: Option<&Value>,
224    source_location: Option<&SourceLocation>,
225    warnings: &mut Vec<ExtractionWarning>,
226    depth: usize,
227) -> Option<SchemaNode> {
228    if depth > MAX_DEPTH {
229        warnings.push(ExtractionWarning {
230            message: format!("JSON-LD nesting depth exceeds {MAX_DEPTH}, skipping"),
231            source_location: source_location.cloned(),
232            code: WarningCode::MalformedJsonLd,
233        });
234        return None;
235    }
236    let obj = value.as_object()?;
237
238    // Resolve @context: local overrides parent
239    let context = obj.get("@context").or(parent_context);
240
241    // Extract @type
242    let types = extract_types(obj);
243
244    // Warn if no @type and this isn't a pure @id reference
245    if types.is_empty() {
246        let non_meta_keys = obj.keys().filter(|k| !k.starts_with('@')).count();
247        let is_reference = obj.contains_key("@id") && non_meta_keys == 0;
248        if !is_reference && !obj.is_empty() {
249            warnings.push(ExtractionWarning {
250                message: "JSON-LD object has no @type".into(),
251                source_location: source_location.cloned(),
252                code: WarningCode::EmptyType,
253            });
254        }
255    }
256
257    // Build properties
258    let mut properties: IndexMap<String, Vec<SchemaValue>> = IndexMap::new();
259
260    for (key, val) in obj {
261        if key == "@context" || key == "@type" {
262            continue;
263        }
264
265        if key == "@id" {
266            if let Value::String(id) = val {
267                properties
268                    .entry(key.clone())
269                    .or_default()
270                    .push(classify_text_value(id));
271            }
272            continue;
273        }
274
275        let values = json_to_schema_values(val, context, source_location, warnings, depth);
276        if !values.is_empty() {
277            properties.entry(key.clone()).or_default().extend(values);
278        }
279    }
280
281    Some(SchemaNode {
282        types,
283        properties,
284        source_format: SourceFormat::JsonLd,
285        source_location: source_location.cloned(),
286    })
287}
288
289/// Extract `@type` from a JSON-LD object, stripping Schema.org prefixes.
290fn extract_types(obj: &serde_json::Map<String, Value>) -> Vec<String> {
291    match obj.get("@type") {
292        Some(Value::String(t)) => vec![strip_schema_prefix(t).into_owned()],
293        Some(Value::Array(arr)) => arr
294            .iter()
295            .filter_map(|v| v.as_str())
296            .map(|s| strip_schema_prefix(s).into_owned())
297            .collect(),
298        _ => Vec::new(),
299    }
300}
301
302/// Converts a JSON value into [`SchemaValue`]s.
303fn json_to_schema_values(
304    value: &Value,
305    context: Option<&Value>,
306    source_location: Option<&SourceLocation>,
307    warnings: &mut Vec<ExtractionWarning>,
308    depth: usize,
309) -> Vec<SchemaValue> {
310    match value {
311        Value::Null => Vec::new(),
312        Value::Bool(b) => vec![SchemaValue::Boolean(*b)],
313        Value::Number(n) => n
314            .as_f64()
315            .map(|f| vec![SchemaValue::Number(f)])
316            .unwrap_or_default(),
317        Value::String(s) => vec![classify_text_value(s)],
318        Value::Array(arr) => arr
319            .iter()
320            .flat_map(|v| json_to_schema_values(v, context, source_location, warnings, depth))
321            .collect(),
322        Value::Object(_) => json_to_node(value, context, source_location, warnings, depth + 1)
323            .map(|node| vec![SchemaValue::Node(Box::new(node))])
324            .unwrap_or_default(),
325    }
326}
327
328// @id cross-reference resolution
329/// Resolves `{"@id": "..."}` references throughout the node tree.
330///
331/// Total resolutions are capped at [`MAX_REF_RESOLUTIONS`] to prevent
332/// memory amplification from many references to the same large node.
333fn resolve_references(
334    nodes: &mut [SchemaNode],
335    id_map: &HashMap<String, SchemaNode>,
336    warnings: &mut Vec<ExtractionWarning>,
337) {
338    let mut resolution_count: usize = 0;
339    for node in nodes.iter_mut() {
340        resolve_node_refs(node, id_map, warnings, 0, &mut resolution_count);
341    }
342}
343
344/// Recursively resolves references within a single node.
345///
346/// Depth is limited to [`MAX_REF_DEPTH`] and total resolutions to
347/// [`MAX_REF_RESOLUTIONS`] to prevent unbounded amplification.
348fn resolve_node_refs(
349    node: &mut SchemaNode,
350    id_map: &HashMap<String, SchemaNode>,
351    warnings: &mut Vec<ExtractionWarning>,
352    depth: usize,
353    resolution_count: &mut usize,
354) {
355    if depth > MAX_REF_DEPTH {
356        return;
357    }
358
359    for values in node.properties.values_mut() {
360        for value in values.iter_mut() {
361            if let SchemaValue::Node(inner) = value {
362                // Is this a pure @id reference? (no types, only @-prefixed keys)
363                if inner.types.is_empty() {
364                    if let Some(id_values) = inner.properties.get("@id") {
365                        if let Some(SchemaValue::Text(id)) = id_values.first() {
366                            if *resolution_count >= MAX_REF_RESOLUTIONS {
367                                continue;
368                            }
369                            if let Some(resolved) = id_map.get(id.as_str()) {
370                                let has_content =
371                                    !resolved.types.is_empty() || resolved.properties.len() > 1;
372                                if has_content {
373                                    *resolution_count += 1;
374                                    *value = SchemaValue::Node(Box::new(resolved.clone()));
375                                    if let SchemaValue::Node(ref mut n) = value {
376                                        resolve_node_refs(
377                                            n,
378                                            id_map,
379                                            warnings,
380                                            depth + 1,
381                                            resolution_count,
382                                        );
383                                    }
384                                    continue;
385                                }
386                            }
387                            // Only warn for fragment references (e.g. "#foo").
388                            // External @id URIs (e.g. "https://example.com/org/1")
389                            // are valid and should not trigger warnings.
390                            if id.starts_with('#') {
391                                warnings.push(ExtractionWarning {
392                                    message: format!("unresolvable @id reference: {id}"),
393                                    source_location: inner.source_location.clone(),
394                                    code: WarningCode::UnresolvableReference,
395                                });
396                            }
397                            continue;
398                        }
399                    }
400                }
401                // Recurse into non-reference nested nodes
402                resolve_node_refs(inner, id_map, warnings, depth + 1, resolution_count);
403            }
404        }
405    }
406}
407
408// Lazy @id reference collection
409/// Collects all `@id` values that appear as references (not definitions) in the node tree.
410///
411/// A reference is a `SchemaValue::Node` with no types and only an `@id` property.
412/// This is used to determine which nodes need to be cloned for resolution.
413fn collect_referenced_ids(nodes: &[SchemaNode]) -> HashSet<String> {
414    let mut refs = HashSet::new();
415    for node in nodes {
416        collect_refs_in_node(node, &mut refs, 0);
417    }
418    refs
419}
420
421/// Recursively collects `@id` reference strings from a node's properties.
422///
423/// Depth is limited to [`MAX_DEPTH`] to prevent unbounded recursion
424/// on pathological input.
425fn collect_refs_in_node(node: &SchemaNode, refs: &mut HashSet<String>, depth: usize) {
426    if depth > MAX_DEPTH {
427        return;
428    }
429    for values in node.properties.values() {
430        for value in values {
431            if let SchemaValue::Node(inner) = value {
432                if inner.types.is_empty() {
433                    if let Some(id_values) = inner.properties.get("@id") {
434                        if let Some(SchemaValue::Text(id)) = id_values.first() {
435                            refs.insert(id.clone());
436                            continue;
437                        }
438                    }
439                }
440                collect_refs_in_node(inner, refs, depth + 1);
441            }
442        }
443    }
444}
445
446// Source-location utilities
447/// Maps byte offsets to line/column positions.
448struct LineIndex {
449    line_starts: Vec<usize>,
450}
451
452impl LineIndex {
453    fn new(text: &str) -> Self {
454        let mut line_starts = vec![0];
455        for (i, byte) in text.bytes().enumerate() {
456            if byte == b'\n' {
457                line_starts.push(i + 1);
458            }
459        }
460        Self { line_starts }
461    }
462
463    fn location(&self, byte_offset: usize) -> SourceLocation {
464        let line = self
465            .line_starts
466            .partition_point(|&start| start <= byte_offset)
467            .saturating_sub(1);
468        let column = byte_offset.saturating_sub(self.line_starts[line]);
469        SourceLocation {
470            line: line + 1,
471            column: column + 1,
472            byte_offset,
473        }
474    }
475}
476
/// Finds byte offsets of `<script type="application/ld+json">` tags.
///
/// Returns the offset of the opening `<` of each matching tag, in document
/// order, with at most one entry per tag. Callers pair these offsets with
/// parsed `<script>` elements by position, so spurious or duplicate entries
/// would shift every later location. An occurrence of the MIME type is
/// therefore only counted when it lies inside the opening tag of a `script`
/// element (tag name compared ASCII case-insensitively); mentions inside
/// script bodies, other elements, or attribute values of other tags are
/// ignored.
///
/// This is a lightweight textual scan, not an HTML parser: constructs such as
/// HTML comments or a literal `<` inside an attribute value are not understood.
fn find_script_byte_offsets(html: &str) -> Vec<usize> {
    const PATTERN: &str = "application/ld+json";

    let bytes = html.as_bytes();
    let mut offsets = Vec::new();
    let mut search_from = 0;

    while let Some(pos) = html[search_from..].find(PATTERN) {
        let abs_pos = search_from + pos;
        search_from = abs_pos + PATTERN.len();

        let Some(tag_start) = html[..abs_pos].rfind('<') else {
            continue;
        };

        // Everything between the `<` and the MIME type must belong to one
        // still-open `<script ...` tag.
        let after_lt = &bytes[tag_start + 1..abs_pos];
        // The same tag can mention the MIME type more than once.
        if is_open_script_tag(after_lt) && offsets.last() != Some(&tag_start) {
            offsets.push(tag_start);
        }
    }

    offsets
}

/// Checks that `after_lt` (the bytes following a `<`) starts a `script` tag
/// that has not been closed yet.
///
/// The tag name must be followed by whitespace or `/`, and no `>` may appear
/// outside of a quoted attribute value.
fn is_open_script_tag(after_lt: &[u8]) -> bool {
    const NAME: &[u8] = b"script";

    if after_lt.len() <= NAME.len() || !after_lt[..NAME.len()].eq_ignore_ascii_case(NAME) {
        return false;
    }
    let rest = &after_lt[NAME.len()..];
    if !(rest[0].is_ascii_whitespace() || rest[0] == b'/') {
        return false;
    }

    let mut open_quote: Option<u8> = None;
    for &byte in rest {
        match open_quote {
            Some(quote) if byte == quote => open_quote = None,
            Some(_) => {}
            None => match byte {
                b'"' | b'\'' => open_quote = Some(byte),
                // An unquoted `>` ends the tag before the MIME type was reached.
                b'>' => return false,
                _ => {}
            },
        }
    }
    true
}
495
496#[cfg(test)]
497mod tests {
498    use pretty_assertions::assert_eq;
499
500    use super::*;
501
502    #[test]
503    fn line_index_positions() {
504        let idx = LineIndex::new("line1\nline2\nline3");
505        let loc = idx.location(0);
506        assert_eq!((loc.line, loc.column), (1, 1));
507        let loc = idx.location(6);
508        assert_eq!((loc.line, loc.column), (2, 1));
509        let loc = idx.location(8);
510        assert_eq!((loc.line, loc.column), (2, 3));
511    }
512
513    #[test]
514    fn find_script_offsets() {
515        let html =
516            r#"<html><script type="application/ld+json">{"@type":"Product"}</script></html>"#;
517        let offsets = find_script_byte_offsets(html);
518        assert_eq!(offsets.len(), 1);
519        assert!(html[offsets[0]..].starts_with("<script"));
520    }
521
522    #[test]
523    fn basic_product() {
524        let html = r#"<html><head><script type="application/ld+json">{
525  "@context": "https://schema.org",
526  "@type": "Product",
527  "name": "Example Product",
528  "url": "https://example.com/product"
529}</script></head></html>"#;
530
531        let out = JsonLdExtractor.extract(html).expect("extraction failed");
532        assert_eq!(out.nodes.len(), 1);
533        assert_eq!(out.nodes[0].types, vec!["Product"]);
534        assert_eq!(out.nodes[0].source_format, SourceFormat::JsonLd);
535        assert_eq!(
536            out.nodes[0].properties["name"],
537            vec![SchemaValue::Text("Example Product".into())]
538        );
539        assert_eq!(
540            out.nodes[0].properties["url"],
541            vec![SchemaValue::Url("https://example.com/product".into())]
542        );
543    }
544
545    #[test]
546    fn graph_extraction() {
547        let html = r#"<html><head><script type="application/ld+json">{
548  "@context": "https://schema.org",
549  "@graph": [
550    {"@type": "Organization", "name": "Acme"},
551    {"@type": "WebSite", "name": "Acme Site"}
552  ]
553}</script></head></html>"#;
554
555        let out = JsonLdExtractor.extract(html).expect("extraction failed");
556        assert_eq!(out.nodes.len(), 2);
557        assert_eq!(out.nodes[0].types, vec!["Organization"]);
558        assert_eq!(out.nodes[1].types, vec!["WebSite"]);
559    }
560
561    #[test]
562    fn array_type() {
563        let html = r#"<html><head><script type="application/ld+json">{
564  "@context": "https://schema.org",
565  "@type": ["Product", "IndividualProduct"],
566  "name": "Widget"
567}</script></head></html>"#;
568
569        let out = JsonLdExtractor.extract(html).expect("extraction failed");
570        assert_eq!(out.nodes[0].types, vec!["Product", "IndividualProduct"]);
571    }
572
573    #[test]
574    fn nested_object() {
575        let html = r#"<html><head><script type="application/ld+json">{
576  "@context": "https://schema.org",
577  "@type": "Product",
578  "name": "Widget",
579  "offers": {
580    "@type": "Offer",
581    "price": 19.99,
582    "priceCurrency": "USD"
583  }
584}</script></head></html>"#;
585
586        let out = JsonLdExtractor.extract(html).expect("extraction failed");
587        assert_eq!(out.nodes.len(), 1);
588        let offers = &out.nodes[0].properties["offers"];
589        assert_eq!(offers.len(), 1);
590        if let SchemaValue::Node(offer) = &offers[0] {
591            assert_eq!(offer.types, vec!["Offer"]);
592            assert_eq!(offer.properties["price"], vec![SchemaValue::Number(19.99)]);
593            assert_eq!(
594                offer.properties["priceCurrency"],
595                vec![SchemaValue::Text("USD".into())]
596            );
597        } else {
598            panic!("Expected nested Node");
599        }
600    }
601
602    #[test]
603    fn id_cross_reference() {
604        let html = r##"<html><head><script type="application/ld+json">{
605  "@context": "https://schema.org",
606  "@graph": [
607    {"@type": "Product", "name": "Widget", "offers": {"@id": "#offer1"}},
608    {"@id": "#offer1", "@type": "Offer", "price": 29.99}
609  ]
610}</script></head></html>"##;
611
612        let out = JsonLdExtractor.extract(html).expect("extraction failed");
613        assert_eq!(out.nodes.len(), 2);
614        let offers = &out.nodes[0].properties["offers"];
615        if let SchemaValue::Node(offer) = &offers[0] {
616            assert_eq!(offer.types, vec!["Offer"]);
617            assert_eq!(offer.properties["price"], vec![SchemaValue::Number(29.99)]);
618        } else {
619            panic!("Expected resolved Node, got {:?}", offers[0]);
620        }
621    }
622
623    #[test]
624    fn malformed_json_is_warning() {
625        let html =
626            r#"<html><head><script type="application/ld+json">{ invalid }</script></head></html>"#;
627        let out = JsonLdExtractor.extract(html).expect("extraction failed");
628        assert!(out.nodes.is_empty());
629        assert_eq!(out.warnings.len(), 1);
630        assert_eq!(out.warnings[0].code, WarningCode::MalformedJsonLd);
631    }
632
633    #[test]
634    fn empty_script_tag() {
635        let html = r#"<html><head><script type="application/ld+json"></script></head></html>"#;
636        let out = JsonLdExtractor.extract(html).expect("extraction failed");
637        assert!(out.nodes.is_empty());
638        assert_eq!(out.warnings[0].code, WarningCode::MalformedJsonLd);
639    }
640
641    #[test]
642    fn multiple_script_tags() {
643        let html = concat!(
644            r#"<html><head>"#, "\n",
645            r#"<script type="application/ld+json">"#,
646            r#"{"@context":"https://schema.org","@type":"Product","name":"A"}"#,
647            r#"</script>"#, "\n",
648            r#"<script type="application/ld+json">"#,
649            r#"{"@context":"https://schema.org","@type":"Article","name":"B"}"#,
650            r#"</script>"#, "\n",
651            r#"</head></html>"#,
652        );
653
654        let out = JsonLdExtractor.extract(html).expect("extraction failed");
655        assert_eq!(out.nodes.len(), 2);
656        assert_eq!(out.nodes[0].types, vec!["Product"]);
657        assert_eq!(out.nodes[1].types, vec!["Article"]);
658    }
659
660    #[test]
661    fn top_level_array() {
662        let html = r#"<html><head><script type="application/ld+json">[
663  {"@context":"https://schema.org","@type":"Product","name":"A"},
664  {"@context":"https://schema.org","@type":"Article","name":"B"}
665]</script></head></html>"#;
666
667        let out = JsonLdExtractor.extract(html).expect("extraction failed");
668        assert_eq!(out.nodes.len(), 2);
669        assert_eq!(out.nodes[0].types, vec!["Product"]);
670        assert_eq!(out.nodes[1].types, vec!["Article"]);
671    }
672
673    #[test]
674    fn boolean_and_number_values() {
675        let html = r#"<html><head><script type="application/ld+json">{
676  "@context": "https://schema.org",
677  "@type": "Product",
678  "isFamilyFriendly": true,
679  "weight": 1.5
680}</script></head></html>"#;
681
682        let out = JsonLdExtractor.extract(html).expect("extraction failed");
683        assert_eq!(
684            out.nodes[0].properties["isFamilyFriendly"],
685            vec![SchemaValue::Boolean(true)]
686        );
687        assert_eq!(
688            out.nodes[0].properties["weight"],
689            vec![SchemaValue::Number(1.5)]
690        );
691    }
692
693    #[test]
694    fn unresolvable_reference_warns() {
695        let html = r##"<html><head><script type="application/ld+json">{
696  "@context": "https://schema.org",
697  "@type": "Product",
698  "offers": {"@id": "#nonexistent"}
699}</script></head></html>"##;
700
701        let out = JsonLdExtractor.extract(html).expect("extraction failed");
702        assert!(out
703            .warnings
704            .iter()
705            .any(|w| w.code == WarningCode::UnresolvableReference));
706    }
707
708    #[test]
709    fn no_context_with_full_uri_type() {
710        let html = r#"<html><head><script type="application/ld+json">{
711  "@type": "https://schema.org/Product",
712  "name": "Widget"
713}</script></head></html>"#;
714
715        let out = JsonLdExtractor.extract(html).expect("extraction failed");
716        assert_eq!(out.nodes.len(), 1);
717        assert_eq!(out.nodes[0].types, vec!["Product"]);
718    }
719
720    #[test]
721    fn array_context() {
722        let html = r#"<html><head><script type="application/ld+json">{
723  "@context": ["https://schema.org", {"custom": "https://example.com/"}],
724  "@type": "Product",
725  "name": "Widget"
726}</script></head></html>"#;
727
728        let out = JsonLdExtractor.extract(html).expect("extraction failed");
729        assert_eq!(out.nodes[0].types, vec!["Product"]);
730    }
731
732    #[test]
733    fn array_property_values() {
734        let html = r#"<html><head><script type="application/ld+json">{
735  "@context": "https://schema.org",
736  "@type": "Product",
737  "name": "Widget",
738  "image": [
739    "https://example.com/img1.jpg",
740    "https://example.com/img2.jpg"
741  ]
742}</script></head></html>"#;
743
744        let out = JsonLdExtractor.extract(html).expect("extraction failed");
745        assert_eq!(out.nodes[0].properties["image"].len(), 2);
746        assert_eq!(
747            out.nodes[0].properties["image"][0],
748            SchemaValue::Url("https://example.com/img1.jpg".into())
749        );
750    }
751
752    #[test]
753    fn null_values_are_skipped() {
754        let html = r#"<html><head><script type="application/ld+json">{
755  "@context": "https://schema.org",
756  "@type": "Product",
757  "name": "Widget",
758  "description": null
759}</script></head></html>"#;
760
761        let out = JsonLdExtractor.extract(html).expect("extraction failed");
762        assert_eq!(out.nodes.len(), 1);
763        // null values should be skipped entirely
764        assert!(!out.nodes[0].properties.contains_key("description"));
765    }
766
767    #[test]
768    fn integer_numbers() {
769        let html = r#"<html><head><script type="application/ld+json">{
770  "@context": "https://schema.org",
771  "@type": "Product",
772  "name": "Widget",
773  "ratingCount": 42
774}</script></head></html>"#;
775
776        let out = JsonLdExtractor.extract(html).expect("extraction failed");
777        assert_eq!(
778            out.nodes[0].properties["ratingCount"],
779            vec![SchemaValue::Number(42.0)]
780        );
781    }
782
783    #[test]
784    fn graph_context_inherited_by_children() {
785        let html = r#"<html><head><script type="application/ld+json">{
786  "@context": "https://schema.org",
787  "@graph": [
788    {"@type": "Product", "name": "A"},
789    {"@type": "https://schema.org/Article", "name": "B"}
790  ]
791}</script></head></html>"#;
792
793        let out = JsonLdExtractor.extract(html).expect("extraction failed");
794        assert_eq!(out.nodes.len(), 2);
795        assert_eq!(out.nodes[0].types, vec!["Product"]);
796        assert_eq!(out.nodes[1].types, vec!["Article"]);
797    }
798
799    #[test]
800    fn duplicate_id_warns() {
801        let html = r##"<html><head><script type="application/ld+json">{
802  "@context": "https://schema.org",
803  "@graph": [
804    {"@id": "#thing", "@type": "Product", "name": "First"},
805    {"@id": "#thing", "@type": "Article", "name": "Second"}
806  ]
807}</script></head></html>"##;
808
809        let out = JsonLdExtractor.extract(html).expect("extraction failed");
810        assert!(out
811            .warnings
812            .iter()
813            .any(|w| w.code == WarningCode::DuplicateId));
814    }
815
#[test]
fn deeply_nested_objects() {
    // Product -> Offer -> Organization -> PostalAddress, all declared inline.
    // Each level is expected to come back as a `SchemaValue::Node`.
    let page = r#"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@type": "Product",
  "name": "Widget",
  "offers": {
    "@type": "Offer",
    "seller": {
      "@type": "Organization",
      "address": {
        "@type": "PostalAddress",
        "addressCountry": "US"
      }
    }
  }
}</script></head></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    assert_eq!(extracted.nodes.len(), 1);

    // Walk down one level at a time, bailing out with the name of the
    // level that was not a nested node.
    let product = &extracted.nodes[0];
    let SchemaValue::Node(offer) = &product.properties["offers"][0] else {
        panic!("Expected Offer node");
    };
    let SchemaValue::Node(org) = &offer.properties["seller"][0] else {
        panic!("Expected Organization node");
    };
    let SchemaValue::Node(addr) = &org.properties["address"][0] else {
        panic!("Expected PostalAddress node");
    };

    assert_eq!(addr.types, vec!["PostalAddress"]);
    assert_eq!(
        addr.properties["addressCountry"],
        vec![SchemaValue::Text("US".into())]
    );
}
857
#[test]
fn whitespace_only_script() {
    // The script body holds only spaces and newlines. The fixture is spelled
    // with explicit escapes so that the whitespace is visible in the source.
    let page = concat!(
        "<html><head><script type=\"application/ld+json\">   \n",
        "  \n",
        "  </script></head></html>",
    );

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    assert!(extracted.nodes.is_empty());

    // Exactly one warning, and it reports malformed JSON-LD.
    assert_eq!(extracted.warnings.len(), 1);
    let only_warning = &extracted.warnings[0];
    assert_eq!(only_warning.code, WarningCode::MalformedJsonLd);
}
869
#[test]
fn source_location_is_set() {
    // The fixture is laid out so that the <script> tag opens on line 2.
    let page = r#"<html><head>
<script type="application/ld+json">
{"@type":"Product","name":"A"}
</script>
</head></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    assert_eq!(extracted.nodes.len(), 1);

    let node = &extracted.nodes[0];
    let location = node
        .source_location
        .as_ref()
        .expect("missing source location");
    assert_eq!(location.line, 2);
}
889
#[test]
fn multiple_types_with_uri_prefix() {
    // Both the https:// and http:// schema.org prefixes appear in `@type`;
    // the extracted type names are expected without either prefix.
    let page = r#"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@type": ["https://schema.org/Product", "http://schema.org/IndividualProduct"],
  "name": "Widget"
}</script></head></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    let first = &extracted.nodes[0];
    assert_eq!(first.types, vec!["Product", "IndividualProduct"]);
}
901
#[test]
fn schema_node_id_accessor() {
    // `id()` on the extracted node should hand back the `@id` as written.
    let page = r##"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@id": "#product1",
  "@type": "Product",
  "name": "Widget"
}</script></head></html>"##;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    let node_id = extracted.nodes[0].id();
    assert_eq!(node_id, Some("#product1"));
}
914
#[test]
fn no_structured_data() {
    // A page with no JSON-LD script at all: nothing extracted, nothing warned.
    let page = r#"<html><head><title>No structured data</title></head>
<body><p>Hello world</p></body></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    assert!(extracted.nodes.is_empty(), "expected no nodes");
    assert!(extracted.warnings.is_empty(), "expected no warnings");
}
924
#[test]
fn json_ld_with_trailing_comma() {
    // Trailing commas are invalid JSON but common on real-world sites.
    // The document is expected to be rejected with a warning, not an error.
    let page = r#"<html><head><script type="application/ld+json">{
  "@type": "Product",
  "name": "Widget",
}</script></head></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    assert!(extracted.nodes.is_empty());

    let first_code = extracted.warnings.first().map(|w| &w.code);
    assert_eq!(first_code, Some(&WarningCode::MalformedJsonLd));
}
937
#[test]
fn circular_id_references_do_not_loop() {
    // #a points at #b and #b points back at #a. Extraction has to finish
    // and still report both top-level graph entries.
    let page = r##"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@graph": [
    {"@id": "#a", "@type": "Product", "name": "A", "isRelatedTo": {"@id": "#b"}},
    {"@id": "#b", "@type": "Article", "name": "B", "isRelatedTo": {"@id": "#a"}}
  ]
}</script></head></html>"##;

    let extracted = JsonLdExtractor.extract(page).expect("must not hang");
    let node_count = extracted.nodes.len();
    assert_eq!(node_count, 2);
}
952
#[test]
fn self_referencing_id_does_not_loop() {
    // The single graph entry references its own `@id`; extraction must
    // still terminate and yield that one node.
    let page = r##"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@graph": [
    {"@id": "#self", "@type": "Product", "name": "Me", "isRelatedTo": {"@id": "#self"}}
  ]
}</script></head></html>"##;

    let extracted = JsonLdExtractor.extract(page).expect("must not hang");
    let node_count = extracted.nodes.len();
    assert_eq!(node_count, 1);
}
965
#[test]
fn empty_id_string() {
    // An `@id` of "" is kept as-is and must not disturb extraction.
    let page = r##"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@id": "",
  "@type": "Product",
  "name": "Widget"
}</script></head></html>"##;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    assert_eq!(extracted.nodes.len(), 1);

    let node_id = extracted.nodes[0].id();
    assert_eq!(node_id, Some(""));
}
980
#[test]
fn nesting_at_exactly_max_depth_succeeds() {
    // The document is a root object plus MAX_DEPTH - 1 objects nested one
    // inside the other, i.e. MAX_DEPTH objects in total.
    let root_open = r#"{"@context":"https://schema.org","@type":"Thing","name":"L0""#;
    let nested_open: String = (1..MAX_DEPTH)
        .map(|i| format!(r#","p{i}":{{"@type":"Thing","name":"L{i}""#))
        .collect();
    // One closing brace per nested object, plus one for the root.
    let closers = "}".repeat(MAX_DEPTH);
    let json = format!("{root_open}{nested_open}{closers}");

    let page = format!(
        r#"<html><head><script type="application/ld+json">{json}</script></head></html>"#
    );

    let extracted = JsonLdExtractor.extract(&page).expect("extraction failed");
    assert_eq!(extracted.nodes.len(), 1);

    // Sitting exactly on the limit must not produce a depth warning.
    let no_depth_warning = extracted
        .warnings
        .iter()
        .all(|w| !w.message.contains("depth"));
    assert!(no_depth_warning, "should not warn at MAX_DEPTH");
}
1006
#[test]
fn nesting_beyond_max_depth_warns() {
    // Same shape as the at-the-limit fixture, but two objects deeper:
    // a root plus MAX_DEPTH + 1 nested objects.
    let target = MAX_DEPTH + 2;
    let root_open = r#"{"@context":"https://schema.org","@type":"Thing","name":"L0""#;
    let nested_open: String = (1..target)
        .map(|i| format!(r#","p{i}":{{"@type":"Thing","name":"L{i}""#))
        .collect();
    // One closing brace per nested object, plus one for the root.
    let closers = "}".repeat(target);
    let json = format!("{root_open}{nested_open}{closers}");

    let page = format!(
        r#"<html><head><script type="application/ld+json">{json}</script></head></html>"#
    );

    let extracted = JsonLdExtractor.extract(&page).expect("extraction failed");
    let depth_warned = extracted
        .warnings
        .iter()
        .any(|w| w.message.contains("depth"));
    assert!(depth_warned, "should warn when exceeding MAX_DEPTH");
}
1031
#[test]
fn type_is_number_ignored() {
    // A numeric `@type` yields a node with no types plus an EmptyType warning.
    let page = r#"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@type": 42,
  "name": "Widget"
}</script></head></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    assert_eq!(extracted.nodes.len(), 1);
    assert!(extracted.nodes[0].types.is_empty());

    let warned_empty_type = extracted
        .warnings
        .iter()
        .any(|w| w.code == WarningCode::EmptyType);
    assert!(warned_empty_type);
}
1048
#[test]
fn type_is_object_ignored() {
    // An object-valued `@type` still yields the node, just without types.
    let page = r#"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@type": {"invalid": true},
  "name": "Widget"
}</script></head></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    assert_eq!(extracted.nodes.len(), 1);

    let node = &extracted.nodes[0];
    assert!(node.types.is_empty());
}
1061
#[test]
fn type_empty_array() {
    // `@type: []` yields a node with no types plus an EmptyType warning.
    let page = r#"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@type": [],
  "name": "Widget"
}</script></head></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    assert_eq!(extracted.nodes.len(), 1);
    assert!(extracted.nodes[0].types.is_empty());

    let warned_empty_type = extracted
        .warnings
        .iter()
        .any(|w| w.code == WarningCode::EmptyType);
    assert!(warned_empty_type);
}
1078
#[test]
fn type_array_with_mixed_values() {
    // Only the string entries of a mixed `@type` array are expected to
    // survive, in their original order.
    let page = r#"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@type": [42, "Product", null, "IndividualProduct"],
  "name": "Widget"
}</script></head></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    let first = &extracted.nodes[0];
    assert_eq!(first.types, vec!["Product", "IndividualProduct"]);
}
1091
#[test]
fn non_schema_org_context_still_extracts() {
    // The `@context` here is not schema.org; the node, its type and its
    // property are expected to be extracted all the same.
    let page = r#"<html><head><script type="application/ld+json">{
  "@context": "https://w3.org/ns/activitystreams",
  "@type": "Note",
  "content": "Hello"
}</script></head></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    assert_eq!(extracted.nodes.len(), 1);

    let note = &extracted.nodes[0];
    assert_eq!(note.types, vec!["Note"]);
    assert_eq!(
        note.properties["content"],
        vec![SchemaValue::Text("Hello".into())]
    );
}
1108
#[test]
fn html_entities_in_script_content() {
    // Under the HTML Standard's tokenization rules, `<script>` content is
    // script data: character references such as `&amp;` are not decoded
    // there, so a spec-compliant parser passes the literal text through.
    // NOTE(review): the exact `name` text is deliberately not pinned here;
    // confirm the extractor hands `&amp;` through unchanged before
    // asserting on it.
    let html = r#"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@type": "Product",
  "name": "Widget &amp; Gadget"
}</script></head></html>"#;

    let out = JsonLdExtractor.extract(html).expect("extraction failed");

    // The ampersand sequence sits inside a JSON string, so the document is
    // valid JSON either way and must produce exactly one Product node.
    assert_eq!(out.nodes.len(), 1);
    assert_eq!(out.nodes[0].types, vec!["Product"]);
    assert!(
        !out.nodes[0].properties["name"].is_empty(),
        "name property should survive extraction"
    );
}
1124
#[test]
fn multiple_references_to_same_id() {
    // One Offer node, referenced by `@id` from three different properties
    // of the same Product.
    let page = r##"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@graph": [
    {
      "@type": "Product", "name": "Widget",
      "offers": {"@id": "#offer"},
      "makesOffer": {"@id": "#offer"},
      "hasOfferCatalog": {"@id": "#offer"}
    },
    {"@id": "#offer", "@type": "Offer", "price": 9.99}
  ]
}</script></head></html>"##;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    assert_eq!(extracted.nodes.len(), 2);

    // Every one of the three references has to resolve to the Offer.
    let product = &extracted.nodes[0];
    for prop in ["offers", "makesOffer", "hasOfferCatalog"] {
        let SchemaValue::Node(resolved) = &product.properties[prop][0] else {
            panic!("Expected resolved Node for {prop}");
        };
        assert_eq!(resolved.types, vec!["Offer"]);
    }
}
1153
#[test]
fn duplicate_id_first_definition_wins() {
    // Two graph entries share the `@id` "#dup". The Product's reference is
    // expected to resolve to the first of them (10.00 USD), and the clash
    // itself is expected to be reported.
    let page = r##"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@graph": [
    {"@type": "Product", "name": "P", "offers": {"@id": "#dup"}},
    {"@id": "#dup", "@type": "Offer", "price": 10.00, "priceCurrency": "USD"},
    {"@id": "#dup", "@type": "Offer", "price": 99.99, "priceCurrency": "EUR"}
  ]
}</script></head></html>"##;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");

    let warned_duplicate = extracted
        .warnings
        .iter()
        .any(|w| w.code == WarningCode::DuplicateId);
    assert!(warned_duplicate);

    let product = &extracted.nodes[0];
    let SchemaValue::Node(offer) = &product.properties["offers"][0] else {
        panic!("Expected resolved Offer node");
    };
    assert_eq!(
        offer.properties["price"],
        vec![SchemaValue::Number(10.0)],
        "first @id definition should win"
    );
    assert_eq!(
        offer.properties["priceCurrency"],
        vec![SchemaValue::Text("USD".into())],
        "first @id definition should win"
    );
}
1189
#[test]
fn json_root_is_string_warns() {
    // The script holds valid JSON whose root is a bare string rather than
    // an object or array.
    let page = r#"<html><head><script type="application/ld+json">"just a string"</script></head></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    assert!(extracted.nodes.is_empty());

    let first_code = extracted.warnings.first().map(|w| &w.code);
    assert_eq!(first_code, Some(&WarningCode::MalformedJsonLd));
}
1202
#[test]
fn json_root_is_number_warns() {
    // The script holds valid JSON whose root is a bare number.
    let page = r#"<html><head><script type="application/ld+json">42</script></head></html>"#;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    assert!(extracted.nodes.is_empty());

    let first_code = extracted.warnings.first().map(|w| &w.code);
    assert_eq!(first_code, Some(&WarningCode::MalformedJsonLd));
}
1210
#[test]
fn external_uri_id_no_warning() {
    // `manufacturer` points at an absolute URI that is not defined in the
    // document; that must not be reported as an unresolvable reference.
    let page = r##"<html><head><script type="application/ld+json">{
  "@context": "https://schema.org",
  "@type": "Product",
  "name": "Widget",
  "manufacturer": {"@id": "https://example.com/org/1"}
}</script></head></html>"##;

    let extracted = JsonLdExtractor.extract(page).expect("extraction failed");
    let none_unresolvable = extracted
        .warnings
        .iter()
        .all(|w| w.code != WarningCode::UnresolvableReference);
    assert!(
        none_unresolvable,
        "external @id URIs should not trigger warnings"
    );
}
1229}