1use std::collections::{HashMap, HashSet};
22use std::sync::OnceLock;
23
24use indexmap::IndexMap;
25use scraper::{Html, Selector};
26use serde_json::Value;
27
28use crate::error::{ExtractionError, ExtractionWarning, WarningCode};
29use crate::types::{SchemaNode, SchemaValue, SourceFormat, SourceLocation};
30
31use super::{classify_text_value, strip_schema_prefix, ExtractionOutput, Extractor};
32
/// Deepest object nesting level accepted while converting JSON into nodes.
/// Objects nested deeper than this are skipped with a warning. The same bound
/// limits the walk that collects referenced `@id`s.
const MAX_DEPTH: usize = 20;

/// Deepest recursion level visited while replacing `@id` references with the
/// nodes they point to; anything deeper is left untouched.
const MAX_REF_DEPTH: usize = 10;

/// Upper bound on the number of reference substitutions performed by a single
/// `resolve_references` call.
const MAX_REF_RESOLUTIONS: usize = 50;
47
/// Extractor for JSON-LD structured data embedded in
/// `<script type="application/ld+json">` elements.
pub struct JsonLdExtractor;
67
68impl Extractor for JsonLdExtractor {
69 fn extract(&self, html: &str) -> Result<ExtractionOutput, ExtractionError> {
70 let document = Html::parse_document(html);
71 self.extract_from_document(&document, html)
72 }
73}
74
75impl JsonLdExtractor {
76 pub fn extract_from_document(
91 &self,
92 document: &Html,
93 html: &str,
94 ) -> Result<ExtractionOutput, ExtractionError> {
95 static SELECTOR: OnceLock<Selector> = OnceLock::new();
96 let selector = SELECTOR.get_or_init(|| {
97 Selector::parse("script[type=\"application/ld+json\"]")
98 .expect("static JSON-LD selector must parse")
99 });
100
101 let line_index = LineIndex::new(html);
102 let script_offsets = find_script_byte_offsets(html);
103
104 let mut all_nodes = Vec::new();
105 let mut warnings = Vec::new();
106
107 for (idx, element) in document.select(selector).enumerate() {
108 let json_text = element.inner_html();
109 let trimmed = json_text.trim();
110 let source_location = script_offsets
111 .get(idx)
112 .map(|&offset| line_index.location(offset));
113
114 if trimmed.is_empty() {
115 warnings.push(ExtractionWarning {
116 message: "empty JSON-LD script tag".into(),
117 source_location,
118 code: WarningCode::MalformedJsonLd,
119 });
120 continue;
121 }
122
123 let value: Value = match serde_json::from_str(trimmed) {
124 Ok(v) => v,
125 Err(e) => {
126 warnings.push(ExtractionWarning {
127 message: format!("failed to parse JSON-LD: {e}"),
128 source_location,
129 code: WarningCode::MalformedJsonLd,
130 });
131 continue;
132 }
133 };
134
135 let items = extract_json_items(&value, source_location.as_ref(), &mut warnings);
136 all_nodes.extend(items);
137 }
138
139 let mut id_to_index: HashMap<String, usize> = HashMap::new();
143 for (i, node) in all_nodes.iter().enumerate() {
144 if let Some(id) = node.id() {
145 match id_to_index.entry(id.to_owned()) {
146 std::collections::hash_map::Entry::Occupied(_) => {
147 warnings.push(ExtractionWarning {
148 message: format!("duplicate @id: {id}"),
149 source_location: node.source_location.clone(),
150 code: WarningCode::DuplicateId,
151 });
152 }
153 std::collections::hash_map::Entry::Vacant(entry) => {
154 entry.insert(i);
155 }
156 }
157 }
158 }
159
160 let referenced = collect_referenced_ids(&all_nodes);
162 let id_map: HashMap<String, SchemaNode> = referenced
163 .iter()
164 .filter_map(|id| {
165 let &idx = id_to_index.get(id.as_str())?;
166 Some((id.clone(), all_nodes[idx].clone()))
167 })
168 .collect();
169
170 resolve_references(&mut all_nodes, &id_map, &mut warnings);
172
173 Ok(ExtractionOutput {
174 nodes: all_nodes,
175 warnings,
176 })
177 }
178}
179
180fn extract_json_items(
183 value: &Value,
184 source_location: Option<&SourceLocation>,
185 warnings: &mut Vec<ExtractionWarning>,
186) -> Vec<SchemaNode> {
187 match value {
188 Value::Array(items) => items
189 .iter()
190 .filter_map(|item| json_to_node(item, None, source_location, warnings, 0))
191 .collect(),
192
193 Value::Object(map) => {
194 if let Some(Value::Array(graph_items)) = map.get("@graph") {
195 let context = map.get("@context");
196 graph_items
197 .iter()
198 .filter_map(|item| json_to_node(item, context, source_location, warnings, 0))
199 .collect()
200 } else {
201 json_to_node(value, None, source_location, warnings, 0)
202 .into_iter()
203 .collect()
204 }
205 }
206
207 _ => {
208 warnings.push(ExtractionWarning {
209 message: "JSON-LD root must be an object or array".into(),
210 source_location: source_location.cloned(),
211 code: WarningCode::MalformedJsonLd,
212 });
213 Vec::new()
214 }
215 }
216}
217
218fn json_to_node(
222 value: &Value,
223 parent_context: Option<&Value>,
224 source_location: Option<&SourceLocation>,
225 warnings: &mut Vec<ExtractionWarning>,
226 depth: usize,
227) -> Option<SchemaNode> {
228 if depth > MAX_DEPTH {
229 warnings.push(ExtractionWarning {
230 message: format!("JSON-LD nesting depth exceeds {MAX_DEPTH}, skipping"),
231 source_location: source_location.cloned(),
232 code: WarningCode::MalformedJsonLd,
233 });
234 return None;
235 }
236 let obj = value.as_object()?;
237
238 let context = obj.get("@context").or(parent_context);
240
241 let types = extract_types(obj);
243
244 if types.is_empty() {
246 let non_meta_keys = obj.keys().filter(|k| !k.starts_with('@')).count();
247 let is_reference = obj.contains_key("@id") && non_meta_keys == 0;
248 if !is_reference && !obj.is_empty() {
249 warnings.push(ExtractionWarning {
250 message: "JSON-LD object has no @type".into(),
251 source_location: source_location.cloned(),
252 code: WarningCode::EmptyType,
253 });
254 }
255 }
256
257 let mut properties: IndexMap<String, Vec<SchemaValue>> = IndexMap::new();
259
260 for (key, val) in obj {
261 if key == "@context" || key == "@type" {
262 continue;
263 }
264
265 if key == "@id" {
266 if let Value::String(id) = val {
267 properties
268 .entry(key.clone())
269 .or_default()
270 .push(classify_text_value(id));
271 }
272 continue;
273 }
274
275 let values = json_to_schema_values(val, context, source_location, warnings, depth);
276 if !values.is_empty() {
277 properties.entry(key.clone()).or_default().extend(values);
278 }
279 }
280
281 Some(SchemaNode {
282 types,
283 properties,
284 source_format: SourceFormat::JsonLd,
285 source_location: source_location.cloned(),
286 })
287}
288
289fn extract_types(obj: &serde_json::Map<String, Value>) -> Vec<String> {
291 match obj.get("@type") {
292 Some(Value::String(t)) => vec![strip_schema_prefix(t).into_owned()],
293 Some(Value::Array(arr)) => arr
294 .iter()
295 .filter_map(|v| v.as_str())
296 .map(|s| strip_schema_prefix(s).into_owned())
297 .collect(),
298 _ => Vec::new(),
299 }
300}
301
302fn json_to_schema_values(
304 value: &Value,
305 context: Option<&Value>,
306 source_location: Option<&SourceLocation>,
307 warnings: &mut Vec<ExtractionWarning>,
308 depth: usize,
309) -> Vec<SchemaValue> {
310 match value {
311 Value::Null => Vec::new(),
312 Value::Bool(b) => vec![SchemaValue::Boolean(*b)],
313 Value::Number(n) => n
314 .as_f64()
315 .map(|f| vec![SchemaValue::Number(f)])
316 .unwrap_or_default(),
317 Value::String(s) => vec![classify_text_value(s)],
318 Value::Array(arr) => arr
319 .iter()
320 .flat_map(|v| json_to_schema_values(v, context, source_location, warnings, depth))
321 .collect(),
322 Value::Object(_) => json_to_node(value, context, source_location, warnings, depth + 1)
323 .map(|node| vec![SchemaValue::Node(Box::new(node))])
324 .unwrap_or_default(),
325 }
326}
327
328fn resolve_references(
334 nodes: &mut [SchemaNode],
335 id_map: &HashMap<String, SchemaNode>,
336 warnings: &mut Vec<ExtractionWarning>,
337) {
338 let mut resolution_count: usize = 0;
339 for node in nodes.iter_mut() {
340 resolve_node_refs(node, id_map, warnings, 0, &mut resolution_count);
341 }
342}
343
344fn resolve_node_refs(
349 node: &mut SchemaNode,
350 id_map: &HashMap<String, SchemaNode>,
351 warnings: &mut Vec<ExtractionWarning>,
352 depth: usize,
353 resolution_count: &mut usize,
354) {
355 if depth > MAX_REF_DEPTH {
356 return;
357 }
358
359 for values in node.properties.values_mut() {
360 for value in values.iter_mut() {
361 if let SchemaValue::Node(inner) = value {
362 if inner.types.is_empty() {
364 if let Some(id_values) = inner.properties.get("@id") {
365 if let Some(SchemaValue::Text(id)) = id_values.first() {
366 if *resolution_count >= MAX_REF_RESOLUTIONS {
367 continue;
368 }
369 if let Some(resolved) = id_map.get(id.as_str()) {
370 let has_content =
371 !resolved.types.is_empty() || resolved.properties.len() > 1;
372 if has_content {
373 *resolution_count += 1;
374 *value = SchemaValue::Node(Box::new(resolved.clone()));
375 if let SchemaValue::Node(ref mut n) = value {
376 resolve_node_refs(
377 n,
378 id_map,
379 warnings,
380 depth + 1,
381 resolution_count,
382 );
383 }
384 continue;
385 }
386 }
387 if id.starts_with('#') {
391 warnings.push(ExtractionWarning {
392 message: format!("unresolvable @id reference: {id}"),
393 source_location: inner.source_location.clone(),
394 code: WarningCode::UnresolvableReference,
395 });
396 }
397 continue;
398 }
399 }
400 }
401 resolve_node_refs(inner, id_map, warnings, depth + 1, resolution_count);
403 }
404 }
405 }
406}
407
408fn collect_referenced_ids(nodes: &[SchemaNode]) -> HashSet<String> {
414 let mut refs = HashSet::new();
415 for node in nodes {
416 collect_refs_in_node(node, &mut refs, 0);
417 }
418 refs
419}
420
421fn collect_refs_in_node(node: &SchemaNode, refs: &mut HashSet<String>, depth: usize) {
426 if depth > MAX_DEPTH {
427 return;
428 }
429 for values in node.properties.values() {
430 for value in values {
431 if let SchemaValue::Node(inner) = value {
432 if inner.types.is_empty() {
433 if let Some(id_values) = inner.properties.get("@id") {
434 if let Some(SchemaValue::Text(id)) = id_values.first() {
435 refs.insert(id.clone());
436 continue;
437 }
438 }
439 }
440 collect_refs_in_node(inner, refs, depth + 1);
441 }
442 }
443 }
444}
445
446struct LineIndex {
449 line_starts: Vec<usize>,
450}
451
452impl LineIndex {
453 fn new(text: &str) -> Self {
454 let mut line_starts = vec![0];
455 for (i, byte) in text.bytes().enumerate() {
456 if byte == b'\n' {
457 line_starts.push(i + 1);
458 }
459 }
460 Self { line_starts }
461 }
462
463 fn location(&self, byte_offset: usize) -> SourceLocation {
464 let line = self
465 .line_starts
466 .partition_point(|&start| start <= byte_offset)
467 .saturating_sub(1);
468 let column = byte_offset.saturating_sub(self.line_starts[line]);
469 SourceLocation {
470 line: line + 1,
471 column: column + 1,
472 byte_offset,
473 }
474 }
475}
476
/// Finds the byte offset of every `<script ...>` start tag in `html` whose
/// attributes mention `application/ld+json`, in document order.
///
/// This is a lightweight textual scan over the raw source. An occurrence of
/// the media type is counted only when it sits inside a still-open `script`
/// start tag. Consequently:
///
/// * mentions of the media type inside a script body (or any other text after
///   the tag was closed) do not produce duplicate offsets,
/// * other elements that merely contain the word "script" before the media
///   type are ignored,
/// * the tag name is matched ASCII case-insensitively (`<SCRIPT ...>`).
fn find_script_byte_offsets(html: &str) -> Vec<usize> {
    const PATTERN: &str = "application/ld+json";

    let mut offsets = Vec::new();
    let mut search_from = 0;

    while let Some(pos) = html[search_from..].find(PATTERN) {
        let abs_pos = search_from + pos;
        search_from = abs_pos + PATTERN.len();

        // The closest `<` before the match is the candidate tag start.
        let Some(tag_start) = html[..abs_pos].rfind('<') else {
            continue;
        };
        if is_open_script_tag_prefix(&html[tag_start..abs_pos]) {
            offsets.push(tag_start);
        }
    }

    offsets
}

/// Returns `true` when `tag_prefix` (text starting at a `<`) is the beginning
/// of a `script` start tag that has not been closed by an unquoted `>` yet.
fn is_open_script_tag_prefix(tag_prefix: &str) -> bool {
    const TAG_OPEN: &[u8] = b"<script";

    let bytes = tag_prefix.as_bytes();
    let Some(rest) = bytes.get(TAG_OPEN.len()..) else {
        return false;
    };
    if !bytes[..TAG_OPEN.len()].eq_ignore_ascii_case(TAG_OPEN) {
        return false;
    }
    // The tag name must end here, otherwise this is a different element
    // (e.g. `<scripted-thing>`).
    if !matches!(rest.first(), Some(b) if b.is_ascii_whitespace() || *b == b'/') {
        return false;
    }

    // A `>` only ends the tag when it is outside a quoted attribute value.
    let mut open_quote: Option<u8> = None;
    for &byte in rest {
        match (open_quote, byte) {
            (Some(quote), b) if b == quote => open_quote = None,
            (Some(_), _) => {}
            (None, b'"' | b'\'') => open_quote = Some(byte),
            (None, b'>') => return false,
            (None, _) => {}
        }
    }
    true
}
495
#[cfg(test)]
mod tests {
    use pretty_assertions::assert_eq;

    use super::*;

    /// Builds a page with a single JSON-LD object whose deepest nested object
    /// sits at nesting depth `deepest` (the root object is depth 0).
    fn nested_things_html(deepest: usize) -> String {
        let mut json =
            String::from(r#"{"@context":"https://schema.org","@type":"Thing","name":"L0""#);
        for i in 1..=deepest {
            json.push_str(&format!(r#","p{i}":{{"@type":"Thing","name":"L{i}""#));
        }
        // One closing brace per nested object plus one for the root.
        json.push_str(&"}".repeat(deepest + 1));

        format!(
            r#"<html><head><script type="application/ld+json">{json}</script></head></html>"#
        )
    }

    // Offsets map to 1-based line/column pairs.
    #[test]
    fn line_index_positions() {
        let idx = LineIndex::new("line1\nline2\nline3");
        let loc = idx.location(0);
        assert_eq!((loc.line, loc.column), (1, 1));
        let loc = idx.location(6);
        assert_eq!((loc.line, loc.column), (2, 1));
        let loc = idx.location(8);
        assert_eq!((loc.line, loc.column), (2, 3));
    }

    // The reported offset points at the `<` of the script tag.
    #[test]
    fn find_script_offsets() {
        let html =
            r#"<html><script type="application/ld+json">{"@type":"Product"}</script></html>"#;
        let offsets = find_script_byte_offsets(html);
        assert_eq!(offsets.len(), 1);
        assert!(html[offsets[0]..].starts_with("<script"));
    }

    // Plain strings become Text, absolute URLs become Url.
    #[test]
    fn basic_product() {
        let html = r#"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@type": "Product",
    "name": "Example Product",
    "url": "https://example.com/product"
}</script></head></html>"#;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 1);
        assert_eq!(out.nodes[0].types, vec!["Product"]);
        assert_eq!(out.nodes[0].source_format, SourceFormat::JsonLd);
        assert_eq!(
            out.nodes[0].properties["name"],
            vec![SchemaValue::Text("Example Product".into())]
        );
        assert_eq!(
            out.nodes[0].properties["url"],
            vec![SchemaValue::Url("https://example.com/product".into())]
        );
    }

    // Every `@graph` item becomes its own top-level node.
    #[test]
    fn graph_extraction() {
        let html = r#"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@graph": [
        {"@type": "Organization", "name": "Acme"},
        {"@type": "WebSite", "name": "Acme Site"}
    ]
}</script></head></html>"#;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 2);
        assert_eq!(out.nodes[0].types, vec!["Organization"]);
        assert_eq!(out.nodes[1].types, vec!["WebSite"]);
    }

    // An array-valued `@type` keeps all names in order.
    #[test]
    fn array_type() {
        let html = r#"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@type": ["Product", "IndividualProduct"],
    "name": "Widget"
}</script></head></html>"#;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes[0].types, vec!["Product", "IndividualProduct"]);
    }

    // Object-valued properties become nested nodes.
    #[test]
    fn nested_object() {
        let html = r#"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@type": "Product",
    "name": "Widget",
    "offers": {
        "@type": "Offer",
        "price": 19.99,
        "priceCurrency": "USD"
    }
}</script></head></html>"#;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 1);
        let offers = &out.nodes[0].properties["offers"];
        assert_eq!(offers.len(), 1);
        if let SchemaValue::Node(offer) = &offers[0] {
            assert_eq!(offer.types, vec!["Offer"]);
            assert_eq!(offer.properties["price"], vec![SchemaValue::Number(19.99)]);
            assert_eq!(
                offer.properties["priceCurrency"],
                vec![SchemaValue::Text("USD".into())]
            );
        } else {
            panic!("Expected nested Node");
        }
    }

    // A `{"@id": ...}` reference is replaced by the node defining that id.
    #[test]
    fn id_cross_reference() {
        let html = r##"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@graph": [
        {"@type": "Product", "name": "Widget", "offers": {"@id": "#offer1"}},
        {"@id": "#offer1", "@type": "Offer", "price": 29.99}
    ]
}</script></head></html>"##;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 2);
        let offers = &out.nodes[0].properties["offers"];
        if let SchemaValue::Node(offer) = &offers[0] {
            assert_eq!(offer.types, vec!["Offer"]);
            assert_eq!(offer.properties["price"], vec![SchemaValue::Number(29.99)]);
        } else {
            panic!("Expected resolved Node, got {:?}", offers[0]);
        }
    }

    // Unparseable JSON is reported as a warning, not an error.
    #[test]
    fn malformed_json_is_warning() {
        let html =
            r#"<html><head><script type="application/ld+json">{ invalid }</script></head></html>"#;
        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert!(out.nodes.is_empty());
        assert_eq!(out.warnings.len(), 1);
        assert_eq!(out.warnings[0].code, WarningCode::MalformedJsonLd);
    }

    #[test]
    fn empty_script_tag() {
        let html = r#"<html><head><script type="application/ld+json"></script></head></html>"#;
        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert!(out.nodes.is_empty());
        assert_eq!(out.warnings[0].code, WarningCode::MalformedJsonLd);
    }

    // Nodes from several script tags are returned in document order.
    #[test]
    fn multiple_script_tags() {
        let html = concat!(
            r#"<html><head>"#,
            "\n",
            r#"<script type="application/ld+json">"#,
            r#"{"@context":"https://schema.org","@type":"Product","name":"A"}"#,
            r#"</script>"#,
            "\n",
            r#"<script type="application/ld+json">"#,
            r#"{"@context":"https://schema.org","@type":"Article","name":"B"}"#,
            r#"</script>"#,
            "\n",
            r#"</head></html>"#,
        );

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 2);
        assert_eq!(out.nodes[0].types, vec!["Product"]);
        assert_eq!(out.nodes[1].types, vec!["Article"]);
    }

    #[test]
    fn top_level_array() {
        let html = r#"<html><head><script type="application/ld+json">[
    {"@context":"https://schema.org","@type":"Product","name":"A"},
    {"@context":"https://schema.org","@type":"Article","name":"B"}
]</script></head></html>"#;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 2);
        assert_eq!(out.nodes[0].types, vec!["Product"]);
        assert_eq!(out.nodes[1].types, vec!["Article"]);
    }

    #[test]
    fn boolean_and_number_values() {
        let html = r#"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@type": "Product",
    "isFamilyFriendly": true,
    "weight": 1.5
}</script></head></html>"#;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(
            out.nodes[0].properties["isFamilyFriendly"],
            vec![SchemaValue::Boolean(true)]
        );
        assert_eq!(
            out.nodes[0].properties["weight"],
            vec![SchemaValue::Number(1.5)]
        );
    }

    // A fragment reference without a definition produces a warning.
    #[test]
    fn unresolvable_reference_warns() {
        let html = r##"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@type": "Product",
    "offers": {"@id": "#nonexistent"}
}</script></head></html>"##;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert!(out
            .warnings
            .iter()
            .any(|w| w.code == WarningCode::UnresolvableReference));
    }

    // A full schema.org URI in `@type` is reduced to the bare type name.
    #[test]
    fn no_context_with_full_uri_type() {
        let html = r#"<html><head><script type="application/ld+json">{
    "@type": "https://schema.org/Product",
    "name": "Widget"
}</script></head></html>"#;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 1);
        assert_eq!(out.nodes[0].types, vec!["Product"]);
    }

    #[test]
    fn array_context() {
        let html = r#"<html><head><script type="application/ld+json">{
    "@context": ["https://schema.org", {"custom": "https://example.com/"}],
    "@type": "Product",
    "name": "Widget"
}</script></head></html>"#;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes[0].types, vec!["Product"]);
    }

    // Array-valued properties are flattened into multiple values.
    #[test]
    fn array_property_values() {
        let html = r#"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@type": "Product",
    "name": "Widget",
    "image": [
        "https://example.com/img1.jpg",
        "https://example.com/img2.jpg"
    ]
}</script></head></html>"#;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes[0].properties["image"].len(), 2);
        assert_eq!(
            out.nodes[0].properties["image"][0],
            SchemaValue::Url("https://example.com/img1.jpg".into())
        );
    }

    // `null` properties are omitted entirely.
    #[test]
    fn null_values_are_skipped() {
        let html = r#"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@type": "Product",
    "name": "Widget",
    "description": null
}</script></head></html>"#;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 1);
        assert!(!out.nodes[0].properties.contains_key("description"));
    }

    // Integers are represented as floating-point numbers.
    #[test]
    fn integer_numbers() {
        let html = r#"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@type": "Product",
    "name": "Widget",
    "ratingCount": 42
}</script></head></html>"#;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(
            out.nodes[0].properties["ratingCount"],
            vec![SchemaValue::Number(42.0)]
        );
    }

    #[test]
    fn graph_context_inherited_by_children() {
        let html = r#"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@graph": [
        {"@type": "Product", "name": "A"},
        {"@type": "https://schema.org/Article", "name": "B"}
    ]
}</script></head></html>"#;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 2);
        assert_eq!(out.nodes[0].types, vec!["Product"]);
        assert_eq!(out.nodes[1].types, vec!["Article"]);
    }

    #[test]
    fn duplicate_id_warns() {
        let html = r##"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@graph": [
        {"@id": "#thing", "@type": "Product", "name": "First"},
        {"@id": "#thing", "@type": "Article", "name": "Second"}
    ]
}</script></head></html>"##;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert!(out
            .warnings
            .iter()
            .any(|w| w.code == WarningCode::DuplicateId));
    }

    // Nesting several levels deep is preserved level by level.
    #[test]
    fn deeply_nested_objects() {
        let html = r#"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@type": "Product",
    "name": "Widget",
    "offers": {
        "@type": "Offer",
        "seller": {
            "@type": "Organization",
            "address": {
                "@type": "PostalAddress",
                "addressCountry": "US"
            }
        }
    }
}</script></head></html>"#;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 1);
        let offers = &out.nodes[0].properties["offers"];
        if let SchemaValue::Node(offer) = &offers[0] {
            let seller = &offer.properties["seller"];
            if let SchemaValue::Node(org) = &seller[0] {
                let address = &org.properties["address"];
                if let SchemaValue::Node(addr) = &address[0] {
                    assert_eq!(addr.types, vec!["PostalAddress"]);
                    assert_eq!(
                        addr.properties["addressCountry"],
                        vec![SchemaValue::Text("US".into())]
                    );
                } else {
                    panic!("Expected PostalAddress node");
                }
            } else {
                panic!("Expected Organization node");
            }
        } else {
            panic!("Expected Offer node");
        }
    }

    #[test]
    fn whitespace_only_script() {
        let html = r#"<html><head><script type="application/ld+json">

  </script></head></html>"#;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert!(out.nodes.is_empty());
        assert_eq!(out.warnings.len(), 1);
        assert_eq!(out.warnings[0].code, WarningCode::MalformedJsonLd);
    }

    // The location of a node is the position of its `<script` tag.
    #[test]
    fn source_location_is_set() {
        let html = concat!(
            "<html><head>\n",
            "<script type=\"application/ld+json\">\n",
            "{\"@type\":\"Product\",\"name\":\"A\"}\n",
            "</script>\n",
            "</head></html>",
        );

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 1);
        let loc = out.nodes[0]
            .source_location
            .as_ref()
            .expect("missing source location");
        assert_eq!(loc.line, 2);
        // The tag opens the second line, right after the 13-byte first line.
        assert_eq!(loc.column, 1);
        assert_eq!(loc.byte_offset, 13);
    }

    #[test]
    fn multiple_types_with_uri_prefix() {
        let html = r#"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@type": ["https://schema.org/Product", "http://schema.org/IndividualProduct"],
    "name": "Widget"
}</script></head></html>"#;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes[0].types, vec!["Product", "IndividualProduct"]);
    }

    #[test]
    fn schema_node_id_accessor() {
        let html = r##"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@id": "#product1",
    "@type": "Product",
    "name": "Widget"
}</script></head></html>"##;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes[0].id(), Some("#product1"));
    }

    // Pages without JSON-LD yield neither nodes nor warnings.
    #[test]
    fn no_structured_data() {
        let html = r#"<html><head><title>No structured data</title></head>
<body><p>Hello world</p></body></html>"#;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert!(out.nodes.is_empty());
        assert!(out.warnings.is_empty());
    }

    // Trailing commas are invalid JSON and therefore rejected.
    #[test]
    fn json_ld_with_trailing_comma() {
        let html = r#"<html><head><script type="application/ld+json">{
    "@type": "Product",
    "name": "Widget",
}</script></head></html>"#;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert!(out.nodes.is_empty());
        assert_eq!(out.warnings[0].code, WarningCode::MalformedJsonLd);
    }

    // Mutually referencing nodes must terminate.
    #[test]
    fn circular_id_references_do_not_loop() {
        let html = r##"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@graph": [
        {"@id": "#a", "@type": "Product", "name": "A", "isRelatedTo": {"@id": "#b"}},
        {"@id": "#b", "@type": "Article", "name": "B", "isRelatedTo": {"@id": "#a"}}
    ]
}</script></head></html>"##;

        let out = JsonLdExtractor.extract(html).expect("must not hang");
        assert_eq!(out.nodes.len(), 2);
    }

    #[test]
    fn self_referencing_id_does_not_loop() {
        let html = r##"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@graph": [
        {"@id": "#self", "@type": "Product", "name": "Me", "isRelatedTo": {"@id": "#self"}}
    ]
}</script></head></html>"##;

        let out = JsonLdExtractor.extract(html).expect("must not hang");
        assert_eq!(out.nodes.len(), 1);
    }

    #[test]
    fn empty_id_string() {
        let html = r##"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@id": "",
    "@type": "Product",
    "name": "Widget"
}</script></head></html>"##;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 1);
        assert_eq!(out.nodes[0].id(), Some(""));
    }

    // The limit is inclusive: an object at depth == MAX_DEPTH is accepted.
    #[test]
    fn nesting_at_exactly_max_depth_succeeds() {
        let html = nested_things_html(MAX_DEPTH);

        let out = JsonLdExtractor.extract(&html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 1);
        assert!(
            !out.warnings.iter().any(|w| w.message.contains("depth")),
            "should not warn at MAX_DEPTH"
        );
    }

    // The first depth past the limit is skipped with a warning.
    #[test]
    fn nesting_beyond_max_depth_warns() {
        let html = nested_things_html(MAX_DEPTH + 1);

        let out = JsonLdExtractor.extract(&html).expect("extraction failed");
        assert!(
            out.warnings.iter().any(|w| w.message.contains("depth")),
            "should warn when exceeding MAX_DEPTH"
        );
    }

    #[test]
    fn type_is_number_ignored() {
        let html = r#"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@type": 42,
    "name": "Widget"
}</script></head></html>"#;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 1);
        assert!(out.nodes[0].types.is_empty());
        assert!(out
            .warnings
            .iter()
            .any(|w| w.code == WarningCode::EmptyType));
    }

    #[test]
    fn type_is_object_ignored() {
        let html = r#"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@type": {"invalid": true},
    "name": "Widget"
}</script></head></html>"#;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 1);
        assert!(out.nodes[0].types.is_empty());
        // Same contract as for the other unusable `@type` shapes.
        assert!(out
            .warnings
            .iter()
            .any(|w| w.code == WarningCode::EmptyType));
    }

    #[test]
    fn type_empty_array() {
        let html = r#"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@type": [],
    "name": "Widget"
}</script></head></html>"#;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 1);
        assert!(out.nodes[0].types.is_empty());
        assert!(out
            .warnings
            .iter()
            .any(|w| w.code == WarningCode::EmptyType));
    }

    // Non-string entries of an `@type` array are dropped.
    #[test]
    fn type_array_with_mixed_values() {
        let html = r#"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@type": [42, "Product", null, "IndividualProduct"],
    "name": "Widget"
}</script></head></html>"#;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes[0].types, vec!["Product", "IndividualProduct"]);
    }

    #[test]
    fn non_schema_org_context_still_extracts() {
        let html = r#"<html><head><script type="application/ld+json">{
    "@context": "https://w3.org/ns/activitystreams",
    "@type": "Note",
    "content": "Hello"
}</script></head></html>"#;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 1);
        assert_eq!(out.nodes[0].types, vec!["Note"]);
        assert_eq!(
            out.nodes[0].properties["content"],
            vec![SchemaValue::Text("Hello".into())]
        );
    }

    // NOTE(review): the payload contains a bare `&`; confirm whether an
    // `&amp;` entity was intended here given the test name.
    #[test]
    fn html_entities_in_script_content() {
        let html = r#"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@type": "Product",
    "name": "Widget & Gadget"
}</script></head></html>"#;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 1);
    }

    // One definition may be substituted into several referencing properties.
    #[test]
    fn multiple_references_to_same_id() {
        let html = r##"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@graph": [
        {
            "@type": "Product", "name": "Widget",
            "offers": {"@id": "#offer"},
            "makesOffer": {"@id": "#offer"},
            "hasOfferCatalog": {"@id": "#offer"}
        },
        {"@id": "#offer", "@type": "Offer", "price": 9.99}
    ]
}</script></head></html>"##;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert_eq!(out.nodes.len(), 2);
        for prop in &["offers", "makesOffer", "hasOfferCatalog"] {
            let values = &out.nodes[0].properties[*prop];
            if let SchemaValue::Node(node) = &values[0] {
                assert_eq!(node.types, vec!["Offer"]);
            } else {
                panic!("Expected resolved Node for {prop}");
            }
        }
    }

    #[test]
    fn duplicate_id_first_definition_wins() {
        let html = r##"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@graph": [
        {"@type": "Product", "name": "P", "offers": {"@id": "#dup"}},
        {"@id": "#dup", "@type": "Offer", "price": 10.00, "priceCurrency": "USD"},
        {"@id": "#dup", "@type": "Offer", "price": 99.99, "priceCurrency": "EUR"}
    ]
}</script></head></html>"##;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert!(out
            .warnings
            .iter()
            .any(|w| w.code == WarningCode::DuplicateId));
        let offers = &out.nodes[0].properties["offers"];
        if let SchemaValue::Node(offer) = &offers[0] {
            assert_eq!(
                offer.properties["price"],
                vec![SchemaValue::Number(10.0)],
                "first @id definition should win"
            );
            assert_eq!(
                offer.properties["priceCurrency"],
                vec![SchemaValue::Text("USD".into())],
                "first @id definition should win"
            );
        } else {
            panic!("Expected resolved Offer node");
        }
    }

    // Scalar roots are valid JSON but not valid JSON-LD documents.
    #[test]
    fn json_root_is_string_warns() {
        let html = concat!(
            r#"<html><head>"#,
            r#"<script type="application/ld+json">"#,
            r#""just a string""#,
            r#"</script></head></html>"#,
        );
        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert!(out.nodes.is_empty());
        assert_eq!(out.warnings[0].code, WarningCode::MalformedJsonLd);
    }

    #[test]
    fn json_root_is_number_warns() {
        let html = r#"<html><head><script type="application/ld+json">42</script></head></html>"#;
        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert!(out.nodes.is_empty());
        assert_eq!(out.warnings[0].code, WarningCode::MalformedJsonLd);
    }

    #[test]
    fn external_uri_id_no_warning() {
        let html = r##"<html><head><script type="application/ld+json">{
    "@context": "https://schema.org",
    "@type": "Product",
    "name": "Widget",
    "manufacturer": {"@id": "https://example.com/org/1"}
}</script></head></html>"##;

        let out = JsonLdExtractor.extract(html).expect("extraction failed");
        assert!(
            !out.warnings
                .iter()
                .any(|w| w.code == WarningCode::UnresolvableReference),
            "external @id URIs should not trigger warnings"
        );
    }
}