1use std::collections::{HashMap, HashSet};
22use std::sync::OnceLock;
23
24use indexmap::IndexMap;
25use scraper::{Html, Selector};
26use serde_json::Value;
27
28use crate::error::{ExtractionError, ExtractionWarning, WarningCode};
29use crate::types::{SchemaNode, SchemaValue, SourceFormat, SourceLocation};
30
31use super::{classify_text_value, strip_schema_prefix, ExtractionOutput, Extractor};
32
/// Deepest object nesting `json_to_node` will convert; deeper objects are
/// dropped with a warning. Also bounds the walk in `collect_refs_in_node`.
const MAX_DEPTH: usize = 20;

/// Deepest recursion `resolve_node_refs` will follow while expanding `@id`
/// references; beyond this the walk stops silently.
const MAX_REF_DEPTH: usize = 10;

/// Upper bound on the number of `@id` references replaced during a single
/// `resolve_references` call.
const MAX_REF_RESOLUTIONS: usize = 50;
47
48pub struct JsonLdExtractor;
67
68impl Extractor for JsonLdExtractor {
69 fn extract(&self, html: &str) -> Result<ExtractionOutput, ExtractionError> {
70 let document = Html::parse_document(html);
71 self.extract_from_document(&document, html)
72 }
73}
74
75impl JsonLdExtractor {
76 pub fn extract_from_document(
91 &self,
92 document: &Html,
93 html: &str,
94 ) -> Result<ExtractionOutput, ExtractionError> {
95 static SELECTOR: OnceLock<Selector> = OnceLock::new();
96 let selector = SELECTOR.get_or_init(|| {
97 Selector::parse("script[type=\"application/ld+json\"]")
98 .expect("static JSON-LD selector must parse")
99 });
100
101 let line_index = LineIndex::new(html);
102 let script_offsets = find_script_byte_offsets(html);
103
104 let mut all_nodes = Vec::new();
105 let mut warnings = Vec::new();
106
107 for (idx, element) in document.select(selector).enumerate() {
108 let json_text = element.inner_html();
109 let trimmed = json_text.trim();
110 let source_location = script_offsets
111 .get(idx)
112 .map(|&offset| line_index.location(offset));
113
114 if trimmed.is_empty() {
115 warnings.push(ExtractionWarning {
116 message: "empty JSON-LD script tag".into(),
117 source_location,
118 code: WarningCode::MalformedJsonLd,
119 });
120 continue;
121 }
122
123 let value: Value = match serde_json::from_str(trimmed) {
124 Ok(v) => v,
125 Err(e) => {
126 warnings.push(ExtractionWarning {
127 message: format!("failed to parse JSON-LD: {e}"),
128 source_location,
129 code: WarningCode::MalformedJsonLd,
130 });
131 continue;
132 }
133 };
134
135 let items = extract_json_items(&value, source_location.as_ref(), &mut warnings);
136 all_nodes.extend(items);
137 }
138
139 let mut id_to_index: HashMap<String, usize> = HashMap::new();
143 for (i, node) in all_nodes.iter().enumerate() {
144 if let Some(id) = node.id() {
145 match id_to_index.entry(id.to_owned()) {
146 std::collections::hash_map::Entry::Occupied(_) => {
147 warnings.push(ExtractionWarning {
148 message: format!("duplicate @id: {id}"),
149 source_location: node.source_location.clone(),
150 code: WarningCode::DuplicateId,
151 });
152 }
153 std::collections::hash_map::Entry::Vacant(entry) => {
154 entry.insert(i);
155 }
156 }
157 }
158 }
159
160 let referenced = collect_referenced_ids(&all_nodes);
162 let id_map: HashMap<String, SchemaNode> = referenced
163 .iter()
164 .filter_map(|id| {
165 let &idx = id_to_index.get(id.as_str())?;
166 Some((id.clone(), all_nodes[idx].clone()))
167 })
168 .collect();
169
170 resolve_references(&mut all_nodes, &id_map, &mut warnings);
172
173 Ok(ExtractionOutput {
174 nodes: all_nodes,
175 warnings,
176 })
177 }
178}
179
180fn extract_json_items(
183 value: &Value,
184 source_location: Option<&SourceLocation>,
185 warnings: &mut Vec<ExtractionWarning>,
186) -> Vec<SchemaNode> {
187 match value {
188 Value::Array(items) => items
189 .iter()
190 .filter_map(|item| json_to_node(item, None, source_location, warnings, 0))
191 .collect(),
192
193 Value::Object(map) => {
194 if let Some(Value::Array(graph_items)) = map.get("@graph") {
195 let context = map.get("@context");
196 graph_items
197 .iter()
198 .filter_map(|item| json_to_node(item, context, source_location, warnings, 0))
199 .collect()
200 } else {
201 json_to_node(value, None, source_location, warnings, 0)
202 .into_iter()
203 .collect()
204 }
205 }
206
207 _ => {
208 warnings.push(ExtractionWarning {
209 message: "JSON-LD root must be an object or array".into(),
210 source_location: source_location.cloned(),
211 code: WarningCode::MalformedJsonLd,
212 });
213 Vec::new()
214 }
215 }
216}
217
218fn json_to_node(
222 value: &Value,
223 parent_context: Option<&Value>,
224 source_location: Option<&SourceLocation>,
225 warnings: &mut Vec<ExtractionWarning>,
226 depth: usize,
227) -> Option<SchemaNode> {
228 if depth > MAX_DEPTH {
229 warnings.push(ExtractionWarning {
230 message: format!("JSON-LD nesting depth exceeds {MAX_DEPTH}, skipping"),
231 source_location: source_location.cloned(),
232 code: WarningCode::MalformedJsonLd,
233 });
234 return None;
235 }
236 let obj = value.as_object()?;
237
238 let context = obj.get("@context").or(parent_context);
240
241 let types = extract_types(obj);
243
244 if types.is_empty() {
246 let non_meta_keys = obj.keys().filter(|k| !k.starts_with('@')).count();
247 let is_reference = obj.contains_key("@id") && non_meta_keys == 0;
248 if !is_reference && !obj.is_empty() {
249 warnings.push(ExtractionWarning {
250 message: "JSON-LD object has no @type".into(),
251 source_location: source_location.cloned(),
252 code: WarningCode::EmptyType,
253 });
254 }
255 }
256
257 let mut properties: IndexMap<String, Vec<SchemaValue>> = IndexMap::new();
259
260 for (key, val) in obj {
261 if key == "@context" || key == "@type" {
262 continue;
263 }
264
265 if key == "@id" {
266 if let Value::String(id) = val {
267 properties
268 .entry(key.clone())
269 .or_default()
270 .push(classify_text_value(id));
271 }
272 continue;
273 }
274
275 let values = json_to_schema_values(val, context, source_location, warnings, depth);
276 if !values.is_empty() {
277 properties.entry(key.clone()).or_default().extend(values);
278 }
279 }
280
281 Some(SchemaNode {
282 types,
283 properties,
284 source_format: SourceFormat::JsonLd,
285 source_location: source_location.cloned(),
286 })
287}
288
289fn extract_types(obj: &serde_json::Map<String, Value>) -> Vec<String> {
291 match obj.get("@type") {
292 Some(Value::String(t)) => vec![strip_schema_prefix(t).into_owned()],
293 Some(Value::Array(arr)) => arr
294 .iter()
295 .filter_map(|v| v.as_str())
296 .map(|s| strip_schema_prefix(s).into_owned())
297 .collect(),
298 _ => Vec::new(),
299 }
300}
301
302fn json_to_schema_values(
304 value: &Value,
305 context: Option<&Value>,
306 source_location: Option<&SourceLocation>,
307 warnings: &mut Vec<ExtractionWarning>,
308 depth: usize,
309) -> Vec<SchemaValue> {
310 match value {
311 Value::Null => Vec::new(),
312 Value::Bool(b) => vec![SchemaValue::Boolean(*b)],
313 Value::Number(n) => n
314 .as_f64()
315 .map(|f| vec![SchemaValue::Number(f)])
316 .unwrap_or_default(),
317 Value::String(s) => vec![classify_text_value(s)],
318 Value::Array(arr) => arr
319 .iter()
320 .flat_map(|v| json_to_schema_values(v, context, source_location, warnings, depth))
321 .collect(),
322 Value::Object(_) => json_to_node(value, context, source_location, warnings, depth + 1)
323 .map(|node| vec![SchemaValue::Node(Box::new(node))])
324 .unwrap_or_default(),
325 }
326}
327
328fn resolve_references(
334 nodes: &mut [SchemaNode],
335 id_map: &HashMap<String, SchemaNode>,
336 warnings: &mut Vec<ExtractionWarning>,
337) {
338 let mut resolution_count: usize = 0;
339 for node in nodes.iter_mut() {
340 resolve_node_refs(node, id_map, warnings, 0, &mut resolution_count);
341 }
342}
343
344fn resolve_node_refs(
349 node: &mut SchemaNode,
350 id_map: &HashMap<String, SchemaNode>,
351 warnings: &mut Vec<ExtractionWarning>,
352 depth: usize,
353 resolution_count: &mut usize,
354) {
355 if depth > MAX_REF_DEPTH {
356 return;
357 }
358
359 for values in node.properties.values_mut() {
360 for value in values.iter_mut() {
361 if let SchemaValue::Node(inner) = value {
362 if inner.types.is_empty() {
364 if let Some(id_values) = inner.properties.get("@id") {
365 if let Some(SchemaValue::Text(id)) = id_values.first() {
366 if *resolution_count >= MAX_REF_RESOLUTIONS {
367 continue;
368 }
369 if let Some(resolved) = id_map.get(id.as_str()) {
370 let has_content =
371 !resolved.types.is_empty() || resolved.properties.len() > 1;
372 if has_content {
373 *resolution_count += 1;
374 *value = SchemaValue::Node(Box::new(resolved.clone()));
375 if let SchemaValue::Node(ref mut n) = value {
376 resolve_node_refs(
377 n,
378 id_map,
379 warnings,
380 depth + 1,
381 resolution_count,
382 );
383 }
384 continue;
385 }
386 }
387 if id.starts_with('#') {
391 warnings.push(ExtractionWarning {
392 message: format!("unresolvable @id reference: {id}"),
393 source_location: inner.source_location.clone(),
394 code: WarningCode::UnresolvableReference,
395 });
396 }
397 continue;
398 }
399 }
400 }
401 resolve_node_refs(inner, id_map, warnings, depth + 1, resolution_count);
403 }
404 }
405 }
406}
407
408fn collect_referenced_ids(nodes: &[SchemaNode]) -> HashSet<String> {
414 let mut refs = HashSet::new();
415 for node in nodes {
416 collect_refs_in_node(node, &mut refs, 0);
417 }
418 refs
419}
420
421fn collect_refs_in_node(node: &SchemaNode, refs: &mut HashSet<String>, depth: usize) {
426 if depth > MAX_DEPTH {
427 return;
428 }
429 for values in node.properties.values() {
430 for value in values {
431 if let SchemaValue::Node(inner) = value {
432 if inner.types.is_empty() {
433 if let Some(id_values) = inner.properties.get("@id") {
434 if let Some(SchemaValue::Text(id)) = id_values.first() {
435 refs.insert(id.clone());
436 continue;
437 }
438 }
439 }
440 collect_refs_in_node(inner, refs, depth + 1);
441 }
442 }
443 }
444}
445
446struct LineIndex {
449 line_starts: Vec<usize>,
450}
451
452impl LineIndex {
453 fn new(text: &str) -> Self {
454 let mut line_starts = vec![0];
455 for (i, byte) in text.bytes().enumerate() {
456 if byte == b'\n' {
457 line_starts.push(i + 1);
458 }
459 }
460 Self { line_starts }
461 }
462
463 fn location(&self, byte_offset: usize) -> SourceLocation {
464 let line = self
465 .line_starts
466 .partition_point(|&start| start <= byte_offset)
467 .saturating_sub(1);
468 let column = byte_offset.saturating_sub(self.line_starts[line]);
469 SourceLocation {
470 line: line + 1,
471 column: column + 1,
472 byte_offset,
473 }
474 }
475}
476
/// Returns the byte offset of the `<` of every `<script …>` opening tag whose
/// attributes mention `application/ld+json`, in document order.
///
/// This is a textual scan rather than an HTML parse. An occurrence of the MIME
/// string counts only when the nearest preceding `<` starts a `script` tag
/// (matched case-insensitively) and the tag has not been closed by an unquoted
/// `>` before the occurrence. Mentions inside script bodies, in other elements
/// such as `<noscript>` or `<link>`, or in tags that merely contain the word
/// "script" are ignored. Each tag is reported once even if the MIME string
/// appears in it several times.
///
/// The MIME string itself is matched case-sensitively and as a substring, so
/// the result can still differ from what an HTML parser selects in unusual
/// documents.
fn find_script_byte_offsets(html: &str) -> Vec<usize> {
    const PATTERN: &str = "application/ld+json";

    let mut offsets = Vec::new();
    let mut search_from = 0;

    while let Some(pos) = html[search_from..].find(PATTERN) {
        let abs_pos = search_from + pos;
        if let Some(tag_start) = html[..abs_pos].rfind('<') {
            let already_recorded = offsets.last() == Some(&tag_start);
            if !already_recorded && is_inside_script_open_tag(&html[tag_start..abs_pos]) {
                offsets.push(tag_start);
            }
        }
        search_from = abs_pos + PATTERN.len();
    }

    offsets
}

/// Reports whether `prefix` (text starting at a `<`) is the still-open
/// beginning of a `<script` tag.
fn is_inside_script_open_tag(prefix: &str) -> bool {
    const TAG_NAME: &str = "<script";

    // `get` also rejects prefixes that are too short or split a character.
    let Some(name) = prefix.get(..TAG_NAME.len()) else {
        return false;
    };
    if !name.eq_ignore_ascii_case(TAG_NAME) {
        return false;
    }

    // The tag name must end here, which excludes names like `<scripted`.
    let rest = &prefix[TAG_NAME.len()..];
    if !rest.starts_with(|c: char| c.is_ascii_whitespace() || c == '/') {
        return false;
    }

    // An unquoted `>` means the tag was already closed, so the match lies in
    // the element's content rather than in its attributes.
    let mut open_quote: Option<char> = None;
    for c in rest.chars() {
        match open_quote {
            Some(q) if c == q => open_quote = None,
            Some(_) => {}
            None if c == '"' || c == '\'' => open_quote = Some(c),
            None if c == '>' => return false,
            None => {}
        }
    }
    true
}
495
#[cfg(test)]
mod tests {
    use pretty_assertions::assert_eq;

    use super::*;

    /// Wraps `json` in the `<html><head><script …>` skeleton most tests share.
    fn page(json: &str) -> String {
        format!(
            r#"<html><head><script type="application/ld+json">{json}</script></head></html>"#
        )
    }

    /// Runs the extractor on a complete HTML document.
    fn extract_html(html: &str) -> ExtractionOutput {
        JsonLdExtractor.extract(html).expect("extraction failed")
    }

    /// Runs the extractor on `json` embedded in the shared page skeleton.
    fn extract_json(json: &str) -> ExtractionOutput {
        extract_html(&page(json))
    }

    /// True when any recorded warning carries `code`.
    fn has_warning(out: &ExtractionOutput, code: WarningCode) -> bool {
        out.warnings.iter().any(|w| w.code == code)
    }

    /// Builds a chain of `levels` nested `Thing` objects (the root included).
    fn nested_things(levels: usize) -> String {
        let mut json =
            String::from(r#"{"@context":"https://schema.org","@type":"Thing","name":"L0""#);
        for i in 1..levels {
            json.push_str(&format!(r#","p{i}":{{"@type":"Thing","name":"L{i}""#));
        }
        json.push_str(&"}".repeat(levels));
        json
    }

    #[test]
    fn line_index_positions() {
        let idx = LineIndex::new("line1\nline2\nline3");
        for (offset, expected) in [(0, (1, 1)), (6, (2, 1)), (8, (2, 3))] {
            let loc = idx.location(offset);
            assert_eq!((loc.line, loc.column), expected);
        }
    }

    #[test]
    fn find_script_offsets() {
        let html =
            r#"<html><script type="application/ld+json">{"@type":"Product"}</script></html>"#;
        let offsets = find_script_byte_offsets(html);
        assert_eq!(offsets.len(), 1);
        assert!(html[offsets[0]..].starts_with("<script"));
    }

    #[test]
    fn basic_product() {
        let out = extract_json(
            r#"{
    "@context": "https://schema.org",
    "@type": "Product",
    "name": "Example Product",
    "url": "https://example.com/product"
}"#,
        );

        assert_eq!(out.nodes.len(), 1);
        let product = &out.nodes[0];
        assert_eq!(product.types, vec!["Product"]);
        assert_eq!(product.source_format, SourceFormat::JsonLd);
        assert_eq!(
            product.properties["name"],
            vec![SchemaValue::Text("Example Product".into())]
        );
        assert_eq!(
            product.properties["url"],
            vec![SchemaValue::Url("https://example.com/product".into())]
        );
    }

    #[test]
    fn graph_extraction() {
        let out = extract_json(
            r#"{
    "@context": "https://schema.org",
    "@graph": [
        {"@type": "Organization", "name": "Acme"},
        {"@type": "WebSite", "name": "Acme Site"}
    ]
}"#,
        );

        assert_eq!(out.nodes.len(), 2);
        assert_eq!(out.nodes[0].types, vec!["Organization"]);
        assert_eq!(out.nodes[1].types, vec!["WebSite"]);
    }

    #[test]
    fn array_type() {
        let out = extract_json(
            r#"{
    "@context": "https://schema.org",
    "@type": ["Product", "IndividualProduct"],
    "name": "Widget"
}"#,
        );

        assert_eq!(out.nodes[0].types, vec!["Product", "IndividualProduct"]);
    }

    #[test]
    fn nested_object() {
        let out = extract_json(
            r#"{
    "@context": "https://schema.org",
    "@type": "Product",
    "name": "Widget",
    "offers": {
        "@type": "Offer",
        "price": 19.99,
        "priceCurrency": "USD"
    }
}"#,
        );

        assert_eq!(out.nodes.len(), 1);
        let offers = &out.nodes[0].properties["offers"];
        assert_eq!(offers.len(), 1);
        let SchemaValue::Node(offer) = &offers[0] else {
            panic!("Expected nested Node");
        };
        assert_eq!(offer.types, vec!["Offer"]);
        assert_eq!(offer.properties["price"], vec![SchemaValue::Number(19.99)]);
        assert_eq!(
            offer.properties["priceCurrency"],
            vec![SchemaValue::Text("USD".into())]
        );
    }

    #[test]
    fn id_cross_reference() {
        let out = extract_json(
            r##"{
    "@context": "https://schema.org",
    "@graph": [
        {"@type": "Product", "name": "Widget", "offers": {"@id": "#offer1"}},
        {"@id": "#offer1", "@type": "Offer", "price": 29.99}
    ]
}"##,
        );

        assert_eq!(out.nodes.len(), 2);
        let offers = &out.nodes[0].properties["offers"];
        let SchemaValue::Node(offer) = &offers[0] else {
            panic!("Expected resolved Node, got {:?}", offers[0]);
        };
        assert_eq!(offer.types, vec!["Offer"]);
        assert_eq!(offer.properties["price"], vec![SchemaValue::Number(29.99)]);
    }

    #[test]
    fn malformed_json_is_warning() {
        let out = extract_json("{ invalid }");
        assert!(out.nodes.is_empty());
        assert_eq!(out.warnings.len(), 1);
        assert_eq!(out.warnings[0].code, WarningCode::MalformedJsonLd);
    }

    #[test]
    fn empty_script_tag() {
        let out = extract_json("");
        assert!(out.nodes.is_empty());
        assert_eq!(out.warnings[0].code, WarningCode::MalformedJsonLd);
    }

    #[test]
    fn multiple_script_tags() {
        let html = concat!(
            r#"<html><head>"#,
            "\n",
            r#"<script type="application/ld+json">"#,
            r#"{"@context":"https://schema.org","@type":"Product","name":"A"}"#,
            r#"</script>"#,
            "\n",
            r#"<script type="application/ld+json">"#,
            r#"{"@context":"https://schema.org","@type":"Article","name":"B"}"#,
            r#"</script>"#,
            "\n",
            r#"</head></html>"#,
        );

        let out = extract_html(html);
        assert_eq!(out.nodes.len(), 2);
        assert_eq!(out.nodes[0].types, vec!["Product"]);
        assert_eq!(out.nodes[1].types, vec!["Article"]);
    }

    #[test]
    fn top_level_array() {
        let out = extract_json(
            r#"[
    {"@context":"https://schema.org","@type":"Product","name":"A"},
    {"@context":"https://schema.org","@type":"Article","name":"B"}
]"#,
        );

        assert_eq!(out.nodes.len(), 2);
        assert_eq!(out.nodes[0].types, vec!["Product"]);
        assert_eq!(out.nodes[1].types, vec!["Article"]);
    }

    #[test]
    fn boolean_and_number_values() {
        let out = extract_json(
            r#"{
    "@context": "https://schema.org",
    "@type": "Product",
    "isFamilyFriendly": true,
    "weight": 1.5
}"#,
        );

        let properties = &out.nodes[0].properties;
        assert_eq!(
            properties["isFamilyFriendly"],
            vec![SchemaValue::Boolean(true)]
        );
        assert_eq!(properties["weight"], vec![SchemaValue::Number(1.5)]);
    }

    #[test]
    fn unresolvable_reference_warns() {
        let out = extract_json(
            r##"{
    "@context": "https://schema.org",
    "@type": "Product",
    "offers": {"@id": "#nonexistent"}
}"##,
        );

        assert!(has_warning(&out, WarningCode::UnresolvableReference));
    }

    #[test]
    fn no_context_with_full_uri_type() {
        let out = extract_json(
            r#"{
    "@type": "https://schema.org/Product",
    "name": "Widget"
}"#,
        );

        assert_eq!(out.nodes.len(), 1);
        assert_eq!(out.nodes[0].types, vec!["Product"]);
    }

    #[test]
    fn array_context() {
        let out = extract_json(
            r#"{
    "@context": ["https://schema.org", {"custom": "https://example.com/"}],
    "@type": "Product",
    "name": "Widget"
}"#,
        );

        assert_eq!(out.nodes[0].types, vec!["Product"]);
    }

    #[test]
    fn array_property_values() {
        let out = extract_json(
            r#"{
    "@context": "https://schema.org",
    "@type": "Product",
    "name": "Widget",
    "image": [
        "https://example.com/img1.jpg",
        "https://example.com/img2.jpg"
    ]
}"#,
        );

        let images = &out.nodes[0].properties["image"];
        assert_eq!(images.len(), 2);
        assert_eq!(
            images[0],
            SchemaValue::Url("https://example.com/img1.jpg".into())
        );
    }

    #[test]
    fn null_values_are_skipped() {
        let out = extract_json(
            r#"{
    "@context": "https://schema.org",
    "@type": "Product",
    "name": "Widget",
    "description": null
}"#,
        );

        assert_eq!(out.nodes.len(), 1);
        assert!(!out.nodes[0].properties.contains_key("description"));
    }

    #[test]
    fn integer_numbers() {
        let out = extract_json(
            r#"{
    "@context": "https://schema.org",
    "@type": "Product",
    "name": "Widget",
    "ratingCount": 42
}"#,
        );

        assert_eq!(
            out.nodes[0].properties["ratingCount"],
            vec![SchemaValue::Number(42.0)]
        );
    }

    #[test]
    fn graph_context_inherited_by_children() {
        let out = extract_json(
            r#"{
    "@context": "https://schema.org",
    "@graph": [
        {"@type": "Product", "name": "A"},
        {"@type": "https://schema.org/Article", "name": "B"}
    ]
}"#,
        );

        assert_eq!(out.nodes.len(), 2);
        assert_eq!(out.nodes[0].types, vec!["Product"]);
        assert_eq!(out.nodes[1].types, vec!["Article"]);
    }

    #[test]
    fn duplicate_id_warns() {
        let out = extract_json(
            r##"{
    "@context": "https://schema.org",
    "@graph": [
        {"@id": "#thing", "@type": "Product", "name": "First"},
        {"@id": "#thing", "@type": "Article", "name": "Second"}
    ]
}"##,
        );

        assert!(has_warning(&out, WarningCode::DuplicateId));
    }

    #[test]
    fn deeply_nested_objects() {
        let out = extract_json(
            r#"{
    "@context": "https://schema.org",
    "@type": "Product",
    "name": "Widget",
    "offers": {
        "@type": "Offer",
        "seller": {
            "@type": "Organization",
            "address": {
                "@type": "PostalAddress",
                "addressCountry": "US"
            }
        }
    }
}"#,
        );

        assert_eq!(out.nodes.len(), 1);
        let offers = &out.nodes[0].properties["offers"];
        let SchemaValue::Node(offer) = &offers[0] else {
            panic!("Expected Offer node");
        };
        let seller = &offer.properties["seller"];
        let SchemaValue::Node(org) = &seller[0] else {
            panic!("Expected Organization node");
        };
        let address = &org.properties["address"];
        let SchemaValue::Node(addr) = &address[0] else {
            panic!("Expected PostalAddress node");
        };
        assert_eq!(addr.types, vec!["PostalAddress"]);
        assert_eq!(
            addr.properties["addressCountry"],
            vec![SchemaValue::Text("US".into())]
        );
    }

    #[test]
    fn whitespace_only_script() {
        let out = extract_json("\n\n    ");
        assert!(out.nodes.is_empty());
        assert_eq!(out.warnings.len(), 1);
        assert_eq!(out.warnings[0].code, WarningCode::MalformedJsonLd);
    }

    #[test]
    fn source_location_is_set() {
        let html = concat!(
            "<html><head>\n",
            "<script type=\"application/ld+json\">\n",
            "{\"@type\":\"Product\",\"name\":\"A\"}\n",
            "</script>\n",
            "</head></html>",
        );

        let out = extract_html(html);
        assert_eq!(out.nodes.len(), 1);
        let loc = out.nodes[0]
            .source_location
            .as_ref()
            .expect("missing source location");
        assert_eq!(loc.line, 2);
    }

    #[test]
    fn multiple_types_with_uri_prefix() {
        let out = extract_json(
            r#"{
    "@context": "https://schema.org",
    "@type": ["https://schema.org/Product", "http://schema.org/IndividualProduct"],
    "name": "Widget"
}"#,
        );

        assert_eq!(out.nodes[0].types, vec!["Product", "IndividualProduct"]);
    }

    #[test]
    fn schema_node_id_accessor() {
        let out = extract_json(
            r##"{
    "@context": "https://schema.org",
    "@id": "#product1",
    "@type": "Product",
    "name": "Widget"
}"##,
        );

        assert_eq!(out.nodes[0].id(), Some("#product1"));
    }

    #[test]
    fn no_structured_data() {
        let html = r#"<html><head><title>No structured data</title></head>
<body><p>Hello world</p></body></html>"#;

        let out = extract_html(html);
        assert!(out.nodes.is_empty());
        assert!(out.warnings.is_empty());
    }

    #[test]
    fn json_ld_with_trailing_comma() {
        let out = extract_json(
            r#"{
    "@type": "Product",
    "name": "Widget",
}"#,
        );

        assert!(out.nodes.is_empty());
        assert_eq!(out.warnings[0].code, WarningCode::MalformedJsonLd);
    }

    #[test]
    fn circular_id_references_do_not_loop() {
        let html = page(
            r##"{
    "@context": "https://schema.org",
    "@graph": [
        {"@id": "#a", "@type": "Product", "name": "A", "isRelatedTo": {"@id": "#b"}},
        {"@id": "#b", "@type": "Article", "name": "B", "isRelatedTo": {"@id": "#a"}}
    ]
}"##,
        );

        let out = JsonLdExtractor.extract(&html).expect("must not hang");
        assert_eq!(out.nodes.len(), 2);
    }

    #[test]
    fn self_referencing_id_does_not_loop() {
        let html = page(
            r##"{
    "@context": "https://schema.org",
    "@graph": [
        {"@id": "#self", "@type": "Product", "name": "Me", "isRelatedTo": {"@id": "#self"}}
    ]
}"##,
        );

        let out = JsonLdExtractor.extract(&html).expect("must not hang");
        assert_eq!(out.nodes.len(), 1);
    }

    #[test]
    fn empty_id_string() {
        let out = extract_json(
            r##"{
    "@context": "https://schema.org",
    "@id": "",
    "@type": "Product",
    "name": "Widget"
}"##,
        );

        assert_eq!(out.nodes.len(), 1);
        assert_eq!(out.nodes[0].id(), Some(""));
    }

    #[test]
    fn nesting_at_exactly_max_depth_succeeds() {
        let out = extract_json(&nested_things(MAX_DEPTH));
        assert_eq!(out.nodes.len(), 1);
        assert!(
            !out.warnings.iter().any(|w| w.message.contains("depth")),
            "should not warn at MAX_DEPTH"
        );
    }

    #[test]
    fn nesting_beyond_max_depth_warns() {
        let out = extract_json(&nested_things(MAX_DEPTH + 2));
        assert!(
            out.warnings.iter().any(|w| w.message.contains("depth")),
            "should warn when exceeding MAX_DEPTH"
        );
    }

    #[test]
    fn type_is_number_ignored() {
        let out = extract_json(
            r#"{
    "@context": "https://schema.org",
    "@type": 42,
    "name": "Widget"
}"#,
        );

        assert_eq!(out.nodes.len(), 1);
        assert!(out.nodes[0].types.is_empty());
        assert!(has_warning(&out, WarningCode::EmptyType));
    }

    #[test]
    fn type_is_object_ignored() {
        let out = extract_json(
            r#"{
    "@context": "https://schema.org",
    "@type": {"invalid": true},
    "name": "Widget"
}"#,
        );

        assert_eq!(out.nodes.len(), 1);
        assert!(out.nodes[0].types.is_empty());
    }

    #[test]
    fn type_empty_array() {
        let out = extract_json(
            r#"{
    "@context": "https://schema.org",
    "@type": [],
    "name": "Widget"
}"#,
        );

        assert_eq!(out.nodes.len(), 1);
        assert!(out.nodes[0].types.is_empty());
        assert!(has_warning(&out, WarningCode::EmptyType));
    }

    #[test]
    fn type_array_with_mixed_values() {
        let out = extract_json(
            r#"{
    "@context": "https://schema.org",
    "@type": [42, "Product", null, "IndividualProduct"],
    "name": "Widget"
}"#,
        );

        assert_eq!(out.nodes[0].types, vec!["Product", "IndividualProduct"]);
    }

    #[test]
    fn non_schema_org_context_still_extracts() {
        let out = extract_json(
            r#"{
    "@context": "https://w3.org/ns/activitystreams",
    "@type": "Note",
    "content": "Hello"
}"#,
        );

        assert_eq!(out.nodes.len(), 1);
        assert_eq!(out.nodes[0].types, vec!["Note"]);
        assert_eq!(
            out.nodes[0].properties["content"],
            vec![SchemaValue::Text("Hello".into())]
        );
    }

    #[test]
    fn html_entities_in_script_content() {
        let out = extract_json(
            r#"{
    "@context": "https://schema.org",
    "@type": "Product",
    "name": "Widget & Gadget"
}"#,
        );

        assert_eq!(out.nodes.len(), 1);
    }

    #[test]
    fn multiple_references_to_same_id() {
        let out = extract_json(
            r##"{
    "@context": "https://schema.org",
    "@graph": [
        {
            "@type": "Product", "name": "Widget",
            "offers": {"@id": "#offer"},
            "makesOffer": {"@id": "#offer"},
            "hasOfferCatalog": {"@id": "#offer"}
        },
        {"@id": "#offer", "@type": "Offer", "price": 9.99}
    ]
}"##,
        );

        assert_eq!(out.nodes.len(), 2);
        for prop in ["offers", "makesOffer", "hasOfferCatalog"] {
            let values = &out.nodes[0].properties[prop];
            let SchemaValue::Node(node) = &values[0] else {
                panic!("Expected resolved Node for {prop}");
            };
            assert_eq!(node.types, vec!["Offer"]);
        }
    }

    #[test]
    fn duplicate_id_first_definition_wins() {
        let out = extract_json(
            r##"{
    "@context": "https://schema.org",
    "@graph": [
        {"@type": "Product", "name": "P", "offers": {"@id": "#dup"}},
        {"@id": "#dup", "@type": "Offer", "price": 10.00, "priceCurrency": "USD"},
        {"@id": "#dup", "@type": "Offer", "price": 99.99, "priceCurrency": "EUR"}
    ]
}"##,
        );

        assert!(has_warning(&out, WarningCode::DuplicateId));
        let offers = &out.nodes[0].properties["offers"];
        let SchemaValue::Node(offer) = &offers[0] else {
            panic!("Expected resolved Offer node");
        };
        assert_eq!(
            offer.properties["price"],
            vec![SchemaValue::Number(10.0)],
            "first @id definition should win"
        );
        assert_eq!(
            offer.properties["priceCurrency"],
            vec![SchemaValue::Text("USD".into())],
            "first @id definition should win"
        );
    }

    #[test]
    fn json_root_is_string_warns() {
        let out = extract_json(r#""just a string""#);
        assert!(out.nodes.is_empty());
        assert_eq!(out.warnings[0].code, WarningCode::MalformedJsonLd);
    }

    #[test]
    fn json_root_is_number_warns() {
        let out = extract_json("42");
        assert!(out.nodes.is_empty());
        assert_eq!(out.warnings[0].code, WarningCode::MalformedJsonLd);
    }

    #[test]
    fn external_uri_id_no_warning() {
        let out = extract_json(
            r##"{
    "@context": "https://schema.org",
    "@type": "Product",
    "name": "Widget",
    "manufacturer": {"@id": "https://example.com/org/1"}
}"##,
        );

        assert!(
            !has_warning(&out, WarningCode::UnresolvableReference),
            "external @id URIs should not trigger warnings"
        );
    }
}