1use dom_query::{Document, Matcher, Selection};
4use serde::{Deserialize, Serialize};
5use serde_json::{Map, Value};
6
/// Maximum nesting depth of `nested_list` fields accepted by schema validation.
///
/// Top-level fields are validated at depth 0 and each `nested_list` level adds
/// one; a field found deeper than this limit is rejected with
/// [`SchemaError::TooDeep`].
const MAX_NESTING_DEPTH: usize = 64;
9
/// Errors produced while loading or validating an [`ExtractSchema`].
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum SchemaError {
    /// A selector in the schema could not be parsed as CSS.
    #[error("invalid CSS selector '{selector}' in field '{field}'")]
    InvalidSelector {
        /// Dotted path of the offending field (e.g. `products.price.amount`),
        /// or `<base>` when the schema's base selector is at fault.
        field: String,
        /// The selector text that was rejected.
        selector: String,
    },
    /// The schema JSON could not be deserialized.
    #[error("failed to parse schema: {0}")]
    Parse(#[from] serde_json::Error),
    /// The schema file could not be read.
    #[error("failed to read schema file: {0}")]
    Io(#[from] std::io::Error),
    /// A field sits deeper in `nested_list` levels than validation allows.
    #[error("schema nesting too deep at field '{field}' ({depth} levels, max {max})")]
    TooDeep {
        /// Dotted path of the first field found beyond the limit.
        field: String,
        /// Depth at which that field was found (top-level fields are depth 0).
        depth: usize,
        /// The limit that was exceeded.
        max: usize,
    },
}
39
/// Declarative description of what to pull out of an HTML document.
///
/// A schema is an optional base selector plus a list of fields. Use
/// [`ExtractSchema::from_json`], [`ExtractSchema::from_path`] or
/// [`ExtractSchema::builder`] to obtain a validated instance.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct ExtractSchema {
    /// Optional CSS selector for the repeated container elements. Accepted in
    /// JSON as either `base_selector` or `baseSelector`; defaults to `None`
    /// when the key is missing.
    #[serde(default, alias = "baseSelector")]
    pub(crate) base_selector: Option<String>,
    /// Fields extracted from each container (or from the document when there
    /// is no base selector).
    pub(crate) fields: Vec<ExtractField>,
}
50
51impl ExtractSchema {
52 pub fn from_json(json: &str) -> Result<Self, SchemaError> {
54 let schema: Self = serde_json::from_str(json)?;
55 schema.validate()?;
56 Ok(schema)
57 }
58
59 pub fn from_path(path: impl AsRef<std::path::Path>) -> Result<Self, SchemaError> {
61 let content = std::fs::read_to_string(path)?;
62 Self::from_json(&content)
63 }
64
65 #[must_use]
67 pub fn builder() -> SchemaBuilder {
68 SchemaBuilder::default()
69 }
70
71 pub fn validate(&self) -> Result<(), SchemaError> {
73 if let Some(sel) = &self.base_selector {
74 check_selector("<base>", sel)?;
75 }
76 for f in &self.fields {
77 f.validate("", 0)?;
78 }
79 Ok(())
80 }
81
82 #[must_use]
84 pub fn base_selector(&self) -> Option<&str> {
85 self.base_selector.as_deref()
86 }
87
88 #[must_use]
90 pub fn fields(&self) -> &[ExtractField] {
91 &self.fields
92 }
93}
94
/// One named value to extract, located by a CSS selector.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct ExtractField {
    /// Key under which the extracted value appears in the output object.
    pub(crate) name: String,
    /// CSS selector evaluated relative to the current container. An empty
    /// string selects the container itself.
    pub(crate) selector: String,
    /// What to read from the matched element(s). Flattened, so the `type` tag
    /// and any variant data sit next to `name` and `selector` in JSON.
    #[serde(flatten)]
    pub(crate) kind: FieldKind,
}
107
108impl ExtractField {
109 pub fn new(name: impl Into<String>, selector: impl Into<String>, kind: FieldKind) -> Self {
111 Self {
112 name: name.into(),
113 selector: selector.into(),
114 kind,
115 }
116 }
117
118 #[must_use]
120 pub fn name(&self) -> &str {
121 &self.name
122 }
123
124 #[must_use]
126 pub fn selector(&self) -> &str {
127 &self.selector
128 }
129
130 #[must_use]
132 pub fn kind(&self) -> &FieldKind {
133 &self.kind
134 }
135
136 fn validate(&self, parent: &str, depth: usize) -> Result<(), SchemaError> {
137 let path = if parent.is_empty() {
138 self.name.clone()
139 } else {
140 format!("{parent}.{}", self.name)
141 };
142 if depth > MAX_NESTING_DEPTH {
143 return Err(SchemaError::TooDeep {
144 field: path,
145 depth,
146 max: MAX_NESTING_DEPTH,
147 });
148 }
149 check_selector(&path, &self.selector)?;
150 if let FieldKind::NestedList { fields } = &self.kind {
151 for f in fields {
152 f.validate(&path, depth + 1)?;
153 }
154 }
155 Ok(())
156 }
157}
158
/// Incremental, by-value builder for an [`ExtractSchema`].
///
/// Nothing is validated until [`SchemaBuilder::build`] is called.
#[derive(Default, Debug, Clone)]
pub struct SchemaBuilder {
    /// Base selector to install on the schema, if any.
    base_selector: Option<String>,
    /// Fields accumulated so far, in insertion order.
    fields: Vec<ExtractField>,
}
165
166impl SchemaBuilder {
167 #[must_use]
169 pub fn base_selector(mut self, selector: impl Into<String>) -> Self {
170 self.base_selector = Some(selector.into());
171 self
172 }
173
174 #[must_use]
176 pub fn field(mut self, name: impl Into<String>, selector: impl Into<String>, kind: FieldKind) -> Self {
177 self.fields.push(ExtractField::new(name, selector, kind));
178 self
179 }
180
181 pub fn build(self) -> Result<ExtractSchema, SchemaError> {
183 let schema = ExtractSchema {
184 base_selector: self.base_selector,
185 fields: self.fields,
186 };
187 schema.validate()?;
188 Ok(schema)
189 }
190}
191
/// What to read from the element(s) a field's selector matches.
///
/// Serialized with an internal `type` tag in `snake_case` (`text`,
/// `attribute`, `html`, `inner_html`, `nested_list`); a few alternative
/// spellings are accepted on input via aliases.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
#[non_exhaustive]
pub enum FieldKind {
    /// Text of the first matched element.
    Text,
    /// Value of one attribute of the first matched element. Also accepted as
    /// `"type": "attr"`.
    #[serde(alias = "attr")]
    Attribute {
        /// Name of the attribute to read, e.g. `href`.
        attribute: String,
    },
    /// HTML of the first matched element, as returned by the selection's
    /// `html()`.
    Html,
    /// Inner HTML of the first matched element, as returned by the selection's
    /// `inner_html()`. Also accepted as `"type": "innerHtml"`.
    #[serde(alias = "innerHtml")]
    InnerHtml,
    /// One object per matched element, each built from the nested `fields`.
    /// Also accepted as `"type": "nestedList"`.
    #[serde(alias = "nestedList")]
    NestedList {
        /// Fields evaluated relative to each matched element.
        fields: Vec<ExtractField>,
    },
}
217
218fn check_selector(field: &str, selector: &str) -> Result<(), SchemaError> {
219 if selector.is_empty() {
222 return Ok(());
223 }
224 Matcher::new(selector)
225 .map(|_| ())
226 .map_err(|_| SchemaError::InvalidSelector {
227 field: field.to_string(),
228 selector: selector.to_string(),
229 })
230}
231
232impl ExtractSchema {
233 #[must_use]
235 pub fn extract_from(&self, html: &str) -> Value {
236 let doc = Document::from(html);
237 let root = doc.select("html");
238
239 match &self.base_selector {
240 None => Value::Object(extract_fields(&root, &self.fields)),
241 Some(sel) => {
242 let items: Vec<Value> = doc
243 .select(sel)
244 .iter()
245 .map(|container| Value::Object(extract_fields(&container, &self.fields)))
246 .collect();
247 Value::Array(items)
248 }
249 }
250 }
251}
252
253fn extract_fields(container: &Selection<'_>, fields: &[ExtractField]) -> Map<String, Value> {
254 fields
255 .iter()
256 .map(|f| (f.name.clone(), extract_field(container, f)))
257 .collect()
258}
259
260fn extract_field(container: &Selection<'_>, field: &ExtractField) -> Value {
261 let picked = if field.selector.is_empty() {
263 container.clone()
264 } else {
265 container.select(&field.selector)
266 };
267 if !picked.exists() {
268 return Value::Null;
269 }
270 match &field.kind {
272 FieldKind::Text => Value::String(picked.first().text().to_string()),
273 FieldKind::Attribute { attribute } => picked
274 .first()
275 .attr(attribute)
276 .map_or(Value::Null, |s| Value::String(s.to_string())),
277 FieldKind::Html => Value::String(picked.first().html().to_string()),
278 FieldKind::InnerHtml => Value::String(picked.first().inner_html().to_string()),
279 FieldKind::NestedList { fields } => Value::Array(
280 picked
281 .iter()
282 .map(|sub| Value::Object(extract_fields(&sub, fields)))
283 .collect(),
284 ),
285 }
286}
287
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Two-product listing shared by most tests.
    const PRODUCTS: &str = r#"
        <html><body>
            <div class="product">
                <h2>Keyboard</h2>
                <span class="price">$99</span>
                <a href="/kbd">details</a>
                <img src="/kbd.png" alt="Keyboard">
            </div>
            <div class="product">
                <h2>Mouse</h2>
                <span class="price">$49</span>
                <a href="/mouse">details</a>
                <img src="/mouse.png" alt="Mouse">
            </div>
        </body></html>
    "#;

    /// Parses `json` as a schema, panicking if it is not valid.
    fn schema_from(json: &Value) -> ExtractSchema {
        ExtractSchema::from_json(&json.to_string()).expect("valid schema")
    }

    #[test]
    fn extracts_text_fields_over_base_selector() {
        let schema = schema_from(&json!({
            "base_selector": ".product",
            "fields": [
                { "name": "title", "selector": "h2", "type": "text" },
                { "name": "price", "selector": ".price", "type": "text" },
            ]
        }));
        assert_eq!(
            schema.extract_from(PRODUCTS),
            json!([
                { "title": "Keyboard", "price": "$99" },
                { "title": "Mouse", "price": "$49" }
            ])
        );
    }

    #[test]
    fn extracts_attribute() {
        let schema = schema_from(&json!({
            "base_selector": ".product",
            "fields": [
                { "name": "url", "selector": "a", "type": "attribute", "attribute": "href" },
                { "name": "image", "selector": "img", "type": "attribute", "attribute": "src" },
            ]
        }));
        assert_eq!(
            schema.extract_from(PRODUCTS),
            json!([
                { "url": "/kbd", "image": "/kbd.png" },
                { "url": "/mouse", "image": "/mouse.png" }
            ])
        );
    }

    #[test]
    fn extracts_html_and_inner_html() {
        let html = r#"<html><body><div class="card"><p><b>hi</b></p></div></body></html>"#;
        let schema = schema_from(&json!({
            "base_selector": ".card",
            "fields": [
                { "name": "outer", "selector": "p", "type": "html" },
                { "name": "inner", "selector": "p", "type": "inner_html" },
            ]
        }));
        assert_eq!(
            schema.extract_from(html),
            json!([{ "outer": "<p><b>hi</b></p>", "inner": "<b>hi</b>" }])
        );
    }

    #[test]
    fn nested_list_extracts_sub_objects() {
        let html = r#"
            <html><body>
                <div class="post">
                    <h3>First</h3>
                    <ul><li>a</li><li>b</li></ul>
                </div>
                <div class="post">
                    <h3>Second</h3>
                    <ul><li>c</li></ul>
                </div>
            </body></html>
        "#;
        let schema = schema_from(&json!({
            "base_selector": ".post",
            "fields": [
                { "name": "title", "selector": "h3", "type": "text" },
                { "name": "items", "selector": "li", "type": "nested_list",
                  "fields": [
                      { "name": "label", "selector": "*", "type": "text" }
                  ]
                }
            ]
        }));
        // `*` is matched against the descendants of each `<li>`; these items
        // contain no child elements, so every `label` comes back as null.
        assert_eq!(
            schema.extract_from(html),
            json!([
                { "title": "First", "items": [{ "label": null }, { "label": null }] },
                { "title": "Second", "items": [{ "label": null }] }
            ])
        );
    }

    #[test]
    fn missing_field_yields_null() {
        let schema = schema_from(&json!({
            "base_selector": ".product",
            "fields": [
                { "name": "rating", "selector": ".rating", "type": "text" }
            ]
        }));
        assert_eq!(
            schema.extract_from(PRODUCTS),
            json!([{ "rating": null }, { "rating": null }])
        );
    }

    #[test]
    fn no_base_selector_returns_single_object() {
        let schema = schema_from(&json!({
            "fields": [
                { "name": "first_product", "selector": ".product h2", "type": "text" }
            ]
        }));
        assert_eq!(schema.extract_from(PRODUCTS), json!({ "first_product": "Keyboard" }));
    }

    #[test]
    fn accepts_camelcase_keys() {
        let schema = schema_from(&json!({
            "baseSelector": ".product",
            "fields": [
                { "name": "t", "selector": "h2", "type": "text" },
                { "name": "raw", "selector": "p", "type": "innerHtml" }
            ]
        }));
        assert_eq!(schema.base_selector.as_deref(), Some(".product"));
        let arr_out = schema.extract_from(PRODUCTS);
        let arr = arr_out.as_array().unwrap();
        assert_eq!(arr[0]["t"], "Keyboard");
        // PRODUCTS has no `<p>`, so the camelCase `innerHtml` field is null.
        assert_eq!(arr[0]["raw"], Value::Null);
    }

    #[test]
    fn rejects_malformed_selector_eagerly() {
        let json = json!({
            "base_selector": ".product",
            "fields": [
                { "name": "bad", "selector": "###not[[[valid", "type": "text" }
            ]
        });
        let err = ExtractSchema::from_json(&json.to_string()).unwrap_err();
        assert!(
            matches!(err, SchemaError::InvalidSelector { field, .. } if field == "bad"),
            "expected InvalidSelector error for field 'bad'"
        );
    }

    #[test]
    fn nested_invalid_selector_reports_dotted_path() {
        let json = json!({
            "fields": [{
                "name": "products",
                "selector": ".product",
                "type": "nested_list",
                "fields": [{
                    "name": "price",
                    "selector": ".price",
                    "type": "nested_list",
                    "fields": [{ "name": "amount", "selector": "###bad", "type": "text" }]
                }]
            }]
        });
        let err = ExtractSchema::from_json(&json.to_string()).unwrap_err();
        assert!(
            matches!(&err, SchemaError::InvalidSelector { field, .. } if field == "products.price.amount"),
            "expected dotted path, got: {err:?}"
        );
    }

    #[test]
    fn rejects_malformed_json() {
        let err = ExtractSchema::from_json("{ not json").unwrap_err();
        assert!(matches!(err, SchemaError::Parse(_)), "expected Parse error");
    }

    #[test]
    fn from_path_surfaces_io_error() {
        let err = ExtractSchema::from_path("/definitely/not/a/real/path.json").unwrap_err();
        assert!(matches!(err, SchemaError::Io(_)), "expected Io error, got {err:?}");
    }

    #[test]
    fn mixed_present_and_missing_fields() {
        let schema = schema_from(&json!({
            "base_selector": ".product",
            "fields": [
                { "name": "title", "selector": "h2", "type": "text" },
                { "name": "rating", "selector": ".rating", "type": "text" }
            ]
        }));
        assert_eq!(
            schema.extract_from(PRODUCTS),
            json!([
                { "title": "Keyboard", "rating": null },
                { "title": "Mouse", "rating": null }
            ])
        );
    }

    #[test]
    fn empty_selector_reads_matched_element_text() {
        let html = r"<html><body><ul><li>alpha</li><li>beta</li></ul></body></html>";
        let schema = schema_from(&json!({
            "base_selector": "li",
            "fields": [
                { "name": "value", "selector": "", "type": "text" }
            ]
        }));
        assert_eq!(
            schema.extract_from(html),
            json!([{ "value": "alpha" }, { "value": "beta" }])
        );
    }

    #[test]
    fn empty_selector_inside_nested_list_reads_each_item() {
        let html = r#"
            <html><body>
                <div class="post">
                    <h3>First</h3>
                    <ul><li>a</li><li>b</li></ul>
                </div>
            </body></html>
        "#;
        let schema = schema_from(&json!({
            "base_selector": ".post",
            "fields": [
                { "name": "title", "selector": "h3", "type": "text" },
                { "name": "items", "selector": "li", "type": "nested_list",
                  "fields": [{ "name": "text", "selector": "", "type": "text" }] }
            ]
        }));
        assert_eq!(
            schema.extract_from(html),
            json!([{
                "title": "First",
                "items": [{ "text": "a" }, { "text": "b" }]
            }])
        );
    }

    #[test]
    fn empty_selector_reads_matched_element_attribute() {
        let html = r#"<html><body><a href="/home" title="Home">Go</a></body></html>"#;
        let schema = schema_from(&json!({
            "base_selector": "a",
            "fields": [
                { "name": "href", "selector": "", "type": "attribute", "attribute": "href" },
                { "name": "title", "selector": "", "type": "attribute", "attribute": "title" }
            ]
        }));
        assert_eq!(schema.extract_from(html), json!([{ "href": "/home", "title": "Home" }]));
    }

    #[test]
    fn builder_constructs_equivalent_schema() {
        let built = ExtractSchema::builder()
            .base_selector(".product")
            .field("title", "h2", FieldKind::Text)
            .field(
                "url",
                "a",
                FieldKind::Attribute {
                    attribute: "href".into(),
                },
            )
            .build()
            .unwrap();

        let json_schema = schema_from(&json!({
            "base_selector": ".product",
            "fields": [
                { "name": "title", "selector": "h2", "type": "text" },
                { "name": "url", "selector": "a", "type": "attribute", "attribute": "href" }
            ]
        }));

        assert_eq!(built.extract_from(PRODUCTS), json_schema.extract_from(PRODUCTS));
    }

    #[test]
    fn builder_supports_nested_list() {
        let schema = ExtractSchema::builder()
            .base_selector(".post")
            .field("title", "h3", FieldKind::Text)
            .field(
                "items",
                "li",
                FieldKind::NestedList {
                    fields: vec![ExtractField::new("text", "", FieldKind::Text)],
                },
            )
            .build()
            .unwrap();
        let html = r"<html><body><div class='post'><h3>A</h3><ul><li>one</li></ul></div></body></html>";
        assert_eq!(
            schema.extract_from(html),
            json!([{ "title": "A", "items": [{ "text": "one" }] }])
        );
    }

    #[test]
    fn builder_surfaces_selector_errors() {
        let err = ExtractSchema::builder()
            .field("bad", "###invalid[[[", FieldKind::Text)
            .build()
            .unwrap_err();
        assert!(
            matches!(&err, SchemaError::InvalidSelector { field, .. } if field == "bad"),
            "expected InvalidSelector, got {err:?}"
        );
    }

    #[test]
    fn ignores_unknown_top_level_fields() {
        let schema = schema_from(&json!({
            "name": "legacy-label",
            "base_selector": ".product",
            "fields": [
                { "name": "title", "selector": "h2", "type": "text" }
            ]
        }));
        assert_eq!(schema.base_selector.as_deref(), Some(".product"));
    }

    #[test]
    fn rejects_unknown_field_type_list() {
        let json = json!({
            "fields": [
                { "name": "items", "selector": "li", "type": "list", "fields": [] }
            ]
        });
        let err = ExtractSchema::from_json(&json.to_string()).unwrap_err();
        assert!(
            matches!(err, SchemaError::Parse(_)),
            "expected Parse error for unsupported 'list' type"
        );
    }

    #[test]
    fn works_on_html_fragment_without_wrappers() {
        let schema = schema_from(&json!({
            "fields": [
                { "name": "heading", "selector": "h1", "type": "text" }
            ]
        }));
        assert_eq!(schema.extract_from("<h1>Hello</h1>"), json!({ "heading": "Hello" }));
    }

    #[test]
    fn empty_fields_yields_empty_object() {
        let schema = schema_from(&json!({ "fields": [] }));
        assert_eq!(schema.extract_from(PRODUCTS), json!({}));
    }

    #[test]
    fn empty_fields_with_base_selector_yields_empty_objects() {
        let schema = schema_from(&json!({
            "base_selector": ".product",
            "fields": []
        }));
        assert_eq!(schema.extract_from(PRODUCTS), json!([{}, {}]));
    }

    #[test]
    fn base_selector_matches_nothing_yields_empty_array() {
        let schema = schema_from(&json!({
            "base_selector": ".does-not-exist",
            "fields": [
                { "name": "title", "selector": "h2", "type": "text" }
            ]
        }));
        assert_eq!(schema.extract_from(PRODUCTS), json!([]));
    }

    #[test]
    fn nested_list_with_zero_matches_yields_null() {
        let html = r#"<html><body><div class="post"><h3>Only</h3></div></body></html>"#;
        let schema = schema_from(&json!({
            "base_selector": ".post",
            "fields": [
                { "name": "title", "selector": "h3", "type": "text" },
                { "name": "items", "selector": ".missing", "type": "nested_list",
                  "fields": [{ "name": "label", "selector": "*", "type": "text" }] }
            ]
        }));
        assert_eq!(schema.extract_from(html), json!([{ "title": "Only", "items": null }]));
    }

    #[test]
    fn attribute_missing_but_element_present_yields_null() {
        let html = r"<html><body><a>no href</a></body></html>";
        let schema = schema_from(&json!({
            "fields": [
                { "name": "href", "selector": "a", "type": "attribute", "attribute": "href" }
            ]
        }));
        assert_eq!(schema.extract_from(html), json!({ "href": null }));
    }

    #[test]
    fn unicode_text_roundtrips() {
        let html = r"<html><body><h1>日本語 🦀</h1></body></html>";
        let schema = schema_from(&json!({
            "fields": [{ "name": "t", "selector": "h1", "type": "text" }]
        }));
        assert_eq!(schema.extract_from(html), json!({ "t": "日本語 🦀" }));
    }

    #[test]
    fn html_entities_are_decoded_in_text() {
        // The input must contain real character references (`&amp;`, `&lt;`);
        // literal `&` and `<` would pass without exercising entity decoding.
        let html = r"<html><body><p>A &amp; B &lt; C</p></body></html>";
        let schema = schema_from(&json!({
            "fields": [{ "name": "t", "selector": "p", "type": "text" }]
        }));
        assert_eq!(schema.extract_from(html), json!({ "t": "A & B < C" }));
    }

    #[test]
    fn deeply_nested_three_levels() {
        let html = r#"
            <html><body>
                <div class="cat">
                    <h2>Electronics</h2>
                    <div class="prod">
                        <h3>Laptop</h3>
                        <ul class="specs"><li>16GB</li><li>1TB</li></ul>
                    </div>
                </div>
            </body></html>
        "#;
        let schema = schema_from(&json!({
            "base_selector": ".cat",
            "fields": [
                { "name": "name", "selector": "h2", "type": "text" },
                { "name": "products", "selector": ".prod", "type": "nested_list",
                  "fields": [
                      { "name": "title", "selector": "h3", "type": "text" },
                      { "name": "specs", "selector": ".specs li", "type": "nested_list",
                        "fields": [{ "name": "v", "selector": "*", "type": "text" }] }
                  ] }
            ]
        }));
        // As above, `*` looks for descendants of each `<li>` and finds none.
        assert_eq!(
            schema.extract_from(html),
            json!([{
                "name": "Electronics",
                "products": [{
                    "title": "Laptop",
                    "specs": [{ "v": null }, { "v": null }]
                }]
            }])
        );
    }

    #[test]
    fn empty_html_yields_nulls() {
        let schema = schema_from(&json!({
            "fields": [{ "name": "t", "selector": "h1", "type": "text" }]
        }));
        assert_eq!(schema.extract_from(""), json!({ "t": null }));
    }

    #[test]
    fn rejects_excessive_nesting_depth() {
        let mut kind = FieldKind::Text;
        // Wrap the leaf in more nested lists than the limit allows.
        for i in (0..MAX_NESTING_DEPTH + 5).rev() {
            kind = FieldKind::NestedList {
                fields: vec![ExtractField::new(format!("l{i}"), "*", kind)],
            };
        }
        let err = ExtractSchema::builder().field("root", "*", kind).build().unwrap_err();
        assert!(matches!(
            err,
            SchemaError::TooDeep { depth, max, .. } if depth > max && max == MAX_NESTING_DEPTH
        ));
    }

    #[test]
    fn accepts_nesting_at_depth_limit() {
        let mut kind = FieldKind::Text;
        // `root` sits at depth 0, so the innermost field lands exactly on the limit.
        for i in (0..MAX_NESTING_DEPTH).rev() {
            kind = FieldKind::NestedList {
                fields: vec![ExtractField::new(format!("l{i}"), "*", kind)],
            };
        }
        let result = ExtractSchema::builder().field("root", "*", kind).build();
        assert!(result.is_ok());
    }

    #[test]
    fn accessors_expose_schema_contents() {
        let schema = ExtractSchema::builder()
            .base_selector(".product")
            .field("title", "h2", FieldKind::Text)
            .field(
                "url",
                "a",
                FieldKind::Attribute {
                    attribute: "href".into(),
                },
            )
            .build()
            .unwrap();

        assert_eq!(schema.base_selector(), Some(".product"));
        assert_eq!(schema.fields().len(), 2);
        assert_eq!(schema.fields()[0].name(), "title");
        assert_eq!(schema.fields()[0].selector(), "h2");
        assert!(matches!(schema.fields()[0].kind(), FieldKind::Text));
        assert_eq!(schema.fields()[1].name(), "url");
        assert!(matches!(
            schema.fields()[1].kind(),
            FieldKind::Attribute { attribute } if attribute == "href"
        ));
    }
}