1use dom_query::{Document, Matcher, Selection};
4use serde::{Deserialize, Serialize};
5use serde_json::{Map, Value};
6
/// Deepest `nested_list` nesting level that schema validation accepts.
///
/// Top-level fields are validated at depth 0 and each nested list adds one;
/// a field found at a depth greater than this value is rejected with
/// [`SchemaError::TooDeep`].
pub const MAX_NESTING_DEPTH: usize = 64;
9
/// Errors raised while loading or validating an extraction schema.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum SchemaError {
    /// A selector in the schema could not be parsed as CSS.
    #[error("invalid CSS selector '{selector}' in field '{field}'")]
    InvalidSelector {
        /// Dotted path of the offending field (`parent.child`), or `<base>`
        /// when the schema's base selector is the one that failed.
        field: String,
        /// The selector text that was rejected.
        selector: String,
    },
    /// The schema JSON could not be deserialized.
    #[error("failed to parse schema: {0}")]
    Parse(#[from] serde_json::Error),
    /// The schema file could not be read.
    #[error("failed to read schema file: {0}")]
    Io(#[from] std::io::Error),
    /// A field sits deeper than [`MAX_NESTING_DEPTH`] nested lists.
    #[error("schema nesting too deep at field '{field}' ({depth} levels, max {max})")]
    TooDeep {
        /// Dotted path of the first field found beyond the limit.
        field: String,
        /// Depth at which that field was encountered.
        depth: usize,
        /// The limit that was exceeded (always [`MAX_NESTING_DEPTH`]).
        max: usize,
    },
}
39
/// Declarative description of what to pull out of an HTML document.
///
/// With a base selector, extraction yields a JSON array holding one object per
/// element matched by that selector; without one, it yields a single JSON
/// object whose fields are resolved against the document's `html` element.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct ExtractSchema {
    /// Optional CSS selector for the repeating container element.
    /// May be omitted in JSON, and is also accepted under the key `baseSelector`.
    #[serde(default, alias = "baseSelector")]
    pub(crate) base_selector: Option<String>,
    /// Fields to extract from each container; each becomes one key of the output object.
    pub(crate) fields: Vec<ExtractField>,
}
50
51impl ExtractSchema {
52 pub fn from_json(json: &str) -> Result<Self, SchemaError> {
54 let schema: Self = serde_json::from_str(json)?;
55 schema.validate()?;
56 Ok(schema)
57 }
58
59 pub fn from_path(path: impl AsRef<std::path::Path>) -> Result<Self, SchemaError> {
61 let content = std::fs::read_to_string(path)?;
62 Self::from_json(&content)
63 }
64
65 #[must_use]
67 pub fn builder() -> SchemaBuilder {
68 SchemaBuilder::default()
69 }
70
71 pub fn validate(&self) -> Result<(), SchemaError> {
73 if let Some(sel) = &self.base_selector {
74 check_selector("<base>", sel)?;
75 }
76 for f in &self.fields {
77 f.validate("", 0)?;
78 }
79 Ok(())
80 }
81
82 #[must_use]
84 pub fn base_selector(&self) -> Option<&str> {
85 self.base_selector.as_deref()
86 }
87
88 #[must_use]
90 pub fn fields(&self) -> &[ExtractField] {
91 &self.fields
92 }
93}
94
/// One named value to extract from a container element.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct ExtractField {
    /// Key under which the extracted value appears in the output object.
    pub(crate) name: String,
    /// CSS selector resolved against the container. An empty string is
    /// allowed and means "the container element itself".
    pub(crate) selector: String,
    /// What to read from the selected element(s). Flattened, so in JSON the
    /// `type` tag and any variant data sit next to `name` and `selector`.
    #[serde(flatten)]
    pub(crate) kind: FieldKind,
}
107
108impl ExtractField {
109 pub fn new(name: impl Into<String>, selector: impl Into<String>, kind: FieldKind) -> Self {
111 Self {
112 name: name.into(),
113 selector: selector.into(),
114 kind,
115 }
116 }
117
118 #[must_use]
120 pub fn name(&self) -> &str {
121 &self.name
122 }
123
124 #[must_use]
126 pub fn selector(&self) -> &str {
127 &self.selector
128 }
129
130 #[must_use]
132 pub fn kind(&self) -> &FieldKind {
133 &self.kind
134 }
135
136 fn validate(&self, parent: &str, depth: usize) -> Result<(), SchemaError> {
137 let path = if parent.is_empty() {
138 self.name.clone()
139 } else {
140 format!("{parent}.{}", self.name)
141 };
142 if depth > MAX_NESTING_DEPTH {
143 return Err(SchemaError::TooDeep {
144 field: path,
145 depth,
146 max: MAX_NESTING_DEPTH,
147 });
148 }
149 check_selector(&path, &self.selector)?;
150 if let FieldKind::NestedList { fields } = &self.kind {
151 for f in fields {
152 f.validate(&path, depth + 1)?;
153 }
154 }
155 Ok(())
156 }
157}
158
/// Builder for assembling an [`ExtractSchema`] in code.
///
/// Starts out with no base selector and no fields.
#[derive(Default, Debug, Clone)]
pub struct SchemaBuilder {
    // Base (container) selector for the schema under construction, if set.
    base_selector: Option<String>,
    // Fields collected so far, in the order they were added.
    fields: Vec<ExtractField>,
}
165
166impl SchemaBuilder {
167 #[must_use]
169 pub fn base_selector(mut self, selector: impl Into<String>) -> Self {
170 self.base_selector = Some(selector.into());
171 self
172 }
173
174 #[must_use]
176 pub fn field(mut self, name: impl Into<String>, selector: impl Into<String>, kind: FieldKind) -> Self {
177 self.fields.push(ExtractField::new(name, selector, kind));
178 self
179 }
180
181 pub fn build(self) -> Result<ExtractSchema, SchemaError> {
183 let schema = ExtractSchema {
184 base_selector: self.base_selector,
185 fields: self.fields,
186 };
187 schema.validate()?;
188 Ok(schema)
189 }
190}
191
/// What to read from the element(s) a field's selector matches.
///
/// Serialized with a `type` tag in snake_case (`text`, `attribute`, `html`,
/// `inner_html`, `nested_list`); the aliases declared on individual variants
/// are additionally accepted when deserializing.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
#[non_exhaustive]
pub enum FieldKind {
    /// Text of the first matched element.
    Text,
    /// Value of one attribute of the first matched element; `null` when the
    /// element does not carry that attribute. Also accepted as `attr`.
    #[serde(alias = "attr")]
    Attribute {
        /// Name of the attribute to read, e.g. `href`.
        attribute: String,
    },
    /// Markup of the first matched element, including the element's own tag.
    Html,
    /// Markup of the first matched element's contents, without its own tag.
    /// Also accepted as `innerHtml`.
    #[serde(alias = "innerHtml")]
    InnerHtml,
    /// An array with one object per matched element, each built from
    /// `fields`. Also accepted as `nestedList`.
    #[serde(alias = "nestedList")]
    NestedList {
        /// Fields resolved against each matched element.
        fields: Vec<ExtractField>,
    },
}
217
218fn check_selector(field: &str, selector: &str) -> Result<(), SchemaError> {
219 if selector.is_empty() {
222 return Ok(());
223 }
224 Matcher::new(selector)
225 .map(|_| ())
226 .map_err(|_| SchemaError::InvalidSelector {
227 field: field.to_string(),
228 selector: selector.to_string(),
229 })
230}
231
232impl ExtractSchema {
233 #[must_use]
235 pub fn extract_from(&self, html: &str) -> Value {
236 let doc = Document::from(html);
237 let root = doc.select("html");
238
239 match &self.base_selector {
240 None => Value::Object(extract_fields(&root, &self.fields)),
241 Some(sel) => {
242 let items: Vec<Value> = doc
243 .select(sel)
244 .iter()
245 .map(|container| Value::Object(extract_fields(&container, &self.fields)))
246 .collect();
247 Value::Array(items)
248 }
249 }
250 }
251}
252
253fn extract_fields(container: &Selection<'_>, fields: &[ExtractField]) -> Map<String, Value> {
254 fields
255 .iter()
256 .map(|f| (f.name.clone(), extract_field(container, f)))
257 .collect()
258}
259
260fn extract_field(container: &Selection<'_>, field: &ExtractField) -> Value {
261 let picked = if field.selector.is_empty() {
263 container.clone()
264 } else {
265 container.select(&field.selector)
266 };
267 if !picked.exists() {
268 return Value::Null;
269 }
270 match &field.kind {
272 FieldKind::Text => Value::String(picked.first().text().to_string()),
273 FieldKind::Attribute { attribute } => picked
274 .first()
275 .attr(attribute)
276 .map_or(Value::Null, |s| Value::String(s.to_string())),
277 FieldKind::Html => Value::String(picked.first().html().to_string()),
278 FieldKind::InnerHtml => Value::String(picked.first().inner_html().to_string()),
279 FieldKind::NestedList { fields } => Value::Array(
280 picked
281 .iter()
282 .map(|sub| Value::Object(extract_fields(&sub, fields)))
283 .collect(),
284 ),
285 }
286}
287
// Unit tests for schema parsing, validation and extraction.
#[cfg(test)]
mod tests {
    use serde_json::json;

    use super::*;

    // Two-product listing shared by most extraction tests.
    const PRODUCTS: &str = r#"
        <html><body>
            <div class="product">
                <h2>Keyboard</h2>
                <span class="price">$99</span>
                <a href="/kbd">details</a>
                <img src="/kbd.png" alt="Keyboard">
            </div>
            <div class="product">
                <h2>Mouse</h2>
                <span class="price">$49</span>
                <a href="/mouse">details</a>
                <img src="/mouse.png" alt="Mouse">
            </div>
        </body></html>
    "#;

    // Parses a schema from a `json!` value, panicking if it is rejected.
    fn schema_from(json: &Value) -> ExtractSchema {
        ExtractSchema::from_json(&json.to_string()).expect("valid schema")
    }

    #[test]
    fn extracts_text_fields_over_base_selector() {
        let schema = schema_from(&json!({
            "base_selector": ".product",
            "fields": [
                { "name": "title", "selector": "h2", "type": "text" },
                { "name": "price", "selector": ".price", "type": "text" },
            ]
        }));
        assert_eq!(
            schema.extract_from(PRODUCTS),
            json!([
                { "title": "Keyboard", "price": "$99" },
                { "title": "Mouse", "price": "$49" }
            ])
        );
    }

    #[test]
    fn extracts_attribute() {
        let schema = schema_from(&json!({
            "base_selector": ".product",
            "fields": [
                { "name": "url", "selector": "a", "type": "attribute", "attribute": "href" },
                { "name": "image", "selector": "img", "type": "attribute", "attribute": "src" },
            ]
        }));
        assert_eq!(
            schema.extract_from(PRODUCTS),
            json!([
                { "url": "/kbd", "image": "/kbd.png" },
                { "url": "/mouse", "image": "/mouse.png" }
            ])
        );
    }

    #[test]
    fn extracts_html_and_inner_html() {
        let html = r#"<html><body><div class="card"><p><b>hi</b></p></div></body></html>"#;
        let schema = schema_from(&json!({
            "base_selector": ".card",
            "fields": [
                { "name": "outer", "selector": "p", "type": "html" },
                { "name": "inner", "selector": "p", "type": "inner_html" },
            ]
        }));
        assert_eq!(
            schema.extract_from(html),
            json!([{ "outer": "<p><b>hi</b></p>", "inner": "<b>hi</b>" }])
        );
    }

    // The inner `*` selector is resolved against each `li`, which holds only
    // text and no child elements, hence `null` for every label.
    #[test]
    fn nested_list_extracts_sub_objects() {
        let html = r#"
            <html><body>
                <div class="post">
                    <h3>First</h3>
                    <ul><li>a</li><li>b</li></ul>
                </div>
                <div class="post">
                    <h3>Second</h3>
                    <ul><li>c</li></ul>
                </div>
            </body></html>
        "#;
        let schema = schema_from(&json!({
            "base_selector": ".post",
            "fields": [
                { "name": "title", "selector": "h3", "type": "text" },
                { "name": "items", "selector": "li", "type": "nested_list",
                  "fields": [
                      { "name": "label", "selector": "*", "type": "text" }
                  ]
                }
            ]
        }));
        assert_eq!(
            schema.extract_from(html),
            json!([
                { "title": "First", "items": [{ "label": null }, { "label": null }] },
                { "title": "Second", "items": [{ "label": null }] }
            ])
        );
    }

    #[test]
    fn missing_field_yields_null() {
        let schema = schema_from(&json!({
            "base_selector": ".product",
            "fields": [
                { "name": "rating", "selector": ".rating", "type": "text" }
            ]
        }));
        assert_eq!(
            schema.extract_from(PRODUCTS),
            json!([{ "rating": null }, { "rating": null }])
        );
    }

    #[test]
    fn no_base_selector_returns_single_object() {
        let schema = schema_from(&json!({
            "fields": [
                { "name": "first_product", "selector": ".product h2", "type": "text" }
            ]
        }));
        assert_eq!(
            schema.extract_from(PRODUCTS),
            json!({ "first_product": "Keyboard" })
        );
    }

    // Exercises the `baseSelector` and `innerHtml` aliases.
    #[test]
    fn accepts_camelcase_keys() {
        let schema = schema_from(&json!({
            "baseSelector": ".product",
            "fields": [
                { "name": "t", "selector": "h2", "type": "text" },
                { "name": "raw", "selector": "p", "type": "innerHtml" }
            ]
        }));
        assert_eq!(schema.base_selector.as_deref(), Some(".product"));
        let arr_out = schema.extract_from(PRODUCTS);
        let arr = arr_out.as_array().unwrap();
        assert_eq!(arr[0]["t"], "Keyboard");
        // PRODUCTS contains no `p` element, so the field is null.
        assert_eq!(arr[0]["raw"], Value::Null);
    }

    #[test]
    fn rejects_malformed_selector_eagerly() {
        let json = json!({
            "base_selector": ".product",
            "fields": [
                { "name": "bad", "selector": "###not[[[valid", "type": "text" }
            ]
        });
        let err = ExtractSchema::from_json(&json.to_string()).unwrap_err();
        assert!(
            matches!(err, SchemaError::InvalidSelector { field, .. } if field == "bad"),
            "expected InvalidSelector error for field 'bad'"
        );
    }

    #[test]
    fn nested_invalid_selector_reports_dotted_path() {
        let json = json!({
            "fields": [{
                "name": "products",
                "selector": ".product",
                "type": "nested_list",
                "fields": [{
                    "name": "price",
                    "selector": ".price",
                    "type": "nested_list",
                    "fields": [{ "name": "amount", "selector": "###bad", "type": "text" }]
                }]
            }]
        });
        let err = ExtractSchema::from_json(&json.to_string()).unwrap_err();
        assert!(
            matches!(&err, SchemaError::InvalidSelector { field, .. } if field == "products.price.amount"),
            "expected dotted path, got: {err:?}"
        );
    }

    #[test]
    fn rejects_malformed_json() {
        let err = ExtractSchema::from_json("{ not json").unwrap_err();
        assert!(matches!(err, SchemaError::Parse(_)), "expected Parse error");
    }

    #[test]
    fn from_path_surfaces_io_error() {
        let err = ExtractSchema::from_path("/definitely/not/a/real/path.json").unwrap_err();
        assert!(
            matches!(err, SchemaError::Io(_)),
            "expected Io error, got {err:?}"
        );
    }

    #[test]
    fn mixed_present_and_missing_fields() {
        let schema = schema_from(&json!({
            "base_selector": ".product",
            "fields": [
                { "name": "title", "selector": "h2", "type": "text" },
                { "name": "rating", "selector": ".rating", "type": "text" }
            ]
        }));
        assert_eq!(
            schema.extract_from(PRODUCTS),
            json!([
                { "title": "Keyboard", "rating": null },
                { "title": "Mouse", "rating": null }
            ])
        );
    }

    // An empty field selector targets the container element itself.
    #[test]
    fn empty_selector_reads_matched_element_text() {
        let html = r"<html><body><ul><li>alpha</li><li>beta</li></ul></body></html>";
        let schema = schema_from(&json!({
            "base_selector": "li",
            "fields": [
                { "name": "value", "selector": "", "type": "text" }
            ]
        }));
        assert_eq!(
            schema.extract_from(html),
            json!([{ "value": "alpha" }, { "value": "beta" }])
        );
    }

    #[test]
    fn empty_selector_inside_nested_list_reads_each_item() {
        let html = r#"
            <html><body>
                <div class="post">
                    <h3>First</h3>
                    <ul><li>a</li><li>b</li></ul>
                </div>
            </body></html>
        "#;
        let schema = schema_from(&json!({
            "base_selector": ".post",
            "fields": [
                { "name": "title", "selector": "h3", "type": "text" },
                { "name": "items", "selector": "li", "type": "nested_list",
                  "fields": [{ "name": "text", "selector": "", "type": "text" }] }
            ]
        }));
        assert_eq!(
            schema.extract_from(html),
            json!([{
                "title": "First",
                "items": [{ "text": "a" }, { "text": "b" }]
            }])
        );
    }

    #[test]
    fn empty_selector_reads_matched_element_attribute() {
        let html = r#"<html><body><a href="/home" title="Home">Go</a></body></html>"#;
        let schema = schema_from(&json!({
            "base_selector": "a",
            "fields": [
                { "name": "href", "selector": "", "type": "attribute", "attribute": "href" },
                { "name": "title", "selector": "", "type": "attribute", "attribute": "title" }
            ]
        }));
        assert_eq!(
            schema.extract_from(html),
            json!([{ "href": "/home", "title": "Home" }])
        );
    }

    #[test]
    fn builder_constructs_equivalent_schema() {
        let built = ExtractSchema::builder()
            .base_selector(".product")
            .field("title", "h2", FieldKind::Text)
            .field(
                "url",
                "a",
                FieldKind::Attribute {
                    attribute: "href".into(),
                },
            )
            .build()
            .unwrap();

        let json_schema = schema_from(&json!({
            "base_selector": ".product",
            "fields": [
                { "name": "title", "selector": "h2", "type": "text" },
                { "name": "url", "selector": "a", "type": "attribute", "attribute": "href" }
            ]
        }));

        assert_eq!(
            built.extract_from(PRODUCTS),
            json_schema.extract_from(PRODUCTS)
        );
    }

    #[test]
    fn builder_supports_nested_list() {
        let schema = ExtractSchema::builder()
            .base_selector(".post")
            .field("title", "h3", FieldKind::Text)
            .field(
                "items",
                "li",
                FieldKind::NestedList {
                    fields: vec![ExtractField::new("text", "", FieldKind::Text)],
                },
            )
            .build()
            .unwrap();
        let html = r"<html><body><div class='post'><h3>A</h3><ul><li>one</li></ul></div></body></html>";
        assert_eq!(
            schema.extract_from(html),
            json!([{ "title": "A", "items": [{ "text": "one" }] }])
        );
    }

    #[test]
    fn builder_surfaces_selector_errors() {
        let err = ExtractSchema::builder()
            .field("bad", "###invalid[[[", FieldKind::Text)
            .build()
            .unwrap_err();
        assert!(
            matches!(&err, SchemaError::InvalidSelector { field, .. } if field == "bad"),
            "expected InvalidSelector, got {err:?}"
        );
    }

    #[test]
    fn ignores_unknown_top_level_fields() {
        let schema = schema_from(&json!({
            "name": "legacy-label",
            "base_selector": ".product",
            "fields": [
                { "name": "title", "selector": "h2", "type": "text" }
            ]
        }));
        assert_eq!(schema.base_selector.as_deref(), Some(".product"));
    }

    #[test]
    fn rejects_unknown_field_type_list() {
        let json = json!({
            "fields": [
                { "name": "items", "selector": "li", "type": "list", "fields": [] }
            ]
        });
        let err = ExtractSchema::from_json(&json.to_string()).unwrap_err();
        assert!(
            matches!(err, SchemaError::Parse(_)),
            "expected Parse error for unsupported 'list' type"
        );
    }

    #[test]
    fn works_on_html_fragment_without_wrappers() {
        let schema = schema_from(&json!({
            "fields": [
                { "name": "heading", "selector": "h1", "type": "text" }
            ]
        }));
        assert_eq!(
            schema.extract_from("<h1>Hello</h1>"),
            json!({ "heading": "Hello" })
        );
    }

    #[test]
    fn empty_fields_yields_empty_object() {
        let schema = schema_from(&json!({ "fields": [] }));
        assert_eq!(schema.extract_from(PRODUCTS), json!({}));
    }

    #[test]
    fn empty_fields_with_base_selector_yields_empty_objects() {
        let schema = schema_from(&json!({
            "base_selector": ".product",
            "fields": []
        }));
        assert_eq!(schema.extract_from(PRODUCTS), json!([{}, {}]));
    }

    #[test]
    fn base_selector_matches_nothing_yields_empty_array() {
        let schema = schema_from(&json!({
            "base_selector": ".does-not-exist",
            "fields": [
                { "name": "title", "selector": "h2", "type": "text" }
            ]
        }));
        assert_eq!(schema.extract_from(PRODUCTS), json!([]));
    }

    // A nested list whose selector matches nothing is `null`, not `[]`.
    #[test]
    fn nested_list_with_zero_matches_yields_null() {
        let html = r#"<html><body><div class="post"><h3>Only</h3></div></body></html>"#;
        let schema = schema_from(&json!({
            "base_selector": ".post",
            "fields": [
                { "name": "title", "selector": "h3", "type": "text" },
                { "name": "items", "selector": ".missing", "type": "nested_list",
                  "fields": [{ "name": "label", "selector": "*", "type": "text" }] }
            ]
        }));
        assert_eq!(
            schema.extract_from(html),
            json!([{ "title": "Only", "items": null }])
        );
    }

    #[test]
    fn attribute_missing_but_element_present_yields_null() {
        let html = r"<html><body><a>no href</a></body></html>";
        let schema = schema_from(&json!({
            "fields": [
                { "name": "href", "selector": "a", "type": "attribute", "attribute": "href" }
            ]
        }));
        assert_eq!(schema.extract_from(html), json!({ "href": null }));
    }

    #[test]
    fn unicode_text_roundtrips() {
        let html = r"<html><body><h1>日本語 🦀</h1></body></html>";
        let schema = schema_from(&json!({
            "fields": [{ "name": "t", "selector": "h1", "type": "text" }]
        }));
        assert_eq!(schema.extract_from(html), json!({ "t": "日本語 🦀" }));
    }

    // The input must contain real character references (`&amp;`, `&lt;`);
    // with literal `&` and `<` nothing would need decoding and the test
    // would pass vacuously.
    #[test]
    fn html_entities_are_decoded_in_text() {
        let html = r"<html><body><p>A &amp; B &lt; C</p></body></html>";
        let schema = schema_from(&json!({
            "fields": [{ "name": "t", "selector": "p", "type": "text" }]
        }));
        assert_eq!(schema.extract_from(html), json!({ "t": "A & B < C" }));
    }

    #[test]
    fn deeply_nested_three_levels() {
        let html = r#"
            <html><body>
                <div class="cat">
                    <h2>Electronics</h2>
                    <div class="prod">
                        <h3>Laptop</h3>
                        <ul class="specs"><li>16GB</li><li>1TB</li></ul>
                    </div>
                </div>
            </body></html>
        "#;
        let schema = schema_from(&json!({
            "base_selector": ".cat",
            "fields": [
                { "name": "name", "selector": "h2", "type": "text" },
                { "name": "products", "selector": ".prod", "type": "nested_list",
                  "fields": [
                      { "name": "title", "selector": "h3", "type": "text" },
                      { "name": "specs", "selector": ".specs li", "type": "nested_list",
                        "fields": [{ "name": "v", "selector": "*", "type": "text" }] }
                  ] }
            ]
        }));
        assert_eq!(
            schema.extract_from(html),
            json!([{
                "name": "Electronics",
                "products": [{
                    "title": "Laptop",
                    "specs": [{ "v": null }, { "v": null }]
                }]
            }])
        );
    }

    #[test]
    fn empty_html_yields_nulls() {
        let schema = schema_from(&json!({
            "fields": [{ "name": "t", "selector": "h1", "type": "text" }]
        }));
        assert_eq!(schema.extract_from(""), json!({ "t": null }));
    }

    #[test]
    fn rejects_excessive_nesting_depth() {
        let mut kind = FieldKind::Text;
        // Build the chain inside-out: start at the innermost leaf and wrap it
        // in nested lists until the limit is exceeded by a few levels.
        for i in (0..MAX_NESTING_DEPTH + 5).rev() {
            kind = FieldKind::NestedList {
                fields: vec![ExtractField::new(format!("l{i}"), "*", kind)],
            };
        }
        let err = ExtractSchema::builder()
            .field("root", "*", kind)
            .build()
            .unwrap_err();
        assert!(matches!(
            err,
            SchemaError::TooDeep { depth, max, .. } if depth > max && max == MAX_NESTING_DEPTH
        ));
    }

    #[test]
    fn accepts_nesting_at_depth_limit() {
        let mut kind = FieldKind::Text;
        // `root` sits at depth 0, so the innermost field ends up at exactly
        // MAX_NESTING_DEPTH, which is still allowed.
        for i in (0..MAX_NESTING_DEPTH).rev() {
            kind = FieldKind::NestedList {
                fields: vec![ExtractField::new(format!("l{i}"), "*", kind)],
            };
        }
        let result = ExtractSchema::builder().field("root", "*", kind).build();
        assert!(result.is_ok());
    }

    #[test]
    fn accessors_expose_schema_contents() {
        let schema = ExtractSchema::builder()
            .base_selector(".product")
            .field("title", "h2", FieldKind::Text)
            .field(
                "url",
                "a",
                FieldKind::Attribute {
                    attribute: "href".into(),
                },
            )
            .build()
            .unwrap();

        assert_eq!(schema.base_selector(), Some(".product"));
        assert_eq!(schema.fields().len(), 2);
        assert_eq!(schema.fields()[0].name(), "title");
        assert_eq!(schema.fields()[0].selector(), "h2");
        assert!(matches!(schema.fields()[0].kind(), FieldKind::Text));
        assert_eq!(schema.fields()[1].name(), "url");
        assert!(matches!(
            schema.fields()[1].kind(),
            FieldKind::Attribute { attribute } if attribute == "href"
        ));
    }
}