Skip to main content

servo_fetch/
schema.rs

1//! CSS-selector schema extraction.
2
3use dom_query::{Document, Matcher, Selection};
4use serde::{Deserialize, Serialize};
5use serde_json::{Map, Value};
6
/// Maximum nesting depth allowed for `NestedList` schemas.
///
/// Top-level fields sit at depth 0 and every enclosing `NestedList` adds one
/// level; validation rejects any field whose depth is greater than this value
/// with [`SchemaError::TooDeep`].
pub const MAX_NESTING_DEPTH: usize = 64;
9
/// Schema parse or validation error.
///
/// Returned by [`ExtractSchema::from_json`], [`ExtractSchema::from_path`],
/// [`ExtractSchema::validate`] and [`SchemaBuilder::build`].
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum SchemaError {
    /// A CSS selector failed to parse.
    #[error("invalid CSS selector '{selector}' in field '{field}'")]
    InvalidSelector {
        /// Field whose selector failed: the dotted path of the field for
        /// nested fields, or `<base>` for the schema's base selector.
        field: String,
        /// The offending selector text.
        selector: String,
    },
    /// The schema JSON itself is malformed (converted from the `serde_json` error).
    #[error("failed to parse schema: {0}")]
    Parse(#[from] serde_json::Error),
    /// Failed to read the schema file from disk (converted from the I/O error).
    #[error("failed to read schema file: {0}")]
    Io(#[from] std::io::Error),
    /// Schema nesting exceeds [`MAX_NESTING_DEPTH`].
    #[error("schema nesting too deep at field '{field}' ({depth} levels, max {max})")]
    TooDeep {
        /// Dotted path to the field that tripped the limit.
        field: String,
        /// Depth at which the limit was tripped (top-level fields are depth 0).
        depth: usize,
        /// Maximum depth allowed.
        max: usize,
    },
}
39
/// Declarative extraction schema.
///
/// Without a base selector, extraction produces a single JSON object read from
/// the document root; with one, it produces an array holding one object per
/// element matched by the base selector.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct ExtractSchema {
    /// Repeated container selector; each match produces one object.
    ///
    /// Optional in JSON, and also accepted under the key `baseSelector`.
    #[serde(default, alias = "baseSelector")]
    pub(crate) base_selector: Option<String>,
    /// Fields to read from each container.
    pub(crate) fields: Vec<ExtractField>,
}
50
51impl ExtractSchema {
52    /// Parse a schema from JSON and validate every selector eagerly.
53    pub fn from_json(json: &str) -> Result<Self, SchemaError> {
54        let schema: Self = serde_json::from_str(json)?;
55        schema.validate()?;
56        Ok(schema)
57    }
58
59    /// Load a schema from a JSON file on disk.
60    pub fn from_path(path: impl AsRef<std::path::Path>) -> Result<Self, SchemaError> {
61        let content = std::fs::read_to_string(path)?;
62        Self::from_json(&content)
63    }
64
65    /// Start building a schema programmatically.
66    #[must_use]
67    pub fn builder() -> SchemaBuilder {
68        SchemaBuilder::default()
69    }
70
71    /// Validate every selector in the schema (including nested fields).
72    pub fn validate(&self) -> Result<(), SchemaError> {
73        if let Some(sel) = &self.base_selector {
74            check_selector("<base>", sel)?;
75        }
76        for f in &self.fields {
77            f.validate("", 0)?;
78        }
79        Ok(())
80    }
81
82    /// Repeated container selector, if any.
83    #[must_use]
84    pub fn base_selector(&self) -> Option<&str> {
85        self.base_selector.as_deref()
86    }
87
88    /// Fields defined in this schema.
89    #[must_use]
90    pub fn fields(&self) -> &[ExtractField] {
91        &self.fields
92    }
93}
94
/// A single field in an [`ExtractSchema`].
///
/// In JSON the extraction kind is flattened into the field object, so its
/// `type` tag (and any kind-specific keys) sit next to `name` and `selector`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct ExtractField {
    /// Output key for this field.
    pub(crate) name: String,
    /// CSS selector relative to the current container.
    ///
    /// An empty string selects the container element itself.
    pub(crate) selector: String,
    /// How to extract the value.
    #[serde(flatten)]
    pub(crate) kind: FieldKind,
}
107
108impl ExtractField {
109    /// Construct a field programmatically.
110    pub fn new(name: impl Into<String>, selector: impl Into<String>, kind: FieldKind) -> Self {
111        Self {
112            name: name.into(),
113            selector: selector.into(),
114            kind,
115        }
116    }
117
118    /// Output key name for this field.
119    #[must_use]
120    pub fn name(&self) -> &str {
121        &self.name
122    }
123
124    /// CSS selector for this field.
125    #[must_use]
126    pub fn selector(&self) -> &str {
127        &self.selector
128    }
129
130    /// Extraction kind.
131    #[must_use]
132    pub fn kind(&self) -> &FieldKind {
133        &self.kind
134    }
135
136    fn validate(&self, parent: &str, depth: usize) -> Result<(), SchemaError> {
137        let path = if parent.is_empty() {
138            self.name.clone()
139        } else {
140            format!("{parent}.{}", self.name)
141        };
142        if depth > MAX_NESTING_DEPTH {
143            return Err(SchemaError::TooDeep {
144                field: path,
145                depth,
146                max: MAX_NESTING_DEPTH,
147            });
148        }
149        check_selector(&path, &self.selector)?;
150        if let FieldKind::NestedList { fields } = &self.kind {
151            for f in fields {
152                f.validate(&path, depth + 1)?;
153            }
154        }
155        Ok(())
156    }
157}
158
/// Builder for [`ExtractSchema`].
///
/// Obtained from [`ExtractSchema::builder`]. Nothing is validated until
/// [`SchemaBuilder::build`] is called.
#[derive(Default, Debug, Clone)]
pub struct SchemaBuilder {
    // Base selector set so far, if any.
    base_selector: Option<String>,
    // Fields in the order they were added.
    fields: Vec<ExtractField>,
}
165
166impl SchemaBuilder {
167    /// Set the base (repeated container) selector.
168    #[must_use]
169    pub fn base_selector(mut self, selector: impl Into<String>) -> Self {
170        self.base_selector = Some(selector.into());
171        self
172    }
173
174    /// Add a field. Accepts any [`FieldKind`] variant.
175    #[must_use]
176    pub fn field(mut self, name: impl Into<String>, selector: impl Into<String>, kind: FieldKind) -> Self {
177        self.fields.push(ExtractField::new(name, selector, kind));
178        self
179    }
180
181    /// Finalize the schema, validating every selector eagerly.
182    pub fn build(self) -> Result<ExtractSchema, SchemaError> {
183        let schema = ExtractSchema {
184            base_selector: self.base_selector,
185            fields: self.fields,
186        };
187        schema.validate()?;
188        Ok(schema)
189    }
190}
191
/// What to read once a field selector matches.
///
/// Serialized as an internally tagged enum: the `type` key carries the
/// snake_case variant name (`text`, `attribute`, `html`, `inner_html`,
/// `nested_list`). A few alternative spellings are accepted on input.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
#[non_exhaustive]
pub enum FieldKind {
    /// Descendant text of the first match.
    Text,
    /// Named attribute on the first match. Also accepted as `attr`.
    #[serde(alias = "attr")]
    Attribute {
        /// Attribute name to read (e.g. `href`).
        attribute: String,
    },
    /// Outer HTML of the first match.
    Html,
    /// Inner HTML of the first match. Also accepted as `innerHtml`.
    #[serde(alias = "innerHtml")]
    InnerHtml,
    /// Repeated sub-object per match, using nested field definitions.
    /// Also accepted as `nestedList`.
    #[serde(alias = "nestedList")]
    NestedList {
        /// Nested field definitions, evaluated relative to each match.
        fields: Vec<ExtractField>,
    },
}
217
218fn check_selector(field: &str, selector: &str) -> Result<(), SchemaError> {
219    // Empty selector is a sentinel for "the matched element itself" and is
220    // intentionally not a valid CSS expression; skip parsing it.
221    if selector.is_empty() {
222        return Ok(());
223    }
224    Matcher::new(selector)
225        .map(|_| ())
226        .map_err(|_| SchemaError::InvalidSelector {
227            field: field.to_string(),
228            selector: selector.to_string(),
229        })
230}
231
232impl ExtractSchema {
233    /// Apply this schema to HTML, returning structured JSON.
234    #[must_use]
235    pub fn extract_from(&self, html: &str) -> Value {
236        let doc = Document::from(html);
237        let root = doc.select("html");
238
239        match &self.base_selector {
240            None => Value::Object(extract_fields(&root, &self.fields)),
241            Some(sel) => {
242                let items: Vec<Value> = doc
243                    .select(sel)
244                    .iter()
245                    .map(|container| Value::Object(extract_fields(&container, &self.fields)))
246                    .collect();
247                Value::Array(items)
248            }
249        }
250    }
251}
252
253fn extract_fields(container: &Selection<'_>, fields: &[ExtractField]) -> Map<String, Value> {
254    fields
255        .iter()
256        .map(|f| (f.name.clone(), extract_field(container, f)))
257        .collect()
258}
259
260fn extract_field(container: &Selection<'_>, field: &ExtractField) -> Value {
261    // An empty selector is a sentinel for "the matched element itself".
262    let picked = if field.selector.is_empty() {
263        container.clone()
264    } else {
265        container.select(&field.selector)
266    };
267    if !picked.exists() {
268        return Value::Null;
269    }
270    // Scalar kinds read the first match; dom_query concatenates across all matches by default.
271    match &field.kind {
272        FieldKind::Text => Value::String(picked.first().text().to_string()),
273        FieldKind::Attribute { attribute } => picked
274            .first()
275            .attr(attribute)
276            .map_or(Value::Null, |s| Value::String(s.to_string())),
277        FieldKind::Html => Value::String(picked.first().html().to_string()),
278        FieldKind::InnerHtml => Value::String(picked.first().inner_html().to_string()),
279        FieldKind::NestedList { fields } => Value::Array(
280            picked
281                .iter()
282                .map(|sub| Value::Object(extract_fields(&sub, fields)))
283                .collect(),
284        ),
285    }
286}
287
#[cfg(test)]
mod tests {
    use serde_json::json;

    use super::*;

    /// Catalogue page with two `.product` cards, shared by most tests.
    const PRODUCTS: &str = r#"
        <html><body>
          <div class="product">
            <h2>Keyboard</h2>
            <span class="price">$99</span>
            <a href="/kbd">details</a>
            <img src="/kbd.png" alt="Keyboard">
          </div>
          <div class="product">
            <h2>Mouse</h2>
            <span class="price">$49</span>
            <a href="/mouse">details</a>
            <img src="/mouse.png" alt="Mouse">
          </div>
        </body></html>
    "#;

    /// Parse `json` into a validated schema, panicking on any schema error.
    fn schema_from(json: &Value) -> ExtractSchema {
        let text = json.to_string();
        ExtractSchema::from_json(&text).expect("valid schema")
    }

    /// Parse `schema`, apply it to `html` and compare the output with `expected`.
    #[track_caller]
    fn assert_extracts(schema: &Value, html: &str, expected: &Value) {
        let actual = schema_from(schema).extract_from(html);
        assert_eq!(&actual, expected);
    }

    /// Parse `json` as a schema and return the error it is expected to produce.
    fn schema_error(json: &Value) -> SchemaError {
        ExtractSchema::from_json(&json.to_string()).unwrap_err()
    }

    /// Wrap a text field in `levels` nested lists named `l0` (outermost)
    /// through `l{levels - 1}` (innermost), each selecting `*`.
    fn nested_chain(levels: usize) -> FieldKind {
        (0..levels).rev().fold(FieldKind::Text, |inner, i| FieldKind::NestedList {
            fields: vec![ExtractField::new(format!("l{i}"), "*", inner)],
        })
    }

    #[test]
    fn extracts_text_fields_over_base_selector() {
        let schema = json!({
            "base_selector": ".product",
            "fields": [
                { "name": "title", "selector": "h2", "type": "text" },
                { "name": "price", "selector": ".price", "type": "text" },
            ]
        });
        let expected = json!([
            { "title": "Keyboard", "price": "$99" },
            { "title": "Mouse", "price": "$49" }
        ]);
        assert_extracts(&schema, PRODUCTS, &expected);
    }

    #[test]
    fn extracts_attribute() {
        let schema = json!({
            "base_selector": ".product",
            "fields": [
                { "name": "url", "selector": "a", "type": "attribute", "attribute": "href" },
                { "name": "image", "selector": "img", "type": "attribute", "attribute": "src" },
            ]
        });
        let expected = json!([
            { "url": "/kbd", "image": "/kbd.png" },
            { "url": "/mouse", "image": "/mouse.png" }
        ]);
        assert_extracts(&schema, PRODUCTS, &expected);
    }

    #[test]
    fn extracts_html_and_inner_html() {
        let html = r#"<html><body><div class="card"><p><b>hi</b></p></div></body></html>"#;
        let schema = json!({
            "base_selector": ".card",
            "fields": [
                { "name": "outer", "selector": "p", "type": "html" },
                { "name": "inner", "selector": "p", "type": "inner_html" },
            ]
        });
        let expected = json!([{ "outer": "<p><b>hi</b></p>", "inner": "<b>hi</b>" }]);
        assert_extracts(&schema, html, &expected);
    }

    #[test]
    fn nested_list_extracts_sub_objects() {
        let html = r#"
            <html><body>
              <div class="post">
                <h3>First</h3>
                <ul><li>a</li><li>b</li></ul>
              </div>
              <div class="post">
                <h3>Second</h3>
                <ul><li>c</li></ul>
              </div>
            </body></html>
        "#;
        let schema = json!({
            "base_selector": ".post",
            "fields": [
                { "name": "title", "selector": "h3", "type": "text" },
                { "name": "items", "selector": "li", "type": "nested_list",
                  "fields": [
                    { "name": "label", "selector": "*", "type": "text" }
                  ]
                }
            ]
        });
        // `*` looks for elements inside each `li`; the items hold only text,
        // so every label comes back as null.
        let expected = json!([
            { "title": "First", "items": [{ "label": null }, { "label": null }] },
            { "title": "Second", "items": [{ "label": null }] }
        ]);
        assert_extracts(&schema, html, &expected);
    }

    #[test]
    fn missing_field_yields_null() {
        let schema = json!({
            "base_selector": ".product",
            "fields": [
                { "name": "rating", "selector": ".rating", "type": "text" }
            ]
        });
        let expected = json!([{ "rating": null }, { "rating": null }]);
        assert_extracts(&schema, PRODUCTS, &expected);
    }

    #[test]
    fn no_base_selector_returns_single_object() {
        let schema = json!({
            "fields": [
                { "name": "first_product", "selector": ".product h2", "type": "text" }
            ]
        });
        assert_extracts(&schema, PRODUCTS, &json!({ "first_product": "Keyboard" }));
    }

    #[test]
    fn accepts_camelcase_keys() {
        let schema = schema_from(&json!({
            "baseSelector": ".product",
            "fields": [
                { "name": "t", "selector": "h2", "type": "text" },
                { "name": "raw", "selector": "p", "type": "innerHtml" }
            ]
        }));
        assert_eq!(schema.base_selector.as_deref(), Some(".product"));

        let output = schema.extract_from(PRODUCTS);
        let first = &output.as_array().unwrap()[0];
        assert_eq!(first["t"], "Keyboard");
        assert_eq!(first["raw"], Value::Null);
    }

    #[test]
    fn rejects_malformed_selector_eagerly() {
        let err = schema_error(&json!({
            "base_selector": ".product",
            "fields": [
                { "name": "bad", "selector": "###not[[[valid", "type": "text" }
            ]
        }));
        assert!(
            matches!(err, SchemaError::InvalidSelector { field, .. } if field == "bad"),
            "expected InvalidSelector error for field 'bad'"
        );
    }

    #[test]
    fn nested_invalid_selector_reports_dotted_path() {
        let err = schema_error(&json!({
            "fields": [{
                "name": "products",
                "selector": ".product",
                "type": "nested_list",
                "fields": [{
                    "name": "price",
                    "selector": ".price",
                    "type": "nested_list",
                    "fields": [{ "name": "amount", "selector": "###bad", "type": "text" }]
                }]
            }]
        }));
        assert!(
            matches!(&err, SchemaError::InvalidSelector { field, .. } if field == "products.price.amount"),
            "expected dotted path, got: {err:?}"
        );
    }

    #[test]
    fn rejects_malformed_json() {
        let err = ExtractSchema::from_json("{ not json").unwrap_err();
        assert!(matches!(err, SchemaError::Parse(_)), "expected Parse error");
    }

    #[test]
    fn from_path_surfaces_io_error() {
        let err = ExtractSchema::from_path("/definitely/not/a/real/path.json").unwrap_err();
        assert!(matches!(err, SchemaError::Io(_)), "expected Io error, got {err:?}");
    }

    #[test]
    fn mixed_present_and_missing_fields() {
        let schema = json!({
            "base_selector": ".product",
            "fields": [
                { "name": "title", "selector": "h2", "type": "text" },
                { "name": "rating", "selector": ".rating", "type": "text" }
            ]
        });
        let expected = json!([
            { "title": "Keyboard", "rating": null },
            { "title": "Mouse", "rating": null }
        ]);
        assert_extracts(&schema, PRODUCTS, &expected);
    }

    #[test]
    fn empty_selector_reads_matched_element_text() {
        let html = r"<html><body><ul><li>alpha</li><li>beta</li></ul></body></html>";
        let schema = json!({
            "base_selector": "li",
            "fields": [
                { "name": "value", "selector": "", "type": "text" }
            ]
        });
        let expected = json!([{ "value": "alpha" }, { "value": "beta" }]);
        assert_extracts(&schema, html, &expected);
    }

    #[test]
    fn empty_selector_inside_nested_list_reads_each_item() {
        let html = r#"
            <html><body>
              <div class="post">
                <h3>First</h3>
                <ul><li>a</li><li>b</li></ul>
              </div>
            </body></html>
        "#;
        let schema = json!({
            "base_selector": ".post",
            "fields": [
                { "name": "title", "selector": "h3", "type": "text" },
                { "name": "items", "selector": "li", "type": "nested_list",
                  "fields": [{ "name": "text", "selector": "", "type": "text" }] }
            ]
        });
        let expected = json!([{
            "title": "First",
            "items": [{ "text": "a" }, { "text": "b" }]
        }]);
        assert_extracts(&schema, html, &expected);
    }

    #[test]
    fn empty_selector_reads_matched_element_attribute() {
        let html = r#"<html><body><a href="/home" title="Home">Go</a></body></html>"#;
        let schema = json!({
            "base_selector": "a",
            "fields": [
                { "name": "href", "selector": "", "type": "attribute", "attribute": "href" },
                { "name": "title", "selector": "", "type": "attribute", "attribute": "title" }
            ]
        });
        assert_extracts(&schema, html, &json!([{ "href": "/home", "title": "Home" }]));
    }

    #[test]
    fn builder_constructs_equivalent_schema() {
        let href = FieldKind::Attribute {
            attribute: "href".into(),
        };
        let built = ExtractSchema::builder()
            .base_selector(".product")
            .field("title", "h2", FieldKind::Text)
            .field("url", "a", href)
            .build()
            .unwrap();

        let parsed = schema_from(&json!({
            "base_selector": ".product",
            "fields": [
                { "name": "title", "selector": "h2", "type": "text" },
                { "name": "url", "selector": "a", "type": "attribute", "attribute": "href" }
            ]
        }));

        assert_eq!(built.extract_from(PRODUCTS), parsed.extract_from(PRODUCTS));
    }

    #[test]
    fn builder_supports_nested_list() {
        let items = FieldKind::NestedList {
            fields: vec![ExtractField::new("text", "", FieldKind::Text)],
        };
        let schema = ExtractSchema::builder()
            .base_selector(".post")
            .field("title", "h3", FieldKind::Text)
            .field("items", "li", items)
            .build()
            .unwrap();
        let html = r"<html><body><div class='post'><h3>A</h3><ul><li>one</li></ul></div></body></html>";
        assert_eq!(
            schema.extract_from(html),
            json!([{ "title": "A", "items": [{ "text": "one" }] }])
        );
    }

    #[test]
    fn builder_surfaces_selector_errors() {
        let err = ExtractSchema::builder()
            .field("bad", "###invalid[[[", FieldKind::Text)
            .build()
            .unwrap_err();
        assert!(
            matches!(&err, SchemaError::InvalidSelector { field, .. } if field == "bad"),
            "expected InvalidSelector, got {err:?}"
        );
    }

    #[test]
    fn ignores_unknown_top_level_fields() {
        let schema = schema_from(&json!({
            "name": "legacy-label",
            "base_selector": ".product",
            "fields": [
                { "name": "title", "selector": "h2", "type": "text" }
            ]
        }));
        assert_eq!(schema.base_selector.as_deref(), Some(".product"));
    }

    #[test]
    fn rejects_unknown_field_type_list() {
        let err = schema_error(&json!({
            "fields": [
                { "name": "items", "selector": "li", "type": "list", "fields": [] }
            ]
        }));
        assert!(
            matches!(err, SchemaError::Parse(_)),
            "expected Parse error for unsupported 'list' type"
        );
    }

    #[test]
    fn works_on_html_fragment_without_wrappers() {
        let schema = json!({
            "fields": [
                { "name": "heading", "selector": "h1", "type": "text" }
            ]
        });
        assert_extracts(&schema, "<h1>Hello</h1>", &json!({ "heading": "Hello" }));
    }

    #[test]
    fn empty_fields_yields_empty_object() {
        assert_extracts(&json!({ "fields": [] }), PRODUCTS, &json!({}));
    }

    #[test]
    fn empty_fields_with_base_selector_yields_empty_objects() {
        let schema = json!({
            "base_selector": ".product",
            "fields": []
        });
        assert_extracts(&schema, PRODUCTS, &json!([{}, {}]));
    }

    #[test]
    fn base_selector_matches_nothing_yields_empty_array() {
        let schema = json!({
            "base_selector": ".does-not-exist",
            "fields": [
                { "name": "title", "selector": "h2", "type": "text" }
            ]
        });
        assert_extracts(&schema, PRODUCTS, &json!([]));
    }

    #[test]
    fn nested_list_with_zero_matches_yields_null() {
        let html = r#"<html><body><div class="post"><h3>Only</h3></div></body></html>"#;
        let schema = json!({
            "base_selector": ".post",
            "fields": [
                { "name": "title", "selector": "h3", "type": "text" },
                { "name": "items", "selector": ".missing", "type": "nested_list",
                  "fields": [{ "name": "label", "selector": "*", "type": "text" }] }
            ]
        });
        assert_extracts(&schema, html, &json!([{ "title": "Only", "items": null }]));
    }

    #[test]
    fn attribute_missing_but_element_present_yields_null() {
        let html = r"<html><body><a>no href</a></body></html>";
        let schema = json!({
            "fields": [
                { "name": "href", "selector": "a", "type": "attribute", "attribute": "href" }
            ]
        });
        assert_extracts(&schema, html, &json!({ "href": null }));
    }

    #[test]
    fn unicode_text_roundtrips() {
        let html = r"<html><body><h1>日本語 🦀</h1></body></html>";
        let schema = json!({
            "fields": [{ "name": "t", "selector": "h1", "type": "text" }]
        });
        assert_extracts(&schema, html, &json!({ "t": "日本語 🦀" }));
    }

    #[test]
    fn html_entities_are_decoded_in_text() {
        let html = r"<html><body><p>A &amp; B &lt; C</p></body></html>";
        let schema = json!({
            "fields": [{ "name": "t", "selector": "p", "type": "text" }]
        });
        assert_extracts(&schema, html, &json!({ "t": "A & B < C" }));
    }

    #[test]
    fn deeply_nested_three_levels() {
        let html = r#"
            <html><body>
              <div class="cat">
                <h2>Electronics</h2>
                <div class="prod">
                  <h3>Laptop</h3>
                  <ul class="specs"><li>16GB</li><li>1TB</li></ul>
                </div>
              </div>
            </body></html>
        "#;
        let schema = json!({
            "base_selector": ".cat",
            "fields": [
                { "name": "name", "selector": "h2", "type": "text" },
                { "name": "products", "selector": ".prod", "type": "nested_list",
                  "fields": [
                    { "name": "title", "selector": "h3", "type": "text" },
                    { "name": "specs", "selector": ".specs li", "type": "nested_list",
                      "fields": [{ "name": "v", "selector": "*", "type": "text" }] }
                  ] }
            ]
        });
        let expected = json!([{
            "name": "Electronics",
            "products": [{
                "title": "Laptop",
                "specs": [{ "v": null }, { "v": null }]
            }]
        }]);
        assert_extracts(&schema, html, &expected);
    }

    #[test]
    fn empty_html_yields_nulls() {
        let schema = json!({
            "fields": [{ "name": "t", "selector": "h1", "type": "text" }]
        });
        assert_extracts(&schema, "", &json!({ "t": null }));
    }

    #[test]
    fn rejects_excessive_nesting_depth() {
        // Build a schema nested deeper than MAX_NESTING_DEPTH (64).
        let kind = nested_chain(MAX_NESTING_DEPTH + 5);
        let err = ExtractSchema::builder().field("root", "*", kind).build().unwrap_err();
        assert!(matches!(
            err,
            SchemaError::TooDeep { depth, max, .. } if depth > max && max == MAX_NESTING_DEPTH
        ));
    }

    #[test]
    fn accepts_nesting_at_depth_limit() {
        // Build a schema exactly at MAX_NESTING_DEPTH nesting.
        let kind = nested_chain(MAX_NESTING_DEPTH);
        let result = ExtractSchema::builder().field("root", "*", kind).build();
        assert!(result.is_ok());
    }

    #[test]
    fn accessors_expose_schema_contents() {
        let href = FieldKind::Attribute {
            attribute: "href".into(),
        };
        let schema = ExtractSchema::builder()
            .base_selector(".product")
            .field("title", "h2", FieldKind::Text)
            .field("url", "a", href)
            .build()
            .unwrap();

        assert_eq!(schema.base_selector(), Some(".product"));

        let fields = schema.fields();
        assert_eq!(fields.len(), 2);

        let title = &fields[0];
        assert_eq!(title.name(), "title");
        assert_eq!(title.selector(), "h2");
        assert!(matches!(title.kind(), FieldKind::Text));

        let url = &fields[1];
        assert_eq!(url.name(), "url");
        assert!(matches!(
            url.kind(),
            FieldKind::Attribute { attribute } if attribute == "href"
        ));
    }
}