Skip to main content

servo_fetch/
schema.rs

1//! CSS-selector schema extraction.
2
3use dom_query::{Document, Matcher, Selection};
4use serde::{Deserialize, Serialize};
5use serde_json::{Map, Value};
6
/// Maximum `NestedList` nesting depth allowed in a schema.
///
/// Top-level fields are validated at depth 0 and each `NestedList` level adds
/// one; a field is rejected only when its depth is strictly greater than this
/// value.
const MAX_NESTING_DEPTH: usize = 64;
9
/// Schema parse or validation error.
///
/// Produced while loading a schema (JSON parsing, file I/O) and while
/// validating its selectors and nesting depth.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum SchemaError {
    /// A CSS selector failed to parse.
    ///
    /// For the schema-level base selector `field` is the literal `<base>`;
    /// for nested fields it is the dot-separated path of field names.
    #[error("invalid CSS selector '{selector}' in field '{field}'")]
    InvalidSelector {
        /// Field whose selector failed.
        field: String,
        /// The offending selector text.
        selector: String,
    },
    /// The schema JSON itself is malformed.
    #[error("failed to parse schema: {0}")]
    Parse(#[from] serde_json::Error),
    /// Failed to read the schema file from disk.
    #[error("failed to read schema file: {0}")]
    Io(#[from] std::io::Error),
    /// Schema nesting exceeds [`MAX_NESTING_DEPTH`].
    #[error("schema nesting too deep at field '{field}' ({depth} levels, max {max})")]
    TooDeep {
        /// Dotted path to the field that tripped the limit.
        field: String,
        /// Depth at which the limit was tripped.
        depth: usize,
        /// Maximum depth allowed.
        max: usize,
    },
}
39
/// Declarative extraction schema.
///
/// Describes which CSS selectors to evaluate against a document and how to
/// turn each match into a JSON value.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct ExtractSchema {
    /// Repeated container selector; each match produces one object.
    ///
    /// Optional when deserializing; the camelCase key `baseSelector` is
    /// accepted as an alias.
    #[serde(default, alias = "baseSelector")]
    pub(crate) base_selector: Option<String>,
    /// Fields to read from each container.
    pub(crate) fields: Vec<ExtractField>,
}
50
51impl ExtractSchema {
52    /// Parse a schema from JSON and validate every selector eagerly.
53    pub fn from_json(json: &str) -> Result<Self, SchemaError> {
54        let schema: Self = serde_json::from_str(json)?;
55        schema.validate()?;
56        Ok(schema)
57    }
58
59    /// Load a schema from a JSON file on disk.
60    pub fn from_path(path: impl AsRef<std::path::Path>) -> Result<Self, SchemaError> {
61        let content = std::fs::read_to_string(path)?;
62        Self::from_json(&content)
63    }
64
65    /// Start building a schema programmatically.
66    #[must_use]
67    pub fn builder() -> SchemaBuilder {
68        SchemaBuilder::default()
69    }
70
71    /// Validate every selector in the schema (including nested fields).
72    pub fn validate(&self) -> Result<(), SchemaError> {
73        if let Some(sel) = &self.base_selector {
74            check_selector("<base>", sel)?;
75        }
76        for f in &self.fields {
77            f.validate("", 0)?;
78        }
79        Ok(())
80    }
81
82    /// Repeated container selector, if any.
83    #[must_use]
84    pub fn base_selector(&self) -> Option<&str> {
85        self.base_selector.as_deref()
86    }
87
88    /// Fields defined in this schema.
89    #[must_use]
90    pub fn fields(&self) -> &[ExtractField] {
91        &self.fields
92    }
93}
94
/// A single field in an [`ExtractSchema`].
///
/// Pairs an output key with a selector and the kind of value to read from
/// the selected element(s).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct ExtractField {
    /// Output key for this field.
    pub(crate) name: String,
    /// CSS selector relative to the current container.
    ///
    /// An empty string is a sentinel meaning "the container element itself".
    pub(crate) selector: String,
    /// How to extract the value.
    ///
    /// Flattened, so the kind's `type` tag and its parameters sit next to
    /// `name` and `selector` in the serialized form.
    #[serde(flatten)]
    pub(crate) kind: FieldKind,
}
107
108impl ExtractField {
109    /// Construct a field programmatically.
110    pub fn new(name: impl Into<String>, selector: impl Into<String>, kind: FieldKind) -> Self {
111        Self {
112            name: name.into(),
113            selector: selector.into(),
114            kind,
115        }
116    }
117
118    /// Output key name for this field.
119    #[must_use]
120    pub fn name(&self) -> &str {
121        &self.name
122    }
123
124    /// CSS selector for this field.
125    #[must_use]
126    pub fn selector(&self) -> &str {
127        &self.selector
128    }
129
130    /// Extraction kind.
131    #[must_use]
132    pub fn kind(&self) -> &FieldKind {
133        &self.kind
134    }
135
136    fn validate(&self, parent: &str, depth: usize) -> Result<(), SchemaError> {
137        let path = if parent.is_empty() {
138            self.name.clone()
139        } else {
140            format!("{parent}.{}", self.name)
141        };
142        if depth > MAX_NESTING_DEPTH {
143            return Err(SchemaError::TooDeep {
144                field: path,
145                depth,
146                max: MAX_NESTING_DEPTH,
147            });
148        }
149        check_selector(&path, &self.selector)?;
150        if let FieldKind::NestedList { fields } = &self.kind {
151            for f in fields {
152                f.validate(&path, depth + 1)?;
153            }
154        }
155        Ok(())
156    }
157}
158
/// Builder for [`ExtractSchema`].
///
/// Starts empty (no base selector, no fields) via `Default`.
#[derive(Default, Debug, Clone)]
pub struct SchemaBuilder {
    // Container selector to place on the built schema, if one was set.
    base_selector: Option<String>,
    // Fields accumulated so far, in insertion order.
    fields: Vec<ExtractField>,
}
165
166impl SchemaBuilder {
167    /// Set the base (repeated container) selector.
168    #[must_use]
169    pub fn base_selector(mut self, selector: impl Into<String>) -> Self {
170        self.base_selector = Some(selector.into());
171        self
172    }
173
174    /// Add a field. Accepts any [`FieldKind`] variant.
175    #[must_use]
176    pub fn field(mut self, name: impl Into<String>, selector: impl Into<String>, kind: FieldKind) -> Self {
177        self.fields.push(ExtractField::new(name, selector, kind));
178        self
179    }
180
181    /// Finalize the schema, validating every selector eagerly.
182    pub fn build(self) -> Result<ExtractSchema, SchemaError> {
183        let schema = ExtractSchema {
184            base_selector: self.base_selector,
185            fields: self.fields,
186        };
187        schema.validate()?;
188        Ok(schema)
189    }
190}
191
/// What to read once a field selector matches.
///
/// Serialized as an internally tagged enum: the variant is chosen by a
/// `type` key whose value is the snake_case variant name (or one of the
/// aliases listed on the variants).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
#[non_exhaustive]
pub enum FieldKind {
    /// Descendant text of the first match.
    Text,
    /// Named attribute on the first match.
    ///
    /// Also accepted under the `type` value `attr`.
    #[serde(alias = "attr")]
    Attribute {
        /// Attribute name to read (e.g. `href`).
        attribute: String,
    },
    /// Outer HTML of the first match.
    Html,
    /// Inner HTML of the first match.
    ///
    /// Also accepted under the `type` value `innerHtml`.
    #[serde(alias = "innerHtml")]
    InnerHtml,
    /// Repeated sub-object per match, using nested field definitions.
    ///
    /// Also accepted under the `type` value `nestedList`.
    #[serde(alias = "nestedList")]
    NestedList {
        /// Nested field definitions.
        fields: Vec<ExtractField>,
    },
}
217
218fn check_selector(field: &str, selector: &str) -> Result<(), SchemaError> {
219    // Empty selector is a sentinel for "the matched element itself" and is
220    // intentionally not a valid CSS expression; skip parsing it.
221    if selector.is_empty() {
222        return Ok(());
223    }
224    Matcher::new(selector)
225        .map(|_| ())
226        .map_err(|_| SchemaError::InvalidSelector {
227            field: field.to_string(),
228            selector: selector.to_string(),
229        })
230}
231
232impl ExtractSchema {
233    /// Apply this schema to HTML, returning structured JSON.
234    #[must_use]
235    pub fn extract_from(&self, html: &str) -> Value {
236        let doc = Document::from(html);
237        let root = doc.select("html");
238
239        match &self.base_selector {
240            None => Value::Object(extract_fields(&root, &self.fields)),
241            Some(sel) => {
242                let items: Vec<Value> = doc
243                    .select(sel)
244                    .iter()
245                    .map(|container| Value::Object(extract_fields(&container, &self.fields)))
246                    .collect();
247                Value::Array(items)
248            }
249        }
250    }
251}
252
253fn extract_fields(container: &Selection<'_>, fields: &[ExtractField]) -> Map<String, Value> {
254    fields
255        .iter()
256        .map(|f| (f.name.clone(), extract_field(container, f)))
257        .collect()
258}
259
260fn extract_field(container: &Selection<'_>, field: &ExtractField) -> Value {
261    // An empty selector is a sentinel for "the matched element itself".
262    let picked = if field.selector.is_empty() {
263        container.clone()
264    } else {
265        container.select(&field.selector)
266    };
267    if !picked.exists() {
268        return Value::Null;
269    }
270    // Scalar kinds read the first match; dom_query concatenates across all matches by default.
271    match &field.kind {
272        FieldKind::Text => Value::String(picked.first().text().to_string()),
273        FieldKind::Attribute { attribute } => picked
274            .first()
275            .attr(attribute)
276            .map_or(Value::Null, |s| Value::String(s.to_string())),
277        FieldKind::Html => Value::String(picked.first().html().to_string()),
278        FieldKind::InnerHtml => Value::String(picked.first().inner_html().to_string()),
279        FieldKind::NestedList { fields } => Value::Array(
280            picked
281                .iter()
282                .map(|sub| Value::Object(extract_fields(&sub, fields)))
283                .collect(),
284        ),
285    }
286}
287
// Unit tests covering schema parsing, validation, the builder API, and
// extraction against small inline HTML fixtures.
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    // Two-product fixture shared by most extraction tests.
    const PRODUCTS: &str = r#"
        <html><body>
          <div class="product">
            <h2>Keyboard</h2>
            <span class="price">$99</span>
            <a href="/kbd">details</a>
            <img src="/kbd.png" alt="Keyboard">
          </div>
          <div class="product">
            <h2>Mouse</h2>
            <span class="price">$49</span>
            <a href="/mouse">details</a>
            <img src="/mouse.png" alt="Mouse">
          </div>
        </body></html>
    "#;

    // Parse (and validate) a schema from a JSON value, panicking if it is rejected.
    fn schema_from(json: &Value) -> ExtractSchema {
        ExtractSchema::from_json(&json.to_string()).expect("valid schema")
    }

    #[test]
    fn extracts_text_fields_over_base_selector() {
        let schema = schema_from(&json!({
            "base_selector": ".product",
            "fields": [
                { "name": "title", "selector": "h2", "type": "text" },
                { "name": "price", "selector": ".price", "type": "text" },
            ]
        }));
        assert_eq!(
            schema.extract_from(PRODUCTS),
            json!([
                { "title": "Keyboard", "price": "$99" },
                { "title": "Mouse", "price": "$49" }
            ])
        );
    }

    #[test]
    fn extracts_attribute() {
        let schema = schema_from(&json!({
            "base_selector": ".product",
            "fields": [
                { "name": "url", "selector": "a", "type": "attribute", "attribute": "href" },
                { "name": "image", "selector": "img", "type": "attribute", "attribute": "src" },
            ]
        }));
        assert_eq!(
            schema.extract_from(PRODUCTS),
            json!([
                { "url": "/kbd", "image": "/kbd.png" },
                { "url": "/mouse", "image": "/mouse.png" }
            ])
        );
    }

    #[test]
    fn extracts_html_and_inner_html() {
        let html = r#"<html><body><div class="card"><p><b>hi</b></p></div></body></html>"#;
        let schema = schema_from(&json!({
            "base_selector": ".card",
            "fields": [
                { "name": "outer", "selector": "p", "type": "html" },
                { "name": "inner", "selector": "p", "type": "inner_html" },
            ]
        }));
        assert_eq!(
            schema.extract_from(html),
            json!([{ "outer": "<p><b>hi</b></p>", "inner": "<b>hi</b>" }])
        );
    }

    // The nested `*` selector is resolved relative to each `li`; the `li`
    // elements here contain only text, so every `label` comes back null.
    #[test]
    fn nested_list_extracts_sub_objects() {
        let html = r#"
            <html><body>
              <div class="post">
                <h3>First</h3>
                <ul><li>a</li><li>b</li></ul>
              </div>
              <div class="post">
                <h3>Second</h3>
                <ul><li>c</li></ul>
              </div>
            </body></html>
        "#;
        let schema = schema_from(&json!({
            "base_selector": ".post",
            "fields": [
                { "name": "title", "selector": "h3", "type": "text" },
                { "name": "items", "selector": "li", "type": "nested_list",
                  "fields": [
                    { "name": "label", "selector": "*", "type": "text" }
                  ]
                }
            ]
        }));
        assert_eq!(
            schema.extract_from(html),
            json!([
                { "title": "First", "items": [{ "label": null }, { "label": null }] },
                { "title": "Second", "items": [{ "label": null }] }
            ])
        );
    }

    #[test]
    fn missing_field_yields_null() {
        let schema = schema_from(&json!({
            "base_selector": ".product",
            "fields": [
                { "name": "rating", "selector": ".rating", "type": "text" }
            ]
        }));
        assert_eq!(
            schema.extract_from(PRODUCTS),
            json!([{ "rating": null }, { "rating": null }])
        );
    }

    // Without a base selector the result is one object, and a scalar field
    // reads only the first of several matches.
    #[test]
    fn no_base_selector_returns_single_object() {
        let schema = schema_from(&json!({
            "fields": [
                { "name": "first_product", "selector": ".product h2", "type": "text" }
            ]
        }));
        assert_eq!(schema.extract_from(PRODUCTS), json!({ "first_product": "Keyboard" }));
    }

    // `baseSelector` and the `innerHtml` type tag are accepted as aliases.
    #[test]
    fn accepts_camelcase_keys() {
        let schema = schema_from(&json!({
            "baseSelector": ".product",
            "fields": [
                { "name": "t", "selector": "h2", "type": "text" },
                { "name": "raw", "selector": "p", "type": "innerHtml" }
            ]
        }));
        assert_eq!(schema.base_selector.as_deref(), Some(".product"));
        let arr_out = schema.extract_from(PRODUCTS);
        let arr = arr_out.as_array().unwrap();
        assert_eq!(arr[0]["t"], "Keyboard");
        assert_eq!(arr[0]["raw"], Value::Null);
    }

    #[test]
    fn rejects_malformed_selector_eagerly() {
        let json = json!({
            "base_selector": ".product",
            "fields": [
                { "name": "bad", "selector": "###not[[[valid", "type": "text" }
            ]
        });
        let err = ExtractSchema::from_json(&json.to_string()).unwrap_err();
        assert!(
            matches!(err, SchemaError::InvalidSelector { field, .. } if field == "bad"),
            "expected InvalidSelector error for field 'bad'"
        );
    }

    #[test]
    fn nested_invalid_selector_reports_dotted_path() {
        let json = json!({
            "fields": [{
                "name": "products",
                "selector": ".product",
                "type": "nested_list",
                "fields": [{
                    "name": "price",
                    "selector": ".price",
                    "type": "nested_list",
                    "fields": [{ "name": "amount", "selector": "###bad", "type": "text" }]
                }]
            }]
        });
        let err = ExtractSchema::from_json(&json.to_string()).unwrap_err();
        assert!(
            matches!(&err, SchemaError::InvalidSelector { field, .. } if field == "products.price.amount"),
            "expected dotted path, got: {err:?}"
        );
    }

    #[test]
    fn rejects_malformed_json() {
        let err = ExtractSchema::from_json("{ not json").unwrap_err();
        assert!(matches!(err, SchemaError::Parse(_)), "expected Parse error");
    }

    #[test]
    fn from_path_surfaces_io_error() {
        let err = ExtractSchema::from_path("/definitely/not/a/real/path.json").unwrap_err();
        assert!(matches!(err, SchemaError::Io(_)), "expected Io error, got {err:?}");
    }

    #[test]
    fn mixed_present_and_missing_fields() {
        let schema = schema_from(&json!({
            "base_selector": ".product",
            "fields": [
                { "name": "title", "selector": "h2", "type": "text" },
                { "name": "rating", "selector": ".rating", "type": "text" }
            ]
        }));
        assert_eq!(
            schema.extract_from(PRODUCTS),
            json!([
                { "title": "Keyboard", "rating": null },
                { "title": "Mouse", "rating": null }
            ])
        );
    }

    // An empty field selector refers to the container element itself.
    #[test]
    fn empty_selector_reads_matched_element_text() {
        let html = r"<html><body><ul><li>alpha</li><li>beta</li></ul></body></html>";
        let schema = schema_from(&json!({
            "base_selector": "li",
            "fields": [
                { "name": "value", "selector": "", "type": "text" }
            ]
        }));
        assert_eq!(
            schema.extract_from(html),
            json!([{ "value": "alpha" }, { "value": "beta" }])
        );
    }

    #[test]
    fn empty_selector_inside_nested_list_reads_each_item() {
        let html = r#"
            <html><body>
              <div class="post">
                <h3>First</h3>
                <ul><li>a</li><li>b</li></ul>
              </div>
            </body></html>
        "#;
        let schema = schema_from(&json!({
            "base_selector": ".post",
            "fields": [
                { "name": "title", "selector": "h3", "type": "text" },
                { "name": "items", "selector": "li", "type": "nested_list",
                  "fields": [{ "name": "text", "selector": "", "type": "text" }] }
            ]
        }));
        assert_eq!(
            schema.extract_from(html),
            json!([{
                "title": "First",
                "items": [{ "text": "a" }, { "text": "b" }]
            }])
        );
    }

    #[test]
    fn empty_selector_reads_matched_element_attribute() {
        let html = r#"<html><body><a href="/home" title="Home">Go</a></body></html>"#;
        let schema = schema_from(&json!({
            "base_selector": "a",
            "fields": [
                { "name": "href", "selector": "", "type": "attribute", "attribute": "href" },
                { "name": "title", "selector": "", "type": "attribute", "attribute": "title" }
            ]
        }));
        assert_eq!(schema.extract_from(html), json!([{ "href": "/home", "title": "Home" }]));
    }

    // A builder-made schema and its JSON equivalent must extract identically.
    #[test]
    fn builder_constructs_equivalent_schema() {
        let built = ExtractSchema::builder()
            .base_selector(".product")
            .field("title", "h2", FieldKind::Text)
            .field(
                "url",
                "a",
                FieldKind::Attribute {
                    attribute: "href".into(),
                },
            )
            .build()
            .unwrap();

        let json_schema = schema_from(&json!({
            "base_selector": ".product",
            "fields": [
                { "name": "title", "selector": "h2", "type": "text" },
                { "name": "url", "selector": "a", "type": "attribute", "attribute": "href" }
            ]
        }));

        assert_eq!(built.extract_from(PRODUCTS), json_schema.extract_from(PRODUCTS));
    }

    #[test]
    fn builder_supports_nested_list() {
        let schema = ExtractSchema::builder()
            .base_selector(".post")
            .field("title", "h3", FieldKind::Text)
            .field(
                "items",
                "li",
                FieldKind::NestedList {
                    fields: vec![ExtractField::new("text", "", FieldKind::Text)],
                },
            )
            .build()
            .unwrap();
        let html = r"<html><body><div class='post'><h3>A</h3><ul><li>one</li></ul></div></body></html>";
        assert_eq!(
            schema.extract_from(html),
            json!([{ "title": "A", "items": [{ "text": "one" }] }])
        );
    }

    #[test]
    fn builder_surfaces_selector_errors() {
        let err = ExtractSchema::builder()
            .field("bad", "###invalid[[[", FieldKind::Text)
            .build()
            .unwrap_err();
        assert!(
            matches!(&err, SchemaError::InvalidSelector { field, .. } if field == "bad"),
            "expected InvalidSelector, got {err:?}"
        );
    }

    // Unknown top-level keys (here `name`) do not cause a parse failure.
    #[test]
    fn ignores_unknown_top_level_fields() {
        let schema = schema_from(&json!({
            "name": "legacy-label",
            "base_selector": ".product",
            "fields": [
                { "name": "title", "selector": "h2", "type": "text" }
            ]
        }));
        assert_eq!(schema.base_selector.as_deref(), Some(".product"));
    }

    #[test]
    fn rejects_unknown_field_type_list() {
        let json = json!({
            "fields": [
                { "name": "items", "selector": "li", "type": "list", "fields": [] }
            ]
        });
        let err = ExtractSchema::from_json(&json.to_string()).unwrap_err();
        assert!(
            matches!(err, SchemaError::Parse(_)),
            "expected Parse error for unsupported 'list' type"
        );
    }

    #[test]
    fn works_on_html_fragment_without_wrappers() {
        let schema = schema_from(&json!({
            "fields": [
                { "name": "heading", "selector": "h1", "type": "text" }
            ]
        }));
        assert_eq!(schema.extract_from("<h1>Hello</h1>"), json!({ "heading": "Hello" }));
    }

    #[test]
    fn empty_fields_yields_empty_object() {
        let schema = schema_from(&json!({ "fields": [] }));
        assert_eq!(schema.extract_from(PRODUCTS), json!({}));
    }

    #[test]
    fn empty_fields_with_base_selector_yields_empty_objects() {
        let schema = schema_from(&json!({
            "base_selector": ".product",
            "fields": []
        }));
        assert_eq!(schema.extract_from(PRODUCTS), json!([{}, {}]));
    }

    #[test]
    fn base_selector_matches_nothing_yields_empty_array() {
        let schema = schema_from(&json!({
            "base_selector": ".does-not-exist",
            "fields": [
                { "name": "title", "selector": "h2", "type": "text" }
            ]
        }));
        assert_eq!(schema.extract_from(PRODUCTS), json!([]));
    }

    // A nested list whose selector matches nothing is null, not an empty array.
    #[test]
    fn nested_list_with_zero_matches_yields_null() {
        let html = r#"<html><body><div class="post"><h3>Only</h3></div></body></html>"#;
        let schema = schema_from(&json!({
            "base_selector": ".post",
            "fields": [
                { "name": "title", "selector": "h3", "type": "text" },
                { "name": "items", "selector": ".missing", "type": "nested_list",
                  "fields": [{ "name": "label", "selector": "*", "type": "text" }] }
            ]
        }));
        assert_eq!(schema.extract_from(html), json!([{ "title": "Only", "items": null }]));
    }

    #[test]
    fn attribute_missing_but_element_present_yields_null() {
        let html = r"<html><body><a>no href</a></body></html>";
        let schema = schema_from(&json!({
            "fields": [
                { "name": "href", "selector": "a", "type": "attribute", "attribute": "href" }
            ]
        }));
        assert_eq!(schema.extract_from(html), json!({ "href": null }));
    }

    #[test]
    fn unicode_text_roundtrips() {
        let html = r"<html><body><h1>日本語 🦀</h1></body></html>";
        let schema = schema_from(&json!({
            "fields": [{ "name": "t", "selector": "h1", "type": "text" }]
        }));
        assert_eq!(schema.extract_from(html), json!({ "t": "日本語 🦀" }));
    }

    #[test]
    fn html_entities_are_decoded_in_text() {
        let html = r"<html><body><p>A &amp; B &lt; C</p></body></html>";
        let schema = schema_from(&json!({
            "fields": [{ "name": "t", "selector": "p", "type": "text" }]
        }));
        assert_eq!(schema.extract_from(html), json!({ "t": "A & B < C" }));
    }

    // As above, `*` under a text-only `li` matches nothing, so `v` is null.
    #[test]
    fn deeply_nested_three_levels() {
        let html = r#"
            <html><body>
              <div class="cat">
                <h2>Electronics</h2>
                <div class="prod">
                  <h3>Laptop</h3>
                  <ul class="specs"><li>16GB</li><li>1TB</li></ul>
                </div>
              </div>
            </body></html>
        "#;
        let schema = schema_from(&json!({
            "base_selector": ".cat",
            "fields": [
                { "name": "name", "selector": "h2", "type": "text" },
                { "name": "products", "selector": ".prod", "type": "nested_list",
                  "fields": [
                    { "name": "title", "selector": "h3", "type": "text" },
                    { "name": "specs", "selector": ".specs li", "type": "nested_list",
                      "fields": [{ "name": "v", "selector": "*", "type": "text" }] }
                  ] }
            ]
        }));
        assert_eq!(
            schema.extract_from(html),
            json!([{
                "name": "Electronics",
                "products": [{
                    "title": "Laptop",
                    "specs": [{ "v": null }, { "v": null }]
                }]
            }])
        );
    }

    #[test]
    fn empty_html_yields_nulls() {
        let schema = schema_from(&json!({
            "fields": [{ "name": "t", "selector": "h1", "type": "text" }]
        }));
        assert_eq!(schema.extract_from(""), json!({ "t": null }));
    }

    #[test]
    fn rejects_excessive_nesting_depth() {
        // Build a schema nested deeper than MAX_NESTING_DEPTH (64).
        let mut kind = FieldKind::Text;
        for i in (0..MAX_NESTING_DEPTH + 5).rev() {
            kind = FieldKind::NestedList {
                fields: vec![ExtractField::new(format!("l{i}"), "*", kind)],
            };
        }
        let err = ExtractSchema::builder().field("root", "*", kind).build().unwrap_err();
        assert!(matches!(
            err,
            SchemaError::TooDeep { depth, max, .. } if depth > max && max == MAX_NESTING_DEPTH
        ));
    }

    #[test]
    fn accepts_nesting_at_depth_limit() {
        // Build a schema exactly at MAX_NESTING_DEPTH nesting.
        let mut kind = FieldKind::Text;
        for i in (0..MAX_NESTING_DEPTH).rev() {
            kind = FieldKind::NestedList {
                fields: vec![ExtractField::new(format!("l{i}"), "*", kind)],
            };
        }
        let result = ExtractSchema::builder().field("root", "*", kind).build();
        assert!(result.is_ok());
    }

    #[test]
    fn accessors_expose_schema_contents() {
        let schema = ExtractSchema::builder()
            .base_selector(".product")
            .field("title", "h2", FieldKind::Text)
            .field(
                "url",
                "a",
                FieldKind::Attribute {
                    attribute: "href".into(),
                },
            )
            .build()
            .unwrap();

        assert_eq!(schema.base_selector(), Some(".product"));
        assert_eq!(schema.fields().len(), 2);
        assert_eq!(schema.fields()[0].name(), "title");
        assert_eq!(schema.fields()[0].selector(), "h2");
        assert!(matches!(schema.fields()[0].kind(), FieldKind::Text));
        assert_eq!(schema.fields()[1].name(), "url");
        assert!(matches!(
            schema.fields()[1].kind(),
            FieldKind::Attribute { attribute } if attribute == "href"
        ));
    }
}
822            schema.fields()[1].kind(),
823            FieldKind::Attribute { attribute } if attribute == "href"
824        ));
825    }
826}