Skip to main content

html2json/
lib.rs

//! html2json - HTML to JSON extractor using html5ever
//!
//! A Rust port of cheerio-json-mapper using html5ever for HTML parsing.
//!
//! # Overview
//!
//! This library extracts structured JSON data from HTML using CSS selectors
//! defined in a JSON spec format.
//!
//! # Basic Example
//!
//! ```no_run
//! use html2json::{extract, Spec};
//!
//! let html = r#"<html><body><h1>Hello</h1><p class="desc">World</p></body></html>"#;
//! let spec_json = r#"{"title": "h1", "description": "p.desc"}"#;
//! let spec: Spec = serde_json::from_str(spec_json)?;
//! let result = extract(html, &spec)?;
//! assert_eq!(result["title"], "Hello");
//! assert_eq!(result["description"], "World");
//! # Ok::<(), anyhow::Error>(())
//! ```
23
24pub mod dom;
25pub mod pipe;
26pub mod spec;
27
28pub use dom::Dom;
29pub use spec::Spec;
30
31use anyhow::Result;
32
33/// Extract JSON from HTML using a spec
34///
35/// # Arguments
36///
37/// * `html` - The HTML source to parse
38/// * `spec` - The extraction specification
39///
40/// # Example
41///
42/// ```
43/// use html2json::{extract, Spec};
44///
45/// let html = r#"<div class="item"><span>Price: $25.00</span></div>"#;
46/// let spec_json = r#"{"price": ".item span | regex:\\$(\\d+\\.\\d+)"}"#;
47/// let spec: Spec = serde_json::from_str(spec_json)?;
48/// let result = extract(html, &spec)?;
49/// assert_eq!(result["price"], "25.00");
50/// # Ok::<(), anyhow::Error>(())
51/// ```
52pub fn extract(html: &str, spec: &Spec) -> Result<serde_json::Value> {
53    let dom = Dom::parse(html)?;
54    dom.extract(spec)
55}
56
#[cfg(test)]
mod tests {
    use crate::extract;
    use crate::spec::Spec;
    use serde_json::Value;

    /// Hacker News page fixture (`examples/hn.html`) shared by most tests.
    const HTML: &str = include_str!("../examples/hn.html");
    /// RSS feed fixture (`examples/rss.xml`).
    const RSS_XML: &str = include_str!("../examples/rss.xml");

    /// Deserializes a JSON spec literal, panicking if the fixture is malformed.
    fn parse_spec(spec_json: &str) -> Spec {
        serde_json::from_str(spec_json).expect("spec fixture should deserialize into a Spec")
    }

    /// Parses `spec_json` and runs it against `html`, panicking on any failure.
    fn run(html: &str, spec_json: &str) -> Value {
        extract(html, &parse_spec(spec_json)).expect("extraction should succeed")
    }

    /// Runs `spec_json` against the shared Hacker News fixture.
    fn run_hn(spec_json: &str) -> Value {
        run(HTML, spec_json)
    }

    /// Returns the array stored under `key`, panicking if the value is not an array.
    fn array<'a>(result: &'a Value, key: &str) -> &'a [Value] {
        result[key]
            .as_array()
            .map(Vec::as_slice)
            .unwrap_or_else(|| panic!("`{}` should be an array", key))
    }

    /// Runs `spec_json` against `html` and compares the output with `expected_json`.
    fn assert_matches_fixture(html: &str, spec_json: &str, expected_json: &str) {
        let expected: Value =
            serde_json::from_str(expected_json).expect("expected fixture should be valid JSON");
        let result = run(html, spec_json);

        similar_asserts::assert_serde_eq!(expected, result);
    }

    #[test]
    fn basic_text_extraction() {
        let result = run_hn(
            r##"{
                "$": "title",
                "title": "$"
            }"##,
        );
        assert_eq!(result["title"], "Hacker News");
    }

    #[test]
    fn attribute_extraction() {
        let result = run_hn(
            r##"{
                "rss_link": "link[rel=alternate] | attr:href"
            }"##,
        );
        assert_eq!(result["rss_link"], "rss");
    }

    #[test]
    fn scoping_with_dollar() {
        let result = run_hn(
            r##"{
                "$": ".pagetop",
                "first_link": "a"
            }"##,
        );
        assert_eq!(result["first_link"], "Hacker News");
    }

    #[test]
    fn nested_scoping() {
        let result = run_hn(
            r##"{
                "$": "head",
                "head_element": {
                    "$": "link",
                    "href": "$ | attr:href",
                    "rel": "$ | attr:rel"
                }
            }"##,
        );
        assert_eq!(
            result["head_element"]["href"],
            "news.css?fFlkMoHAedK8lfBWEYBd"
        );
        assert_eq!(result["head_element"]["rel"], "stylesheet");
    }

    #[test]
    fn collection_extraction() {
        let result = run_hn(
            r##"{
                "ranks": [{
                    "$": ".rank",
                    "value": "$"
                }]
            }"##,
        );
        let arr = array(&result, "ranks");
        assert!(arr.len() >= 3);
        assert_eq!(arr[0]["value"], "1.");
        assert_eq!(arr[1]["value"], "2.");
        assert_eq!(arr[2]["value"], "3.");
    }

    #[test]
    fn collection_with_nested_properties() {
        let result = run_hn(
            r##"{
                "items": [{
                    "$": "tr.athing",
                    "id": "$ | attr:id",
                    "title": ".titleline a"
                }]
            }"##,
        );
        let arr = array(&result, "items");
        assert!(arr.len() >= 2);
        assert_eq!(arr[0]["id"], "46446815");
        assert_eq!(arr[0]["title"], "I canceled my book deal");
    }

    #[test]
    fn literal_values() {
        let result = run_hn(
            r##"{
                "source": "'html2json'",
                "version": 1.5,
                "active": true,
                "data": null
            }"##,
        );
        assert_eq!(result["source"], "html2json");
        assert_eq!(result["version"], 1.5);
        assert_eq!(result["active"], true);
        assert!(result["data"].is_null());
    }

    #[test]
    fn trim_pipe() {
        let result = run_hn(
            r##"{
                "title": "title | trim"
            }"##,
        );
        assert_eq!(result["title"], "Hacker News");
    }

    #[test]
    fn lowercase_pipe() {
        let result = run_hn(
            r##"{
                "title_lower": "title | lower"
            }"##,
        );
        assert_eq!(result["title_lower"], "hacker news");
    }

    #[test]
    fn uppercase_pipe() {
        let result = run_hn(
            r##"{
                "title_upper": "title | upper"
            }"##,
        );
        assert_eq!(result["title_upper"], "HACKER NEWS");
    }

    #[test]
    fn substring_pipe() {
        let result = run_hn(
            r##"{
                "partial": "title | substr:0:6"
            }"##,
        );
        assert_eq!(result["partial"], "Hacker");
    }

    #[test]
    fn parse_as_number_pipe() {
        let result = run_hn(
            r##"{
                "$": "#hnmain",
                "table_width": "$ | attr:width | regex:(\\d+) | parseAs:int"
            }"##,
        );
        assert_eq!(result["table_width"], 85);
    }

    #[test]
    fn regex_pipe() {
        let result = run_hn(
            r##"{
                "points": ".score | regex:(\\d+)\\s*points"
            }"##,
        );
        assert_eq!(result["points"], "156");
    }

    #[test]
    fn no_match_returns_null() {
        let result = run_hn(
            r##"{
                "missing": ".nonexistent-element",
                "present": "title"
            }"##,
        );
        assert!(result["missing"].is_null());
        assert_eq!(result["present"], "Hacker News");
    }

    #[test]
    fn empty_collection_returns_empty_array() {
        let result = run_hn(
            r##"{
                "items": [{
                    "$": ".nonexistent",
                    "value": "$"
                }]
            }"##,
        );
        assert!(array(&result, "items").is_empty());
    }

    #[test]
    fn multiple_attributes() {
        let result = run_hn(
            r##"{
                "lang": "html | attr:lang",
                "page_title": "title"
            }"##,
        );
        assert_eq!(result["lang"], "en");
        assert_eq!(result["page_title"], "Hacker News");
    }

    #[test]
    fn complex_nested_structure() {
        let result = run_hn(
            r##"{
                "$": "#hnmain",
                "submissions": [{
                    "$": "tr.athing",
                    "id": "$ | attr:id",
                    "title": ".titleline a"
                }]
            }"##,
        );
        let items = array(&result, "submissions");
        assert!(!items.is_empty());
        assert_eq!(items[0]["id"], "46446815");
        assert_eq!(items[0]["title"], "I canceled my book deal");
    }

    #[test]
    fn self_selector_in_collection() {
        let result = run_hn(
            r##"{
                "titles": [{
                    "$": ".titleline a",
                    "text": "$"
                }]
            }"##,
        );
        let arr = array(&result, "titles");
        assert!(arr.len() >= 2);
        assert_eq!(arr[0]["text"], "I canceled my book deal");
    }

    #[test]
    fn next_sibling_selector() {
        let result = run_hn(
            r##"{
                "$": "#hnmain",
                "items": [{
                    "$": "tr.athing",
                    "title": ".titleline a",
                    "score": "+ .subtext .score"
                }]
            }"##,
        );
        let items = array(&result, "items");
        assert!(!items.is_empty());
        assert_eq!(items[0]["title"], "I canceled my book deal");
        assert_eq!(items[0]["score"], "156 points");
    }

    #[test]
    fn void_pipe() {
        // The void pipe should work regardless of its position in the pipe chain
        let result = run(
            RSS_XML,
            r##"{
                "$": "channel",
                "link_trimmed": "link | void | trim",
                "link_lower": "link | void | lower"
            }"##,
        );
        assert_eq!(result["link_trimmed"], "https://example.com");
        assert_eq!(result["link_lower"], "https://example.com");
    }

    #[test]
    fn rss_feed_extraction() {
        assert_matches_fixture(
            RSS_XML,
            include_str!("../examples/rss.json"),
            include_str!("../examples/rss.expected.json"),
        );
    }

    #[test]
    fn hackernews_extraction() {
        assert_matches_fixture(
            HTML,
            include_str!("../examples/hn.json"),
            include_str!("../examples/hn.expected.json"),
        );
    }

    #[test]
    fn fallback_operator_first_selector_matches() {
        let html = r#"<html><body><h1 class="main">First</h1><h1 class="fallback">Second</h1></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1.main || h1.fallback"
            }"##,
        );
        assert_eq!(result["title"], "First");
    }

    #[test]
    fn fallback_operator_second_selector_matches() {
        let html = r#"<html><body><h1 class="fallback">Second</h1></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1.main || h1.fallback"
            }"##,
        );
        assert_eq!(result["title"], "Second");
    }

    #[test]
    fn fallback_operator_all_selectors_fail_returns_null() {
        let html = r#"<html><body><p>Some content</p></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1.main || h1.fallback || h1"
            }"##,
        );
        assert!(result["title"].is_null());
    }

    #[test]
    fn fallback_operator_with_pipes() {
        // The `upper` pipe is attached to the fallback alternative only, so a
        // match on the first selector is expected to come back unchanged.
        let html = r#"<html><body><h1 class="main">First</h1></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1.main || h1.fallback | upper"
            }"##,
        );
        assert_eq!(result["title"], "First");
    }

    #[test]
    fn fallback_operator_with_pipes_on_fallback() {
        let html = r#"<html><body><h1 class="fallback">Second</h1></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1.main || h1.fallback | upper"
            }"##,
        );
        assert_eq!(result["title"], "SECOND");
    }

    #[test]
    fn fallback_operator_multiple_options() {
        let html = r#"<html><body><h1 class="third">Third</h1></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1.first || h1.second || h1.third || h1.fourth"
            }"##,
        );
        assert_eq!(result["title"], "Third");
    }

    #[test]
    fn fallback_operator_empty_string_falls_back() {
        let html = r#"<html><body><h1 class="main"></h1><h1 class="fallback">Actual Content</h1></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1.main || h1.fallback"
            }"##,
        );
        assert_eq!(result["title"], "Actual Content");
    }

    #[test]
    fn fallback_operator_whitespace_only_falls_back() {
        let html = r#"<html><body><h1 class="main">   </h1><h1 class="fallback">Actual Content</h1></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1.main || h1.fallback"
            }"##,
        );
        assert_eq!(result["title"], "Actual Content");
    }

    #[test]
    fn fallback_operator_in_collection() {
        let html = r#"
            <html><body>
                <div class="item">
                    <h1 class="primary">First Item</h1>
                    <h1 class="secondary">First Fallback</h1>
                </div>
                <div class="item">
                    <h1 class="secondary">Second Item</h1>
                </div>
            </body></html>
        "#;
        let result = run(
            html,
            r##"{
                "items": [{
                    "$": ".item",
                    "title": "h1.primary || h1.secondary"
                }]
            }"##,
        );
        let arr = array(&result, "items");
        assert_eq!(arr.len(), 2);
        assert_eq!(arr[0]["title"], "First Item");
        assert_eq!(arr[1]["title"], "Second Item");
    }

    #[test]
    fn optional_field_removed_when_null() {
        let html = r#"<html><body><h1>Title</h1></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1",
                "missing?": ".nonexistent",
                "description": "p"
            }"##,
        );
        assert_eq!(result["title"], "Title");
        assert!(
            result.get("missing").is_none(),
            "Optional null field should be removed"
        );
        assert!(
            result.get("description").is_some(),
            "Non-optional null field should be present as null"
        );
        assert!(
            result["description"].is_null(),
            "Non-optional null field should be present as null"
        );
    }

    #[test]
    fn optional_field_kept_when_has_value() {
        let html = r#"<html><body><h1>Title</h1><p class="desc">Description</p></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1",
                "description?": "p.desc"
            }"##,
        );
        assert_eq!(result["title"], "Title");
        assert_eq!(result["description"], "Description");
    }

    #[test]
    fn optional_nested_object_removed_when_all_null() {
        let html = r#"<html><body><h1>Title</h1></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1",
                "metadata?": {
                    "author": ".author",
                    "date": ".date"
                }
            }"##,
        );
        assert_eq!(result["title"], "Title");
        assert!(
            result.get("metadata").is_none(),
            "Optional object with all null fields should be removed"
        );
    }

    #[test]
    fn optional_nested_object_kept_when_has_value() {
        let html = r#"<html><body><h1>Title</h1><span class="author">John</span></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1",
                "metadata?": {
                    "author": ".author",
                    "date": ".date"
                }
            }"##,
        );
        assert_eq!(result["title"], "Title");
        assert_eq!(result["metadata"]["author"], "John");
        assert!(
            result["metadata"].get("date").is_none(),
            "Nested null fields should be removed"
        );
    }

    // NOTE(review): despite the name, this test only asserts that the null
    // children are absent; it does not check whether `metadata` itself is
    // present in the output. Confirm the intended contract before tightening.
    #[test]
    fn non_optional_nested_object_kept_with_nulls() {
        let html = r#"<html><body><h1>Title</h1></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1",
                "metadata": {
                    "author": ".author",
                    "date": ".date"
                }
            }"##,
        );
        assert_eq!(result["title"], "Title");
        assert!(
            result["metadata"].get("author").is_none(),
            "Nested null fields should be removed recursively"
        );
        assert!(
            result["metadata"].get("date").is_none(),
            "Nested null fields should be removed recursively"
        );
    }

    #[test]
    fn optional_array_removed_when_empty() {
        let html = r#"<html><body><h1>Title</h1></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1",
                "items?": [{
                    "$": ".item",
                    "value": "$"
                }]
            }"##,
        );
        assert_eq!(result["title"], "Title");
        assert!(
            result.get("items").is_none(),
            "Optional empty array should be removed"
        );
    }

    #[test]
    fn optional_array_kept_when_has_items() {
        let html = r#"<html><body><h1>Title</h1><div class="item">Item 1</div></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1",
                "items?": [{
                    "$": ".item",
                    "value": "$"
                }]
            }"##,
        );
        assert_eq!(result["title"], "Title");
        assert_eq!(array(&result, "items").len(), 1);
        assert_eq!(result["items"][0]["value"], "Item 1");
    }

    #[test]
    fn recursive_null_filtering_in_nested_objects() {
        let html = r#"<html><body></body></html>"#;
        let result = run(
            html,
            r##"{
                "data?": {
                    "level1": {
                        "level2": {
                            "value": ".missing"
                        }
                    }
                }
            }"##,
        );
        // All nested objects should be removed since they're all null
        assert!(
            result.get("data").is_none(),
            "Optional nested object should be removed when all nested values are null"
        );
    }
}
700
// WASM bindings for JavaScript/TypeScript usage
#[cfg(feature = "wasm")]
pub mod wasm {
    use wasm_bindgen::prelude::*;

    /// Extract JSON from HTML using a spec
    ///
    /// Exported to JavaScript under the name `extract`.
    ///
    /// # Arguments
    ///
    /// * `html` - The HTML source to parse
    /// * `spec_json` - The extraction specification as JSON string
    ///
    /// # Returns
    ///
    /// A pretty-printed JSON string with the extracted data
    ///
    /// # Errors
    ///
    /// Returns a string `JsValue` error when:
    ///
    /// * `spec_json` cannot be deserialized into a spec
    ///   (message prefixed with `Invalid spec JSON:`),
    /// * HTML parsing or extraction fails
    ///   (message prefixed with `Extraction failed:`),
    /// * the extracted value cannot be serialized
    ///   (message prefixed with `JSON serialization failed:`).
    ///
    /// # Example
    ///
    /// ```javascript
    /// import { extract } from 'html2json';
    ///
    /// const html = '<div class="item"><span>Price: $25.00</span></div>';
    /// const spec = '{"price": ".item span | regex:\\\\$(\\\\d+\\\\.\\\\d+)"}';
    /// const result = extract(html, spec);
    /// console.log(result);
    /// // {
    /// //   "price": "25.00"
    /// // }
    /// ```
    #[wasm_bindgen(js_name = extract)]
    pub fn extract(html: &str, spec_json: &str) -> Result<String, JsValue> {
        use crate::spec::Spec;

        let spec: Spec = serde_json::from_str(spec_json)
            .map_err(|e| JsValue::from_str(&format!("Invalid spec JSON: {}", e)))?;

        // `crate::extract` returns an `anyhow::Error`; the alternate `{:#}` form
        // includes the chain of causes instead of only the outermost message.
        let result = crate::extract(html, &spec)
            .map_err(|e| JsValue::from_str(&format!("Extraction failed: {:#}", e)))?;

        serde_json::to_string_pretty(&result)
            .map_err(|e| JsValue::from_str(&format!("JSON serialization failed: {}", e)))
    }
}