html2json/
lib.rs

1//! html2json - HTML to JSON extractor using html5ever
2//!
3//! A Rust port of cheerio-json-mapper using html5ever for HTML parsing.
4//!
5//! # Overview
6//!
7//! This library extracts structured JSON data from HTML using CSS selectors
8//! defined in a JSON spec format.
9//!
10//! # Basic Example
11//!
12//! ```no_run
13//! use html2json::{extract, Spec};
14//!
15//! let html = r#"<html><body><h1>Hello</h1><p class="desc">World</p></body></html>"#;
16//! let spec_json = r#"{"title": "h1", "description": "p.desc"}"#;
17//! let spec: Spec = serde_json::from_str(spec_json)?;
18//! let result = extract(html, &spec)?;
19//! assert_eq!(result["title"], "Hello");
20//! assert_eq!(result["description"], "World");
21//! # Ok::<(), anyhow::Error>(())
22//! ```
23
24pub mod dom;
25pub mod extractor;
26pub mod pipe;
27pub mod spec;
28
29pub use extractor::Extractor;
30pub use spec::Spec;
31
32use anyhow::Result;
33
34/// Extract JSON from HTML using a spec
35///
36/// # Arguments
37///
38/// * `html` - The HTML source to parse
39/// * `spec` - The extraction specification
40///
41/// # Example
42///
43/// ```
44/// use html2json::{extract, Spec};
45///
46/// let html = r#"<div class="item"><span>Price: $25.00</span></div>"#;
47/// let spec_json = r#"{"price": ".item span | regex:\\$(\\d+\\.\\d+)"}"#;
48/// let spec: Spec = serde_json::from_str(spec_json)?;
49/// let result = extract(html, &spec)?;
50/// assert_eq!(result["price"], "25.00");
51/// # Ok::<(), anyhow::Error>(())
52/// ```
53pub fn extract(html: &str, spec: &Spec) -> Result<serde_json::Value> {
54    let extractor = Extractor::new(html)?;
55    extractor.extract(spec)
56}
57
#[cfg(test)]
mod tests {
    use crate::extract;
    use crate::spec::Spec;

    /// Hacker News front-page snapshot used as the input for most tests.
    const HTML: &str = include_str!("../examples/hn.html");

    /// Parses `spec_json` into a [`Spec`] and applies it to `source`.
    ///
    /// Panics if the spec does not deserialize or if extraction fails, which
    /// fails the calling test.
    fn run_spec(source: &str, spec_json: &str) -> serde_json::Value {
        let spec: Spec = serde_json::from_str(spec_json).unwrap();
        extract(source, &spec).unwrap()
    }

    /// Applies the spec in `spec_json` to `source` and compares the output
    /// with the JSON document in `expected_json`.
    fn assert_matches_fixture(source: &str, spec_json: &str, expected_json: &str) {
        let spec: Spec = serde_json::from_str(spec_json).unwrap();
        let expected: serde_json::Value = serde_json::from_str(expected_json).unwrap();
        let result = extract(source, &spec).unwrap();

        similar_asserts::assert_serde_eq!(expected, result);
    }

    /// A `$` value refers to the element selected by the `$` scope key.
    #[test]
    fn basic_text_extraction() {
        let out = run_spec(
            HTML,
            r##"{
                "$": "title",
                "title": "$"
            }"##,
        );
        assert_eq!(out["title"], "Hacker News");
    }

    /// `attr:<name>` yields the attribute value of the selected element.
    #[test]
    fn attribute_extraction() {
        let out = run_spec(
            HTML,
            r##"{
                "rss_link": "link[rel=alternate] | attr:href"
            }"##,
        );
        assert_eq!(out["rss_link"], "rss");
    }

    /// Selectors of sibling keys are evaluated inside the `$` scope.
    #[test]
    fn scoping_with_dollar() {
        let out = run_spec(
            HTML,
            r##"{
                "$": ".pagetop",
                "first_link": "a"
            }"##,
        );
        assert_eq!(out["first_link"], "Hacker News");
    }

    /// A nested object may declare its own `$` scope inside the parent scope.
    #[test]
    fn nested_scoping() {
        let out = run_spec(
            HTML,
            r##"{
                "$": "head",
                "head_element": {
                    "$": "link",
                    "href": "$ | attr:href",
                    "rel": "$ | attr:rel"
                }
            }"##,
        );
        let head_element = &out["head_element"];
        assert_eq!(head_element["href"], "news.css?fFlkMoHAedK8lfBWEYBd");
        assert_eq!(head_element["rel"], "stylesheet");
    }

    /// An array spec produces one object per element matched by its `$`.
    #[test]
    fn collection_extraction() {
        let out = run_spec(
            HTML,
            r##"{
                "ranks": [{
                    "$": ".rank",
                    "value": "$"
                }]
            }"##,
        );
        let ranks = out["ranks"].as_array().unwrap();
        assert!(ranks.len() >= 3);
        for (rank, expected) in ranks.iter().zip(["1.", "2.", "3."].iter()) {
            assert_eq!(rank["value"], *expected);
        }
    }

    /// Collection items can mix self (`$`) lookups with descendant selectors.
    #[test]
    fn collection_with_nested_properties() {
        let out = run_spec(
            HTML,
            r##"{
                "items": [{
                    "$": "tr.athing",
                    "id": "$ | attr:id",
                    "title": ".titleline a"
                }]
            }"##,
        );
        let rows = out["items"].as_array().unwrap();
        assert!(rows.len() >= 2);
        let first = &rows[0];
        assert_eq!(first["id"], "46446815");
        assert_eq!(first["title"], "I canceled my book deal");
    }

    /// Quoted strings, numbers, booleans and null pass through as literals.
    #[test]
    fn literal_values() {
        let out = run_spec(
            HTML,
            r##"{
                "source": "'html2json'",
                "version": 1.5,
                "active": true,
                "data": null
            }"##,
        );
        assert_eq!(out["source"], "html2json");
        assert_eq!(out["version"], 1.5);
        assert_eq!(out["active"], true);
        assert!(out["data"].is_null());
    }

    /// The `trim` pipe leaves already-clean text as is.
    #[test]
    fn trim_pipe() {
        let out = run_spec(
            HTML,
            r##"{
                "title": "title | trim"
            }"##,
        );
        assert_eq!(out["title"], "Hacker News");
    }

    /// The `lower` pipe lowercases the extracted text.
    #[test]
    fn lowercase_pipe() {
        let out = run_spec(
            HTML,
            r##"{
                "title_lower": "title | lower"
            }"##,
        );
        assert_eq!(out["title_lower"], "hacker news");
    }

    /// The `upper` pipe uppercases the extracted text.
    #[test]
    fn uppercase_pipe() {
        let out = run_spec(
            HTML,
            r##"{
                "title_upper": "title | upper"
            }"##,
        );
        assert_eq!(out["title_upper"], "HACKER NEWS");
    }

    /// `substr:0:6` keeps the first six characters of the title.
    #[test]
    fn substring_pipe() {
        let out = run_spec(
            HTML,
            r##"{
                "partial": "title | substr:0:6"
            }"##,
        );
        assert_eq!(out["partial"], "Hacker");
    }

    /// Pipes chain left to right; `parseAs:int` yields a JSON number.
    #[test]
    fn parse_as_number_pipe() {
        let out = run_spec(
            HTML,
            r##"{
                "$": "#hnmain",
                "table_width": "$ | attr:width | regex:(\\d+) | parseAs:int"
            }"##,
        );
        assert_eq!(out["table_width"], 85);
    }

    /// The `regex` pipe returns the first capture group of the match.
    #[test]
    fn regex_pipe() {
        let out = run_spec(
            HTML,
            r##"{
                "points": ".score | regex:(\\d+)\\s*points"
            }"##,
        );
        assert_eq!(out["points"], "156");
    }

    /// A selector without matches yields null and does not affect other keys.
    #[test]
    fn no_match_returns_null() {
        let out = run_spec(
            HTML,
            r##"{
                "missing": ".nonexistent-element",
                "present": "title"
            }"##,
        );
        assert!(out["missing"].is_null());
        assert_eq!(out["present"], "Hacker News");
    }

    /// A collection whose `$` matches nothing is an empty array, not null.
    #[test]
    fn empty_collection_returns_empty_array() {
        let out = run_spec(
            HTML,
            r##"{
                "items": [{
                    "$": ".nonexistent",
                    "value": "$"
                }]
            }"##,
        );
        let items = out["items"].as_array().unwrap();
        assert!(items.is_empty());
    }

    /// Attribute and text lookups can sit side by side in one spec.
    #[test]
    fn multiple_attributes() {
        let out = run_spec(
            HTML,
            r##"{
                "lang": "html | attr:lang",
                "page_title": "title"
            }"##,
        );
        assert_eq!(out["lang"], "en");
        assert_eq!(out["page_title"], "Hacker News");
    }

    /// A collection nested under a scoped root object.
    #[test]
    fn complex_nested_structure() {
        let out = run_spec(
            HTML,
            r##"{
                "$": "#hnmain",
                "submissions": [{
                    "$": "tr.athing",
                    "id": "$ | attr:id",
                    "title": ".titleline a"
                }]
            }"##,
        );
        let submissions = out["submissions"].as_array().unwrap();
        assert!(!submissions.is_empty());
        let first = &submissions[0];
        assert_eq!(first["id"], "46446815");
        assert_eq!(first["title"], "I canceled my book deal");
    }

    /// Inside a collection, `$` is the text of the current matched element.
    #[test]
    fn self_selector_in_collection() {
        let out = run_spec(
            HTML,
            r##"{
                "titles": [{
                    "$": ".titleline a",
                    "text": "$"
                }]
            }"##,
        );
        let titles = out["titles"].as_array().unwrap();
        assert!(titles.len() >= 2);
        assert_eq!(titles[0]["text"], "I canceled my book deal");
    }

    /// A selector starting with `+` is resolved from the scope's next sibling.
    #[test]
    fn next_sibling_selector() {
        let out = run_spec(
            HTML,
            r##"{
                "$": "#hnmain",
                "items": [{
                    "$": "tr.athing",
                    "title": ".titleline a",
                    "score": "+ .subtext .score"
                }]
            }"##,
        );
        let items = out["items"].as_array().unwrap();
        assert!(!items.is_empty());
        let first = &items[0];
        assert_eq!(first["title"], "I canceled my book deal");
        assert_eq!(first["score"], "156 points");
    }

    /// `void` combined with a following string pipe on the RSS `link` element.
    #[test]
    fn void_pipe() {
        // The void pipe should work regardless of its position in the pipe chain
        let out = run_spec(
            include_str!("../examples/rss.xml"),
            r##"{
                "$": "channel",
                "link_trimmed": "link | void | trim",
                "link_lower": "link | void | lower"
            }"##,
        );
        assert_eq!(out["link_trimmed"], "https://example.com");
        assert_eq!(out["link_lower"], "https://example.com");
    }

    /// End-to-end check of the RSS example against its expected output.
    #[test]
    fn rss_feed_extraction() {
        assert_matches_fixture(
            include_str!("../examples/rss.xml"),
            include_str!("../examples/rss.json"),
            include_str!("../examples/rss.expected.json"),
        );
    }

    /// End-to-end check of the Hacker News example against its expected output.
    #[test]
    fn hackernews_extraction() {
        assert_matches_fixture(
            HTML,
            include_str!("../examples/hn.json"),
            include_str!("../examples/hn.expected.json"),
        );
    }
}