1pub mod dom;
25pub mod pipe;
26pub mod spec;
27
28pub use dom::Dom;
29pub use spec::Spec;
30
31use anyhow::Result;
32
33pub fn extract(html: &str, spec: &Spec) -> Result<serde_json::Value> {
53 let dom = Dom::parse(html)?;
54 dom.extract(spec)
55}
56
#[cfg(test)]
mod tests {
    use crate::extract;
    use crate::spec::Spec;

    /// Hacker News front-page snapshot shared by most tests below.
    const HTML: &str = include_str!("../examples/hn.html");

    /// Deserializes `spec_json` into a [`Spec`] and runs [`extract`] over `html`.
    ///
    /// Panics (failing the calling test) if the spec does not deserialize or
    /// if extraction returns an error.
    fn run(html: &str, spec_json: &str) -> serde_json::Value {
        let spec: Spec = serde_json::from_str(spec_json).expect("spec JSON should deserialize");
        extract(html, &spec).expect("extraction should succeed")
    }

    // ---- Selectors and scoping -------------------------------------------

    #[test]
    fn basic_text_extraction() {
        let result = run(
            HTML,
            r##"{
                "$": "title",
                "title": "$"
            }"##,
        );
        assert_eq!(result["title"], "Hacker News");
    }

    #[test]
    fn attribute_extraction() {
        let result = run(
            HTML,
            r##"{
                "rss_link": "link[rel=alternate] | attr:href"
            }"##,
        );
        assert_eq!(result["rss_link"], "rss");
    }

    #[test]
    fn scoping_with_dollar() {
        let result = run(
            HTML,
            r##"{
                "$": ".pagetop",
                "first_link": "a"
            }"##,
        );
        assert_eq!(result["first_link"], "Hacker News");
    }

    #[test]
    fn nested_scoping() {
        let result = run(
            HTML,
            r##"{
                "$": "head",
                "head_element": {
                    "$": "link",
                    "href": "$ | attr:href",
                    "rel": "$ | attr:rel"
                }
            }"##,
        );
        assert_eq!(
            result["head_element"]["href"],
            "news.css?fFlkMoHAedK8lfBWEYBd"
        );
        assert_eq!(result["head_element"]["rel"], "stylesheet");
    }

    // ---- Collections -----------------------------------------------------

    #[test]
    fn collection_extraction() {
        let result = run(
            HTML,
            r##"{
                "ranks": [{
                    "$": ".rank",
                    "value": "$"
                }]
            }"##,
        );
        let arr = result["ranks"].as_array().unwrap();
        assert!(arr.len() >= 3);
        assert_eq!(arr[0]["value"], "1.");
        assert_eq!(arr[1]["value"], "2.");
        assert_eq!(arr[2]["value"], "3.");
    }

    #[test]
    fn collection_with_nested_properties() {
        let result = run(
            HTML,
            r##"{
                "items": [{
                    "$": "tr.athing",
                    "id": "$ | attr:id",
                    "title": ".titleline a"
                }]
            }"##,
        );
        let arr = result["items"].as_array().unwrap();
        assert!(arr.len() >= 2);
        assert_eq!(arr[0]["id"], "46446815");
        assert_eq!(arr[0]["title"], "I canceled my book deal");
    }

    // ---- Literals --------------------------------------------------------

    #[test]
    fn literal_values() {
        let result = run(
            HTML,
            r##"{
                "source": "'html2json'",
                "version": 1.5,
                "active": true,
                "data": null
            }"##,
        );
        assert_eq!(result["source"], "html2json");
        assert_eq!(result["version"], 1.5);
        assert_eq!(result["active"], true);
        assert!(result["data"].is_null());
    }

    // ---- Pipes -----------------------------------------------------------

    #[test]
    fn trim_pipe() {
        let result = run(
            HTML,
            r##"{
                "title": "title | trim"
            }"##,
        );
        assert_eq!(result["title"], "Hacker News");
    }

    #[test]
    fn lowercase_pipe() {
        let result = run(
            HTML,
            r##"{
                "title_lower": "title | lower"
            }"##,
        );
        assert_eq!(result["title_lower"], "hacker news");
    }

    #[test]
    fn uppercase_pipe() {
        let result = run(
            HTML,
            r##"{
                "title_upper": "title | upper"
            }"##,
        );
        assert_eq!(result["title_upper"], "HACKER NEWS");
    }

    #[test]
    fn substring_pipe() {
        let result = run(
            HTML,
            r##"{
                "partial": "title | substr:0:6"
            }"##,
        );
        assert_eq!(result["partial"], "Hacker");
    }

    #[test]
    fn parse_as_number_pipe() {
        let result = run(
            HTML,
            r##"{
                "$": "#hnmain",
                "table_width": "$ | attr:width | regex:(\\d+) | parseAs:int"
            }"##,
        );
        assert_eq!(result["table_width"], 85);
    }

    #[test]
    fn regex_pipe() {
        let result = run(
            HTML,
            r##"{
                "points": ".score | regex:(\\d+)\\s*points"
            }"##,
        );
        assert_eq!(result["points"], "156");
    }

    // ---- Missing matches -------------------------------------------------

    #[test]
    fn no_match_returns_null() {
        let result = run(
            HTML,
            r##"{
                "missing": ".nonexistent-element",
                "present": "title"
            }"##,
        );
        assert!(result["missing"].is_null());
        assert_eq!(result["present"], "Hacker News");
    }

    #[test]
    fn empty_collection_returns_empty_array() {
        let result = run(
            HTML,
            r##"{
                "items": [{
                    "$": ".nonexistent",
                    "value": "$"
                }]
            }"##,
        );
        let arr = result["items"].as_array().unwrap();
        assert_eq!(arr.len(), 0);
    }

    #[test]
    fn multiple_attributes() {
        let result = run(
            HTML,
            r##"{
                "lang": "html | attr:lang",
                "page_title": "title"
            }"##,
        );
        assert_eq!(result["lang"], "en");
        assert_eq!(result["page_title"], "Hacker News");
    }

    #[test]
    fn complex_nested_structure() {
        let result = run(
            HTML,
            r##"{
                "$": "#hnmain",
                "submissions": [{
                    "$": "tr.athing",
                    "id": "$ | attr:id",
                    "title": ".titleline a"
                }]
            }"##,
        );
        let items = result["submissions"].as_array().unwrap();
        assert!(!items.is_empty());
        assert_eq!(items[0]["id"], "46446815");
        assert_eq!(items[0]["title"], "I canceled my book deal");
    }

    #[test]
    fn self_selector_in_collection() {
        let result = run(
            HTML,
            r##"{
                "titles": [{
                    "$": ".titleline a",
                    "text": "$"
                }]
            }"##,
        );
        let arr = result["titles"].as_array().unwrap();
        assert!(arr.len() >= 2);
        assert_eq!(arr[0]["text"], "I canceled my book deal");
    }

    #[test]
    fn next_sibling_selector() {
        let result = run(
            HTML,
            r##"{
                "$": "#hnmain",
                "items": [{
                    "$": "tr.athing",
                    "title": ".titleline a",
                    "score": "+ .subtext .score"
                }]
            }"##,
        );
        let items = result["items"].as_array().unwrap();
        assert!(!items.is_empty());
        assert_eq!(items[0]["title"], "I canceled my book deal");
        assert_eq!(items[0]["score"], "156 points");
    }

    // ---- RSS / full-fixture comparisons ----------------------------------

    #[test]
    fn void_pipe() {
        let rss_xml = include_str!("../examples/rss.xml");
        let result = run(
            rss_xml,
            r##"{
                "$": "channel",
                "link_trimmed": "link | void | trim",
                "link_lower": "link | void | lower"
            }"##,
        );
        assert_eq!(result["link_trimmed"], "https://example.com");
        assert_eq!(result["link_lower"], "https://example.com");
    }

    #[test]
    fn rss_feed_extraction() {
        let rss_xml = include_str!("../examples/rss.xml");
        let spec_json = include_str!("../examples/rss.json");
        let expected_json = include_str!("../examples/rss.expected.json");

        let expected: serde_json::Value = serde_json::from_str(expected_json).unwrap();
        let result = run(rss_xml, spec_json);

        similar_asserts::assert_serde_eq!(expected, result);
    }

    #[test]
    fn hackernews_extraction() {
        let spec_json = include_str!("../examples/hn.json");
        let expected_json = include_str!("../examples/hn.expected.json");

        let expected: serde_json::Value = serde_json::from_str(expected_json).unwrap();
        let result = run(HTML, spec_json);

        similar_asserts::assert_serde_eq!(expected, result);
    }

    // ---- Fallback operator (`||`) ----------------------------------------

    #[test]
    fn fallback_operator_first_selector_matches() {
        let html = r#"<html><body><h1 class="main">First</h1><h1 class="fallback">Second</h1></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1.main || h1.fallback"
            }"##,
        );
        assert_eq!(result["title"], "First");
    }

    #[test]
    fn fallback_operator_second_selector_matches() {
        let html = r#"<html><body><h1 class="fallback">Second</h1></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1.main || h1.fallback"
            }"##,
        );
        assert_eq!(result["title"], "Second");
    }

    #[test]
    fn fallback_operator_all_selectors_fail_returns_null() {
        let html = r#"<html><body><p>Some content</p></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1.main || h1.fallback || h1"
            }"##,
        );
        assert!(result["title"].is_null());
    }

    #[test]
    fn fallback_operator_with_pipes() {
        let html = r#"<html><body><h1 class="main">First</h1></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1.main || h1.fallback | upper"
            }"##,
        );
        // The first alternative matched, so the `upper` pipe attached to the
        // second alternative is expected not to apply.
        assert_eq!(result["title"], "First");
    }

    #[test]
    fn fallback_operator_with_pipes_on_fallback() {
        let html = r#"<html><body><h1 class="fallback">Second</h1></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1.main || h1.fallback | upper"
            }"##,
        );
        assert_eq!(result["title"], "SECOND");
    }

    #[test]
    fn fallback_operator_multiple_options() {
        let html = r#"<html><body><h1 class="third">Third</h1></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1.first || h1.second || h1.third || h1.fourth"
            }"##,
        );
        assert_eq!(result["title"], "Third");
    }

    #[test]
    fn fallback_operator_empty_string_falls_back() {
        let html = r#"<html><body><h1 class="main"></h1><h1 class="fallback">Actual Content</h1></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1.main || h1.fallback"
            }"##,
        );
        assert_eq!(result["title"], "Actual Content");
    }

    #[test]
    fn fallback_operator_whitespace_only_falls_back() {
        let html = r#"<html><body><h1 class="main"> </h1><h1 class="fallback">Actual Content</h1></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1.main || h1.fallback"
            }"##,
        );
        assert_eq!(result["title"], "Actual Content");
    }

    #[test]
    fn fallback_operator_in_collection() {
        let html = r#"
            <html><body>
            <div class="item">
            <h1 class="primary">First Item</h1>
            <h1 class="secondary">First Fallback</h1>
            </div>
            <div class="item">
            <h1 class="secondary">Second Item</h1>
            </div>
            </body></html>
        "#;
        let result = run(
            html,
            r##"{
                "items": [{
                    "$": ".item",
                    "title": "h1.primary || h1.secondary"
                }]
            }"##,
        );
        let arr = result["items"].as_array().unwrap();
        assert_eq!(arr.len(), 2);
        assert_eq!(arr[0]["title"], "First Item");
        assert_eq!(arr[1]["title"], "Second Item");
    }

    // ---- Optional fields (`key?`) ----------------------------------------

    #[test]
    fn optional_field_removed_when_null() {
        let html = r#"<html><body><h1>Title</h1></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1",
                "missing?": ".nonexistent",
                "description": "p"
            }"##,
        );
        assert_eq!(result["title"], "Title");
        assert!(
            result.get("missing").is_none(),
            "Optional null field should be removed"
        );
        assert!(
            result.get("description").is_some(),
            "Non-optional null field should be present as null"
        );
        assert!(
            result["description"].is_null(),
            "Non-optional null field should be present as null"
        );
    }

    #[test]
    fn optional_field_kept_when_has_value() {
        let html = r#"<html><body><h1>Title</h1><p class="desc">Description</p></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1",
                "description?": "p.desc"
            }"##,
        );
        assert_eq!(result["title"], "Title");
        assert_eq!(result["description"], "Description");
    }

    #[test]
    fn optional_nested_object_removed_when_all_null() {
        let html = r#"<html><body><h1>Title</h1></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1",
                "metadata?": {
                    "author": ".author",
                    "date": ".date"
                }
            }"##,
        );
        assert_eq!(result["title"], "Title");
        assert!(
            result.get("metadata").is_none(),
            "Optional object with all null fields should be removed"
        );
    }

    #[test]
    fn optional_nested_object_kept_when_has_value() {
        let html = r#"<html><body><h1>Title</h1><span class="author">John</span></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1",
                "metadata?": {
                    "author": ".author",
                    "date": ".date"
                }
            }"##,
        );
        assert_eq!(result["title"], "Title");
        assert_eq!(result["metadata"]["author"], "John");
        assert!(
            result["metadata"].get("date").is_none(),
            "Nested null fields should be removed"
        );
    }

    #[test]
    fn non_optional_nested_object_kept_with_nulls() {
        let html = r#"<html><body><h1>Title</h1></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1",
                "metadata": {
                    "author": ".author",
                    "date": ".date"
                }
            }"##,
        );
        assert_eq!(result["title"], "Title");
        // NOTE(review): the test name says the object is "kept with nulls",
        // but these assertions only check that `author` and `date` are
        // absent. They would also pass if `metadata` itself were missing,
        // because indexing a missing key yields `Null` and `Null.get(..)` is
        // `None`. Confirm the intended contract and tighten accordingly.
        assert!(
            result["metadata"].get("author").is_none(),
            "Nested null fields should be removed recursively"
        );
        assert!(
            result["metadata"].get("date").is_none(),
            "Nested null fields should be removed recursively"
        );
    }

    #[test]
    fn optional_array_removed_when_empty() {
        let html = r#"<html><body><h1>Title</h1></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1",
                "items?": [{
                    "$": ".item",
                    "value": "$"
                }]
            }"##,
        );
        assert_eq!(result["title"], "Title");
        assert!(
            result.get("items").is_none(),
            "Optional empty array should be removed"
        );
    }

    #[test]
    fn optional_array_kept_when_has_items() {
        let html = r#"<html><body><h1>Title</h1><div class="item">Item 1</div></body></html>"#;
        let result = run(
            html,
            r##"{
                "title": "h1",
                "items?": [{
                    "$": ".item",
                    "value": "$"
                }]
            }"##,
        );
        assert_eq!(result["title"], "Title");
        assert_eq!(result["items"].as_array().unwrap().len(), 1);
        assert_eq!(result["items"][0]["value"], "Item 1");
    }

    #[test]
    fn recursive_null_filtering_in_nested_objects() {
        let html = r#"<html><body></body></html>"#;
        let result = run(
            html,
            r##"{
                "data?": {
                    "level1": {
                        "level2": {
                            "value": ".missing"
                        }
                    }
                }
            }"##,
        );
        assert!(
            result.get("data").is_none(),
            "Optional nested object should be removed when all nested values are null"
        );
    }
}
700
#[cfg(feature = "wasm")]
/// WebAssembly bindings, compiled only when the `wasm` feature is enabled.
pub mod wasm {
    use wasm_bindgen::prelude::*;

    /// JavaScript-facing wrapper around [`crate::extract`], exported as `extract`.
    ///
    /// `spec_json` is deserialized into a [`crate::spec::Spec`], the extraction
    /// is run over `html`, and the result is returned as pretty-printed JSON
    /// text.
    ///
    /// # Errors
    ///
    /// Returns a string `JsValue` prefixed with:
    /// - `"Invalid spec JSON: "` when `spec_json` fails to deserialize,
    /// - `"Extraction failed: "` when [`crate::extract`] returns an error,
    /// - `"JSON serialization failed: "` when the result cannot be serialized.
    #[wasm_bindgen(js_name = extract)]
    pub fn extract(html: &str, spec_json: &str) -> Result<String, JsValue> {
        use crate::spec::Spec;

        let spec: Spec = serde_json::from_str(spec_json)
            .map_err(|e| JsValue::from_str(&format!("Invalid spec JSON: {}", e)))?;

        // Call the crate-level function by path so it is not confused with
        // this binding of the same name. `{:#}` makes anyhow print the whole
        // cause chain; plain `{}` would show only the outermost message.
        let result = crate::extract(html, &spec)
            .map_err(|e| JsValue::from_str(&format!("Extraction failed: {:#}", e)))?;

        serde_json::to_string_pretty(&result)
            .map_err(|e| JsValue::from_str(&format!("JSON serialization failed: {}", e)))
    }
}