1pub mod dom;
25pub mod extractor;
26pub mod pipe;
27pub mod spec;
28
29pub use extractor::Extractor;
30pub use spec::Spec;
31
32use anyhow::Result;
33
34pub fn extract(html: &str, spec: &Spec) -> Result<serde_json::Value> {
54 let extractor = Extractor::new(html)?;
55 extractor.extract(spec)
56}
57
#[cfg(test)]
mod tests {
    use crate::extract;
    use crate::spec::Spec;
    use serde_json::Value;

    /// Hacker News HTML fixture shared by most tests.
    const HTML: &str = include_str!("../examples/hn.html");
    /// RSS XML fixture used by the `void` pipe and RSS end-to-end tests.
    const RSS_XML: &str = include_str!("../examples/rss.xml");

    /// Deserializes `spec_json` into a [`Spec`] and runs it against `document`.
    ///
    /// Panics (reporting the calling test's location) if the spec does not
    /// deserialize or if extraction returns an error.
    #[track_caller]
    fn run_spec(document: &str, spec_json: &str) -> Value {
        let spec: Spec =
            serde_json::from_str(spec_json).expect("spec JSON should deserialize into `Spec`");
        extract(document, &spec).expect("extraction should succeed")
    }

    /// Runs `spec_json` against the Hacker News fixture ([`HTML`]).
    #[track_caller]
    fn run_hn(spec_json: &str) -> Value {
        run_spec(HTML, spec_json)
    }

    /// Expects a bare `$` value to yield the text of the element chosen by the `$` key.
    #[test]
    fn basic_text_extraction() {
        let result = run_hn(
            r##"{
                "$": "title",
                "title": "$"
            }"##,
        );
        assert_eq!(result["title"], "Hacker News");
    }

    /// Expects `attr:href` to yield the attribute value of the matched element.
    #[test]
    fn attribute_extraction() {
        let result = run_hn(
            r##"{
                "rss_link": "link[rel=alternate] | attr:href"
            }"##,
        );
        assert_eq!(result["rss_link"], "rss");
    }

    /// Expects a plain selector next to a `$` key to resolve to the link text inside `.pagetop`.
    #[test]
    fn scoping_with_dollar() {
        let result = run_hn(
            r##"{
                "$": ".pagetop",
                "first_link": "a"
            }"##,
        );
        assert_eq!(result["first_link"], "Hacker News");
    }

    /// Expects a nested object with its own `$` key to read attributes from that element.
    #[test]
    fn nested_scoping() {
        let result = run_hn(
            r##"{
                "$": "head",
                "head_element": {
                    "$": "link",
                    "href": "$ | attr:href",
                    "rel": "$ | attr:rel"
                }
            }"##,
        );
        assert_eq!(
            result["head_element"]["href"],
            "news.css?fFlkMoHAedK8lfBWEYBd"
        );
        assert_eq!(result["head_element"]["rel"], "stylesheet");
    }

    /// Expects an array-wrapped spec to produce one object per `.rank` match, in order.
    #[test]
    fn collection_extraction() {
        let result = run_hn(
            r##"{
                "ranks": [{
                    "$": ".rank",
                    "value": "$"
                }]
            }"##,
        );
        let ranks = result["ranks"]
            .as_array()
            .expect("`ranks` should be an array");
        assert!(ranks.len() >= 3);
        assert_eq!(ranks[0]["value"], "1.");
        assert_eq!(ranks[1]["value"], "2.");
        assert_eq!(ranks[2]["value"], "3.");
    }

    /// Expects each collection item to expose both its own attribute and a descendant's text.
    #[test]
    fn collection_with_nested_properties() {
        let result = run_hn(
            r##"{
                "items": [{
                    "$": "tr.athing",
                    "id": "$ | attr:id",
                    "title": ".titleline a"
                }]
            }"##,
        );
        let items = result["items"]
            .as_array()
            .expect("`items` should be an array");
        assert!(items.len() >= 2);
        assert_eq!(items[0]["id"], "46446815");
        assert_eq!(items[0]["title"], "I canceled my book deal");
    }

    /// Expects quoted strings, numbers, booleans and `null` in a spec to come back as literals.
    #[test]
    fn literal_values() {
        let result = run_hn(
            r##"{
                "source": "'html2json'",
                "version": 1.5,
                "active": true,
                "data": null
            }"##,
        );
        assert_eq!(result["source"], "html2json");
        assert_eq!(result["version"], 1.5);
        assert_eq!(result["active"], true);
        assert!(result["data"].is_null());
    }

    /// Expects `trim` applied to the title text to yield `"Hacker News"`.
    #[test]
    fn trim_pipe() {
        let result = run_hn(
            r##"{
                "title": "title | trim"
            }"##,
        );
        assert_eq!(result["title"], "Hacker News");
    }

    /// Expects `lower` to lowercase the extracted title.
    #[test]
    fn lowercase_pipe() {
        let result = run_hn(
            r##"{
                "title_lower": "title | lower"
            }"##,
        );
        assert_eq!(result["title_lower"], "hacker news");
    }

    /// Expects `upper` to uppercase the extracted title.
    #[test]
    fn uppercase_pipe() {
        let result = run_hn(
            r##"{
                "title_upper": "title | upper"
            }"##,
        );
        assert_eq!(result["title_upper"], "HACKER NEWS");
    }

    /// Expects `substr:0:6` to keep only the leading `"Hacker"` of the title.
    #[test]
    fn substring_pipe() {
        let result = run_hn(
            r##"{
                "partial": "title | substr:0:6"
            }"##,
        );
        assert_eq!(result["partial"], "Hacker");
    }

    /// Expects an `attr` -> `regex` -> `parseAs:int` chain to end in a JSON number.
    #[test]
    fn parse_as_number_pipe() {
        let result = run_hn(
            r##"{
                "$": "#hnmain",
                "table_width": "$ | attr:width | regex:(\\d+) | parseAs:int"
            }"##,
        );
        assert_eq!(result["table_width"], 85);
    }

    /// Expects `regex` to return the captured digits from the score text as a string.
    #[test]
    fn regex_pipe() {
        let result = run_hn(
            r##"{
                "points": ".score | regex:(\\d+)\\s*points"
            }"##,
        );
        assert_eq!(result["points"], "156");
    }

    /// Expects a selector with no match to yield `null` without affecting sibling keys.
    #[test]
    fn no_match_returns_null() {
        let result = run_hn(
            r##"{
                "missing": ".nonexistent-element",
                "present": "title"
            }"##,
        );
        assert!(result["missing"].is_null());
        assert_eq!(result["present"], "Hacker News");
    }

    /// Expects a collection whose `$` selector matches nothing to be an empty array.
    #[test]
    fn empty_collection_returns_empty_array() {
        let result = run_hn(
            r##"{
                "items": [{
                    "$": ".nonexistent",
                    "value": "$"
                }]
            }"##,
        );
        let items = result["items"]
            .as_array()
            .expect("`items` should be an array");
        assert!(items.is_empty(), "expected an empty array, got {:?}", items);
    }

    /// Expects attribute and text extractions to coexist in one spec.
    #[test]
    fn multiple_attributes() {
        let result = run_hn(
            r##"{
                "lang": "html | attr:lang",
                "page_title": "title"
            }"##,
        );
        assert_eq!(result["lang"], "en");
        assert_eq!(result["page_title"], "Hacker News");
    }

    /// Expects a collection nested under a top-level `$` key to still yield the story rows.
    #[test]
    fn complex_nested_structure() {
        let result = run_hn(
            r##"{
                "$": "#hnmain",
                "submissions": [{
                    "$": "tr.athing",
                    "id": "$ | attr:id",
                    "title": ".titleline a"
                }]
            }"##,
        );
        let submissions = result["submissions"]
            .as_array()
            .expect("`submissions` should be an array");
        assert!(!submissions.is_empty());
        assert_eq!(submissions[0]["id"], "46446815");
        assert_eq!(submissions[0]["title"], "I canceled my book deal");
    }

    /// Expects `$` inside a collection item to yield that item's own text.
    #[test]
    fn self_selector_in_collection() {
        let result = run_hn(
            r##"{
                "titles": [{
                    "$": ".titleline a",
                    "text": "$"
                }]
            }"##,
        );
        let titles = result["titles"]
            .as_array()
            .expect("`titles` should be an array");
        assert!(titles.len() >= 2);
        assert_eq!(titles[0]["text"], "I canceled my book deal");
    }

    /// Expects a selector starting with `+` to find the score that accompanies a story row.
    #[test]
    fn next_sibling_selector() {
        let result = run_hn(
            r##"{
                "$": "#hnmain",
                "items": [{
                    "$": "tr.athing",
                    "title": ".titleline a",
                    "score": "+ .subtext .score"
                }]
            }"##,
        );
        let items = result["items"]
            .as_array()
            .expect("`items` should be an array");
        assert!(!items.is_empty());
        assert_eq!(items[0]["title"], "I canceled my book deal");
        assert_eq!(items[0]["score"], "156 points");
    }

    /// Expects `link | void` followed by `trim` / `lower` to yield the channel URL
    /// from the RSS fixture.
    #[test]
    fn void_pipe() {
        let result = run_spec(
            RSS_XML,
            r##"{
                "$": "channel",
                "link_trimmed": "link | void | trim",
                "link_lower": "link | void | lower"
            }"##,
        );
        assert_eq!(result["link_trimmed"], "https://example.com");
        assert_eq!(result["link_lower"], "https://example.com");
    }

    /// End-to-end check: the RSS example spec must reproduce the stored expected output.
    #[test]
    fn rss_feed_extraction() {
        let expected: Value = serde_json::from_str(include_str!("../examples/rss.expected.json"))
            .expect("expected RSS output should be valid JSON");
        let result = run_spec(RSS_XML, include_str!("../examples/rss.json"));

        similar_asserts::assert_serde_eq!(expected, result);
    }

    /// End-to-end check: the Hacker News example spec must reproduce the stored expected output.
    #[test]
    fn hackernews_extraction() {
        let expected: Value = serde_json::from_str(include_str!("../examples/hn.expected.json"))
            .expect("expected Hacker News output should be valid JSON");
        let result = run_hn(include_str!("../examples/hn.json"));

        similar_asserts::assert_serde_eq!(expected, result);
    }
}