table_extract/
lib.rs

1// Copyright 2019 Mitchell Kember. Subject to the MIT License.
2
3//! Utility for extracting data from HTML tables.
4//!
5//! This library allows you to parse tables from HTML documents and iterate over
6//! their rows. There are three entry points:
7//!
8//! - [`Table::find_first`] finds the first table.
9//! - [`Table::find_by_id`] finds a table by its HTML id.
10//! - [`Table::find_by_headers`] finds a table that has certain headers.
11//!
12//! Each of these returns an `Option<`[`Table`]`>`, since there might not be any
13//! matching table in the HTML. Once you have a table, you can iterate over it
14//! and access the contents of each [`Row`].
15//!
16//! # Examples
17//!
18//! Here is a simple example that uses [`Table::find_first`] to print the cells
19//! in each row of a table:
20//!
21//! ```
22//! let html = r#"
23//!     <table>
24//!         <tr><th>Name</th><th>Age</th></tr>
25//!         <tr><td>John</td><td>20</td></tr>
26//!     </table>
27//! "#;
28//! let table = table_extract::Table::find_first(html).unwrap();
29//! for row in &table {
30//!     println!(
31//!         "{} is {} years old",
32//!         row.get("Name").unwrap_or("<name missing>"),
33//!         row.get("Age").unwrap_or("<age missing>")
34//!     )
35//! }
36//! ```
37//!
38//! If the document has multiple tables, we can use [`Table::find_by_headers`]
39//! to identify the one we want:
40//!
41//! ```
42//! let html = r#"
43//!     <table></table>
44//!     <table>
45//!         <tr><th>Name</th><th>Age</th></tr>
46//!         <tr><td>John</td><td>20</td></tr>
47//!     </table>
48//! "#;
49//! let table = table_extract::Table::find_by_headers(html, &["Age"]).unwrap();
50//! for row in &table {
51//!     for cell in row {
52//!         println!("Table cell: {}", cell);
53//!     }
54//! }
55//! ```
56//!
57//! [`Table`]: struct.Table.html
58//! [`Row`]: struct.Row.html
59//! [`Table::find_first`]: struct.Table.html#method.find_first
60//! [`Table::find_by_id`]: struct.Table.html#method.find_by_id
61//! [`Table::find_by_headers`]: struct.Table.html#method.find_by_headers
62
63use scraper::element_ref::ElementRef;
64use scraper::{Html, Selector};
65use std::collections::HashMap;
66
/// Lookup table from the text of each `<th>` header cell to its zero-based
/// column position.
///
/// Given this table:
///
/// ```html
/// <table>
///     <tr><th>Name</th><th>Age</th></tr>
///     <tr><td>John</td><td>20</td></tr>
/// </table>
/// ```
///
/// the resulting `Headers` contains two entries: "Name" at position 0 and
/// "Age" at position 1.
pub type Headers = HashMap<String, usize>;
80
81/// A parsed HTML table.
82///
83/// See [the module level documentation](index.html) for more.
84#[derive(Clone, Debug, Eq, PartialEq)]
85pub struct Table {
86    headers: Headers,
87    data: Vec<Vec<String>>,
88}
89
90impl Table {
91    /// Finds the first table in `html`.
92    pub fn find_first(html: &str) -> Option<Table> {
93        let html = Html::parse_fragment(html);
94        html.select(&css("table")).next().map(Table::new)
95    }
96
97    /// Finds the table in `html` with an id of `id`.
98    pub fn find_by_id(html: &str, id: &str) -> Option<Table> {
99        let html = Html::parse_fragment(html);
100        let selector = format!("table#{}", id);
101        Selector::parse(&selector)
102            .ok()
103            .as_ref()
104            .map(|s| html.select(s))
105            .and_then(|mut s| s.next())
106            .map(Table::new)
107    }
108
109    /// Finds the table in `html` whose first row contains all of the headers
110    /// specified in `headers`. The order does not matter.
111    ///
112    /// If `headers` is empty, this is the same as
113    /// [`find_first`](#method.find_first).
114    pub fn find_by_headers<T>(html: &str, headers: &[T]) -> Option<Table>
115    where
116        T: AsRef<str>,
117    {
118        if headers.is_empty() {
119            return Table::find_first(html);
120        }
121
122        let sel_table = css("table");
123        let sel_tr = css("tr");
124        let sel_th = css("th");
125
126        let html = Html::parse_fragment(html);
127        html.select(&sel_table)
128            .find(|table| {
129                table.select(&sel_tr).next().map_or(false, |tr| {
130                    let cells = select_cells(tr, &sel_th);
131                    headers.iter().all(|h| contains_str(&cells, h.as_ref()))
132                })
133            })
134            .map(Table::new)
135    }
136
137    /// Returns the headers of the table.
138    ///
139    /// This will be empty if the table had no `<th>` tags in its first row. See
140    /// [`Headers`](type.Headers.html) for more.
141    pub fn headers(&self) -> &Headers {
142        &self.headers
143    }
144
145    /// Returns an iterator over the [`Row`](struct.Row.html)s of the table.
146    ///
147    /// Only `<td>` cells are considered when generating rows. If the first row
148    /// of the table is a header row, meaning it contains at least one `<th>`
149    /// cell, the iterator will start on the second row. Use
150    /// [`headers`](#method.headers) to access the header row in that case.
151    pub fn iter(&self) -> Iter {
152        Iter {
153            headers: &self.headers,
154            iter: self.data.iter(),
155        }
156    }
157
158    fn new(element: ElementRef) -> Table {
159        let sel_tr = css("tr");
160        let sel_th = css("th");
161        let sel_td = css("td");
162
163        let mut headers = HashMap::new();
164        let mut rows = element.select(&sel_tr).peekable();
165        if let Some(tr) = rows.peek() {
166            for (i, th) in tr.select(&sel_th).enumerate() {
167                headers.insert(cell_content(th), i);
168            }
169        }
170        if !headers.is_empty() {
171            rows.next();
172        }
173        let data = rows.map(|tr| select_cells(tr, &sel_td)).collect();
174
175        Table { headers, data }
176    }
177}
178
179impl<'a> IntoIterator for &'a Table {
180    type Item = Row<'a>;
181    type IntoIter = Iter<'a>;
182
183    fn into_iter(self) -> Self::IntoIter {
184        self.iter()
185    }
186}
187
188/// An iterator over the rows in a [`Table`](struct.Table.html).
189pub struct Iter<'a> {
190    headers: &'a Headers,
191    iter: std::slice::Iter<'a, Vec<String>>,
192}
193
194impl<'a> Iterator for Iter<'a> {
195    type Item = Row<'a>;
196
197    fn next(&mut self) -> Option<Self::Item> {
198        let headers = self.headers;
199        self.iter.next().map(|cells| Row { headers, cells })
200    }
201}
202
203/// A row in a [`Table`](struct.Table.html).
204///
205/// A row consists of a number of data cells stored as strings. If the row
206/// contains the same number of cells as the table's header row, its cells can
207/// be safely accessed by header names using [`get`](#method.get). Otherwise,
208/// the data should be accessed via [`as_slice`](#method.as_slice) or by
209/// iterating over the row.
210///
211/// This struct can be thought of as a lightweight reference into a table. As
212/// such, it implements the `Copy` trait.
213#[derive(Clone, Copy, Debug, Eq, PartialEq)]
214pub struct Row<'a> {
215    headers: &'a Headers,
216    cells: &'a [String],
217}
218
219impl<'a> Row<'a> {
220    /// Returns the number of cells in the row.
221    pub fn len(&self) -> usize {
222        self.cells.len()
223    }
224
225    /// Returns `true` if the row contains no cells.
226    pub fn is_empty(&self) -> bool {
227        self.cells.is_empty()
228    }
229
230    /// Returns the cell underneath `header`.
231    ///
232    /// Returns `None` if there is no such header, or if there is no cell at
233    /// that position in the row.
234    pub fn get(&self, header: &str) -> Option<&'a str> {
235        self.headers
236            .get(header)
237            .and_then(|&i| self.cells.get(i).map(String::as_str))
238    }
239
240    /// Returns a slice containing all the cells.
241    pub fn as_slice(&self) -> &'a [String] {
242        self.cells
243    }
244
245    /// Returns an iterator over the cells of the row.
246    pub fn iter(&self) -> std::slice::Iter<String> {
247        self.cells.iter()
248    }
249}
250
251impl<'a> IntoIterator for Row<'a> {
252    type Item = &'a String;
253    type IntoIter = std::slice::Iter<'a, String>;
254
255    fn into_iter(self) -> Self::IntoIter {
256        self.cells.iter()
257    }
258}
259
260fn css(selector: &'static str) -> Selector {
261    Selector::parse(selector).unwrap()
262}
263
264fn select_cells(element: ElementRef, selector: &Selector) -> Vec<String> {
265    element.select(selector).map(cell_content).collect()
266}
267
268fn cell_content(element: ElementRef) -> String {
269    element.inner_html().trim().to_string()
270}
271
/// Returns `true` if any string in `slice` is exactly equal to `item`.
fn contains_str(slice: &[String], item: &str) -> bool {
    for candidate in slice {
        if candidate.as_str() == item {
            return true;
        }
    }
    false
}
275
#[cfg(test)]
mod tests {
    use super::*;

    // HTML fixtures. The names describe the cell kinds of each row in order:
    // `TH` is a row of `<th>` cells and `TD` is a row of `<td>` cells.

    /// A table with no rows at all.
    const TABLE_EMPTY: &str = r#"
<table></table>
"#;

    /// A header row only.
    const TABLE_TH: &str = r#"
<table>
    <tr><th>Name</th><th>Age</th></tr>
</table>
"#;

    /// A single data row and no header row.
    const TABLE_TD: &str = r#"
<table>
    <tr><td>Name</td><td>Age</td></tr>
</table>
"#;

    /// A header row followed by one data row.
    const TABLE_TH_TD: &str = r#"
<table>
    <tr><th>Name</th><th>Age</th></tr>
    <tr><td>John</td><td>20</td></tr>
</table>
"#;

    /// Two data rows and no header row.
    const TABLE_TD_TD: &str = r#"
<table>
    <tr><td>Name</td><td>Age</td></tr>
    <tr><td>John</td><td>20</td></tr>
</table>
"#;

    /// Two rows that both consist of `<th>` cells.
    const TABLE_TH_TH: &str = r#"
<table>
    <tr><th>Name</th><th>Age</th></tr>
    <tr><th>John</th><th>20</th></tr>
</table>
"#;

    /// A header row followed by rows that are shorter than, equal to, empty,
    /// and longer than the header row.
    const TABLE_COMPLEX: &str = r#"
<table>
    <tr><th>Name</th><th>Age</th><th>Extra</th></tr>
    <tr><td>John</td><td>20</td></tr>
    <tr><td>May</td><td>30</td><td>foo</td></tr>
    <tr></tr>
    <tr><td>a</td><td>b</td><td>c</td><td>d</td></tr>
</table>
"#;

    /// A complete document that contains no table.
    const HTML_NO_TABLE: &str = r#"
<!doctype HTML>
<html>
    <head><title>foo</title></head>
    <body><p>Hi.</p></body>
</html>
"#;

    /// A complete document with two tables distinguished by id and headers.
    const HTML_TWO_TABLES: &str = r#"
<!doctype HTML>
<html>
    <head><title>foo</title></head>
    <body>
        <table id="first">
            <tr><th>Name</th><th>Age</th></tr>
            <tr><td>John</td><td>20</td></tr>
        </table>
        <table id="second">
            <tr><th>Name</th><th>Weight</th></tr>
            <tr><td>John</td><td>150</td></tr>
        </table>
    </body>
</html>
"#;

    /// A fragment with closing tags whose opening tags are missing.
    const HTML_TABLE_FRAGMENT: &str = r#"
        <table id="first">
            <tr><th>Name</th><th>Age</th></tr>
            <tr><td>John</td><td>20</td></tr>
        </table>
    </body>
</html>
"#;

    #[test]
    fn test_find_first_none() {
        assert_eq!(None, Table::find_first(""));
        assert_eq!(None, Table::find_first("foo"));
        assert_eq!(None, Table::find_first(HTML_NO_TABLE));
    }

    #[test]
    fn test_find_first_empty() {
        let empty = Table {
            headers: HashMap::new(),
            data: Vec::new(),
        };
        assert_eq!(Some(empty), Table::find_first(TABLE_EMPTY));
    }

    #[test]
    fn test_find_first_some() {
        assert!(Table::find_first(TABLE_TH).is_some());
        assert!(Table::find_first(TABLE_TD).is_some());
    }

    #[test]
    fn test_find_by_id_none() {
        assert_eq!(None, Table::find_by_id("", ""));
        assert_eq!(None, Table::find_by_id("foo", "id"));
        assert_eq!(None, Table::find_by_id(HTML_NO_TABLE, "id"));

        assert_eq!(None, Table::find_by_id(TABLE_EMPTY, "id"));
        assert_eq!(None, Table::find_by_id(TABLE_TH, "id"));
        assert_eq!(None, Table::find_by_id(TABLE_TH, ""));
        assert_eq!(None, Table::find_by_id(HTML_TWO_TABLES, "id"));
    }

    #[test]
    fn test_find_by_id_some() {
        assert!(Table::find_by_id(HTML_TWO_TABLES, "first").is_some());
        assert!(Table::find_by_id(HTML_TWO_TABLES, "second").is_some());
    }

    #[test]
    fn test_find_by_headers_empty() {
        let headers: [&str; 0] = [];

        assert_eq!(None, Table::find_by_headers("", &headers));
        assert_eq!(None, Table::find_by_headers("foo", &headers));
        assert_eq!(None, Table::find_by_headers(HTML_NO_TABLE, &headers));

        assert!(Table::find_by_headers(TABLE_EMPTY, &headers).is_some());
        assert!(Table::find_by_headers(HTML_TWO_TABLES, &headers).is_some());
    }

    #[test]
    fn test_find_by_headers_none() {
        let headers = ["Name", "Age"];
        let bad_headers = ["Name", "BAD"];

        assert_eq!(None, Table::find_by_headers("", &headers));
        assert_eq!(None, Table::find_by_headers("foo", &headers));
        assert_eq!(None, Table::find_by_headers(HTML_NO_TABLE, &headers));

        assert_eq!(None, Table::find_by_headers(TABLE_EMPTY, &bad_headers));
        assert_eq!(None, Table::find_by_headers(TABLE_TH, &bad_headers));

        assert_eq!(None, Table::find_by_headers(TABLE_TD, &headers));
        assert_eq!(None, Table::find_by_headers(TABLE_TD, &bad_headers));
    }

    #[test]
    fn test_find_by_headers_some() {
        let headers: [&str; 0] = [];
        assert!(Table::find_by_headers(TABLE_TH, &headers).is_some());
        assert!(Table::find_by_headers(TABLE_TH_TD, &headers).is_some());
        assert!(Table::find_by_headers(HTML_TWO_TABLES, &headers).is_some());

        let headers = ["Name"];
        assert!(Table::find_by_headers(TABLE_TH, &headers).is_some());
        assert!(Table::find_by_headers(TABLE_TH_TD, &headers).is_some());
        assert!(Table::find_by_headers(HTML_TWO_TABLES, &headers).is_some());

        // Header order must not matter.
        let headers = ["Age", "Name"];
        assert!(Table::find_by_headers(TABLE_TH, &headers).is_some());
        assert!(Table::find_by_headers(TABLE_TH_TD, &headers).is_some());
        assert!(Table::find_by_headers(HTML_TWO_TABLES, &headers).is_some());
    }

    #[test]
    fn test_find_first_incomplete_fragment() {
        assert!(Table::find_first(HTML_TABLE_FRAGMENT).is_some());
    }

    #[test]
    fn test_headers_empty() {
        let empty = HashMap::new();
        assert_eq!(&empty, Table::find_first(TABLE_TD).unwrap().headers());
        assert_eq!(&empty, Table::find_first(TABLE_TD_TD).unwrap().headers());
    }

    #[test]
    fn test_headers_nonempty() {
        let mut headers = HashMap::new();
        headers.insert("Name".to_string(), 0);
        headers.insert("Age".to_string(), 1);

        assert_eq!(&headers, Table::find_first(TABLE_TH).unwrap().headers());
        assert_eq!(&headers, Table::find_first(TABLE_TH_TD).unwrap().headers());
        assert_eq!(&headers, Table::find_first(TABLE_TH_TH).unwrap().headers());

        headers.insert("Extra".to_string(), 2);
        assert_eq!(
            &headers,
            Table::find_first(TABLE_COMPLEX).unwrap().headers()
        );
    }

    #[test]
    fn test_iter_empty() {
        assert_eq!(0, Table::find_first(TABLE_EMPTY).unwrap().iter().count());
        assert_eq!(0, Table::find_first(TABLE_TH).unwrap().iter().count());
    }

    #[test]
    fn test_iter_nonempty() {
        assert_eq!(1, Table::find_first(TABLE_TD).unwrap().iter().count());
        assert_eq!(1, Table::find_first(TABLE_TH_TD).unwrap().iter().count());
        assert_eq!(2, Table::find_first(TABLE_TD_TD).unwrap().iter().count());
        assert_eq!(1, Table::find_first(TABLE_TH_TH).unwrap().iter().count());
        assert_eq!(4, Table::find_first(TABLE_COMPLEX).unwrap().iter().count());
    }

    #[test]
    fn test_row_is_empty() {
        let table = Table::find_first(TABLE_TD).unwrap();
        assert_eq!(
            vec![false],
            table.iter().map(|r| r.is_empty()).collect::<Vec<_>>()
        );

        let table = Table::find_first(TABLE_COMPLEX).unwrap();
        assert_eq!(
            vec![false, false, true, false],
            table.iter().map(|r| r.is_empty()).collect::<Vec<_>>()
        );
    }

    #[test]
    fn test_row_len() {
        let table = Table::find_first(TABLE_TD).unwrap();
        assert_eq!(vec![2], table.iter().map(|r| r.len()).collect::<Vec<_>>());

        let table = Table::find_first(TABLE_COMPLEX).unwrap();
        assert_eq!(
            vec![2, 3, 0, 4],
            table.iter().map(|r| r.len()).collect::<Vec<_>>()
        );
    }

    #[test]
    fn test_row_get_without_headers() {
        let table = Table::find_first(TABLE_TD).unwrap();
        let mut iter = table.iter();
        let row = iter.next().unwrap();

        assert_eq!(None, row.get(""));
        assert_eq!(None, row.get("foo"));
        assert_eq!(None, row.get("Name"));
        assert_eq!(None, row.get("Age"));

        assert_eq!(None, iter.next());
    }

    #[test]
    fn test_row_get_with_headers() {
        let table = Table::find_first(TABLE_TH_TD).unwrap();
        let mut iter = table.iter();
        let row = iter.next().unwrap();

        assert_eq!(None, row.get(""));
        assert_eq!(None, row.get("foo"));
        assert_eq!(Some("John"), row.get("Name"));
        assert_eq!(Some("20"), row.get("Age"));

        assert_eq!(None, iter.next());
    }

    #[test]
    fn test_row_get_complex() {
        let table = Table::find_first(TABLE_COMPLEX).unwrap();
        let mut iter = table.iter();

        // Shorter than the header row: the missing column yields `None`.
        let row = iter.next().unwrap();
        assert_eq!(Some("John"), row.get("Name"));
        assert_eq!(Some("20"), row.get("Age"));
        assert_eq!(None, row.get("Extra"));

        let row = iter.next().unwrap();
        assert_eq!(Some("May"), row.get("Name"));
        assert_eq!(Some("30"), row.get("Age"));
        assert_eq!(Some("foo"), row.get("Extra"));

        // Empty row.
        let row = iter.next().unwrap();
        assert_eq!(None, row.get("Name"));
        assert_eq!(None, row.get("Age"));
        assert_eq!(None, row.get("Extra"));

        // Longer than the header row: the extra cell has no header name.
        let row = iter.next().unwrap();
        assert_eq!(Some("a"), row.get("Name"));
        assert_eq!(Some("b"), row.get("Age"));
        assert_eq!(Some("c"), row.get("Extra"));

        assert_eq!(None, iter.next());
    }

    #[test]
    fn test_row_as_slice_without_headers() {
        let table = Table::find_first(TABLE_TD).unwrap();
        let mut iter = table.iter();

        assert_eq!(&["Name", "Age"], iter.next().unwrap().as_slice());
        assert_eq!(None, iter.next());
    }

    #[test]
    fn test_row_as_slice_with_headers() {
        let table = Table::find_first(TABLE_TH_TD).unwrap();
        let mut iter = table.iter();

        assert_eq!(&["John", "20"], iter.next().unwrap().as_slice());
        assert_eq!(None, iter.next());
    }

    #[test]
    fn test_row_as_slice_complex() {
        let table = Table::find_first(TABLE_COMPLEX).unwrap();
        let mut iter = table.iter();
        let empty: [&str; 0] = [];

        assert_eq!(&["John", "20"], iter.next().unwrap().as_slice());
        assert_eq!(&["May", "30", "foo"], iter.next().unwrap().as_slice());
        assert_eq!(&empty, iter.next().unwrap().as_slice());
        assert_eq!(&["a", "b", "c", "d"], iter.next().unwrap().as_slice());
        assert_eq!(None, iter.next());
    }

    #[test]
    fn test_row_iter_simple() {
        let table = Table::find_first(TABLE_TD).unwrap();
        let row = table.iter().next().unwrap();
        let mut iter = row.iter();

        assert_eq!(Some("Name"), iter.next().map(String::as_str));
        assert_eq!(Some("Age"), iter.next().map(String::as_str));
        assert_eq!(None, iter.next());
    }

    #[test]
    fn test_row_iter_complex() {
        let table = Table::find_first(TABLE_COMPLEX).unwrap();
        let mut table_iter = table.iter();

        let row = table_iter.next().unwrap();
        let mut iter = row.iter();
        assert_eq!(Some("John"), iter.next().map(String::as_str));
        assert_eq!(Some("20"), iter.next().map(String::as_str));
        assert_eq!(None, iter.next());

        let row = table_iter.next().unwrap();
        let mut iter = row.iter();
        assert_eq!(Some("May"), iter.next().map(String::as_str));
        assert_eq!(Some("30"), iter.next().map(String::as_str));
        assert_eq!(Some("foo"), iter.next().map(String::as_str));
        assert_eq!(None, iter.next());

        let row = table_iter.next().unwrap();
        let mut iter = row.iter();
        assert_eq!(None, iter.next());

        let row = table_iter.next().unwrap();
        let mut iter = row.iter();
        assert_eq!(Some("a"), iter.next().map(String::as_str));
        assert_eq!(Some("b"), iter.next().map(String::as_str));
        assert_eq!(Some("c"), iter.next().map(String::as_str));
        assert_eq!(Some("d"), iter.next().map(String::as_str));
        assert_eq!(None, iter.next());
    }
}
646}