use scraper::element_ref::ElementRef;
use scraper::{Html, Selector};
use std::collections::HashMap;
/// Maps the text of a header cell to its zero-based position within the header row.
pub type Headers = HashMap<String, usize>;
/// An HTML table reduced to its header names and the text of its data cells.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Table {
// Header text -> column index; empty when the first row has no `<th>` cells.
headers: Headers,
// One entry per data row, each holding that row's `<td>` contents.
data: Vec<Vec<String>>,
}
impl Table {
    /// Finds the first `<table>` element in `html`.
    ///
    /// Returns `None` when the markup contains no table.
    pub fn find_first(html: &str) -> Option<Table> {
        let html = Html::parse_fragment(html);
        html.select(&css("table")).next().map(Table::new)
    }

    /// Finds the first table whose `id` attribute is exactly `id`.
    ///
    /// The attribute is compared as plain text instead of being spliced into a
    /// CSS selector. Ids that are not valid CSS identifiers (for example ones
    /// starting with a digit) are therefore still found, and selector syntax
    /// inside `id` cannot make the lookup match some other element.
    ///
    /// Returns `None` if `id` is empty or no table carries that id.
    pub fn find_by_id(html: &str, id: &str) -> Option<Table> {
        // An empty id has never identified a table (`table#` is not a valid
        // selector); keep that contract explicit.
        if id.is_empty() {
            return None;
        }
        let sel_table = css("table");
        let html = Html::parse_fragment(html);
        html.select(&sel_table)
            .find(|table| table.value().attr("id") == Some(id))
            .map(Table::new)
    }

    /// Finds the first table whose header row contains every entry of `headers`.
    ///
    /// Only the first `<tr>` of each table is inspected; its `<th>` cells must
    /// include all requested headers, in any order. An empty `headers` slice
    /// behaves like [`Table::find_first`].
    pub fn find_by_headers<T>(html: &str, headers: &[T]) -> Option<Table>
    where
        T: AsRef<str>,
    {
        if headers.is_empty() {
            return Table::find_first(html);
        }
        let sel_table = css("table");
        let sel_tr = css("tr");
        let sel_th = css("th");
        let html = Html::parse_fragment(html);
        html.select(&sel_table)
            .find(|table| {
                // A table without any row cannot match a non-empty header list.
                table.select(&sel_tr).next().map_or(false, |tr| {
                    let cells = select_cells(tr, &sel_th);
                    headers.iter().all(|h| contains_str(&cells, h.as_ref()))
                })
            })
            .map(Table::new)
    }

    /// Returns the map from header text to column index.
    ///
    /// The map is empty when the table has no header row.
    pub fn headers(&self) -> &Headers {
        &self.headers
    }

    /// Returns an iterator over the data rows of the table.
    pub fn iter(&self) -> Iter<'_> {
        Iter {
            headers: &self.headers,
            iter: self.data.iter(),
        }
    }

    /// Builds a `Table` from a `<table>` element.
    ///
    /// If the first `<tr>` contains `<th>` cells they become the headers and
    /// that row is left out of the data; otherwise every row is data. Data
    /// rows collect `<td>` cells only.
    fn new(element: ElementRef) -> Table {
        let sel_tr = css("tr");
        let sel_th = css("th");
        let sel_td = css("td");
        let mut headers = HashMap::new();
        // NOTE(review): `select` presumably yields every descendant `<tr>`, so
        // rows of nested tables would be included as well — confirm.
        let mut rows = element.select(&sel_tr).peekable();
        if let Some(tr) = rows.peek() {
            // A repeated header text keeps the index of its last occurrence.
            for (i, th) in tr.select(&sel_th).enumerate() {
                headers.insert(cell_content(th), i);
            }
        }
        // Consume the first row only when it really was a header row.
        if !headers.is_empty() {
            rows.next();
        }
        let data = rows.map(|tr| select_cells(tr, &sel_td)).collect();
        Table { headers, data }
    }
}
impl<'a> IntoIterator for &'a Table {
type Item = Row<'a>;
type IntoIter = Iter<'a>;
fn into_iter(self) -> Self::IntoIter {
self.iter()
}
}
/// An iterator over the data rows of a table.
///
/// `Clone` and `Debug` are derived so the iterator can be duplicated and
/// inspected like the standard library's slice iterators.
#[derive(Clone, Debug)]
pub struct Iter<'a> {
    // Header map handed to every produced row.
    headers: &'a Headers,
    // Position within the table's row storage.
    iter: std::slice::Iter<'a, Vec<String>>,
}
/// Yields one `Row` per stored data row, in table order.
impl<'a> Iterator for Iter<'a> {
    type Item = Row<'a>;

    /// Returns the next row, paired with the table's header map.
    fn next(&mut self) -> Option<Self::Item> {
        let headers = self.headers;
        self.iter.next().map(|cells| Row { headers, cells })
    }

    /// Reports the exact number of rows left, taken from the underlying slice
    /// iterator, so consumers such as `collect` can size their storage up front.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.iter.size_hint()
    }
}
/// A borrowed view of one data row together with the header map used for lookups by name.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct Row<'a> {
// Header text -> column index, used by `get`.
headers: &'a Headers,
// The cell strings of this row.
cells: &'a [String],
}
impl<'a> Row<'a> {
    /// Returns the number of cells in the row.
    pub fn len(&self) -> usize {
        self.cells.len()
    }

    /// Returns `true` if the row has no cells.
    pub fn is_empty(&self) -> bool {
        self.cells.is_empty()
    }

    /// Returns the cell in the column named `header`.
    ///
    /// Yields `None` when the header is unknown or when the row is too short
    /// to have a cell at that column.
    pub fn get(&self, header: &str) -> Option<&'a str> {
        let &index = self.headers.get(header)?;
        self.cells.get(index).map(String::as_str)
    }

    /// Returns all cells of the row as a slice.
    pub fn as_slice(&self) -> &'a [String] {
        self.cells
    }

    /// Returns an iterator over the cells of the row.
    ///
    /// Like `get` and `as_slice`, the result borrows from the underlying table
    /// data (`'a`), not from this `Row` value, so it may outlive the row.
    pub fn iter(&self) -> std::slice::Iter<'a, String> {
        self.cells.iter()
    }
}
impl<'a> IntoIterator for Row<'a> {
type Item = &'a String;
type IntoIter = std::slice::Iter<'a, String>;
fn into_iter(self) -> Self::IntoIter {
self.cells.iter()
}
}
/// Parses one of this file's hard-coded CSS selectors.
///
/// # Panics
///
/// Panics if `selector` is not valid CSS. Callers pass string literals, so a
/// panic here means a bug in this file; the message names the selector.
fn css(selector: &'static str) -> Selector {
    Selector::parse(selector)
        .unwrap_or_else(|err| panic!("invalid built-in CSS selector {:?}: {:?}", selector, err))
}
/// Collects the trimmed contents of every element under `element` that matches `selector`.
fn select_cells(element: ElementRef, selector: &Selector) -> Vec<String> {
    let mut cells = Vec::new();
    cells.extend(element.select(selector).map(cell_content));
    cells
}
/// Returns the inner HTML of `element` with leading and trailing whitespace removed.
fn cell_content(element: ElementRef) -> String {
    let raw = element.inner_html();
    String::from(raw.trim())
}
/// Reports whether any string in `slice` is exactly equal to `item`.
fn contains_str(slice: &[String], item: &str) -> bool {
    slice.iter().map(String::as_str).any(|cell| cell == item)
}
#[cfg(test)]
mod tests {
    use super::*;

    // HTML fixtures. In the names, TH stands for a row of `<th>` cells and TD
    // for a row of `<td>` cells, listed in document order.
    const TABLE_EMPTY: &str = r#"
<table></table>
"#;
    const TABLE_TH: &str = r#"
<table>
<tr><th>Name</th><th>Age</th></tr>
</table>
"#;
    const TABLE_TD: &str = r#"
<table>
<tr><td>Name</td><td>Age</td></tr>
</table>
"#;
    const TABLE_TH_TD: &str = r#"
<table>
<tr><th>Name</th><th>Age</th></tr>
<tr><td>John</td><td>20</td></tr>
</table>
"#;
    const TABLE_TD_TD: &str = r#"
<table>
<tr><td>Name</td><td>Age</td></tr>
<tr><td>John</td><td>20</td></tr>
</table>
"#;
    const TABLE_TH_TH: &str = r#"
<table>
<tr><th>Name</th><th>Age</th></tr>
<tr><th>John</th><th>20</th></tr>
</table>
"#;
    // Rows that are shorter than, equal to, empty, and longer than the header row.
    const TABLE_COMPLEX: &str = r#"
<table>
<tr><th>Name</th><th>Age</th><th>Extra</th></tr>
<tr><td>John</td><td>20</td></tr>
<tr><td>May</td><td>30</td><td>foo</td></tr>
<tr></tr>
<tr><td>a</td><td>b</td><td>c</td><td>d</td></tr>
</table>
"#;
    const HTML_NO_TABLE: &str = r#"
<!doctype HTML>
<html>
<head><title>foo</title></head>
<body><p>Hi.</p></body>
</html>
"#;
    const HTML_TWO_TABLES: &str = r#"
<!doctype HTML>
<html>
<head><title>foo</title></head>
<body>
<table id="first">
<tr><th>Name</th><th>Age</th></tr>
<tr><td>John</td><td>20</td></tr>
</table>
<table id="second">
<tr><th>Name</th><th>Weight</th></tr>
<tr><td>John</td><td>150</td></tr>
</table>
</body>
</html>
"#;
    // A table followed by closing tags that were never opened.
    const HTML_TABLE_FRAGMENT: &str = r#"
<table id="first">
<tr><th>Name</th><th>Age</th></tr>
<tr><td>John</td><td>20</td></tr>
</table>
</body>
</html>
"#;

    #[test]
    fn test_find_first_none() {
        assert_eq!(None, Table::find_first(""));
        assert_eq!(None, Table::find_first("foo"));
        assert_eq!(None, Table::find_first(HTML_NO_TABLE));
    }

    #[test]
    fn test_find_first_empty() {
        let empty = Table {
            headers: HashMap::new(),
            data: Vec::new(),
        };
        assert_eq!(Some(empty), Table::find_first(TABLE_EMPTY));
    }

    #[test]
    fn test_find_first_some() {
        assert!(Table::find_first(TABLE_TH).is_some());
        assert!(Table::find_first(TABLE_TD).is_some());
    }

    #[test]
    fn test_find_by_id_none() {
        assert_eq!(None, Table::find_by_id("", ""));
        assert_eq!(None, Table::find_by_id("foo", "id"));
        assert_eq!(None, Table::find_by_id(HTML_NO_TABLE, "id"));
        assert_eq!(None, Table::find_by_id(TABLE_EMPTY, "id"));
        assert_eq!(None, Table::find_by_id(TABLE_TH, "id"));
        assert_eq!(None, Table::find_by_id(TABLE_TH, ""));
        assert_eq!(None, Table::find_by_id(HTML_TWO_TABLES, "id"));
    }

    #[test]
    fn test_find_by_id_some() {
        assert!(Table::find_by_id(HTML_TWO_TABLES, "first").is_some());
        assert!(Table::find_by_id(HTML_TWO_TABLES, "second").is_some());
    }

    #[test]
    fn test_find_by_headers_empty() {
        let headers: [&str; 0] = [];
        assert_eq!(None, Table::find_by_headers("", &headers));
        assert_eq!(None, Table::find_by_headers("foo", &headers));
        assert_eq!(None, Table::find_by_headers(HTML_NO_TABLE, &headers));
        assert!(Table::find_by_headers(TABLE_EMPTY, &headers).is_some());
        assert!(Table::find_by_headers(HTML_TWO_TABLES, &headers).is_some());
    }

    #[test]
    fn test_find_by_headers_none() {
        let headers = ["Name", "Age"];
        let bad_headers = ["Name", "BAD"];
        assert_eq!(None, Table::find_by_headers("", &headers));
        assert_eq!(None, Table::find_by_headers("foo", &headers));
        assert_eq!(None, Table::find_by_headers(HTML_NO_TABLE, &headers));
        assert_eq!(None, Table::find_by_headers(TABLE_EMPTY, &bad_headers));
        assert_eq!(None, Table::find_by_headers(TABLE_TH, &bad_headers));
        assert_eq!(None, Table::find_by_headers(TABLE_TD, &headers));
        assert_eq!(None, Table::find_by_headers(TABLE_TD, &bad_headers));
    }

    #[test]
    fn test_find_by_headers_some() {
        let headers: [&str; 0] = [];
        assert!(Table::find_by_headers(TABLE_TH, &headers).is_some());
        assert!(Table::find_by_headers(TABLE_TH_TD, &headers).is_some());
        assert!(Table::find_by_headers(HTML_TWO_TABLES, &headers).is_some());
        let headers = ["Name"];
        assert!(Table::find_by_headers(TABLE_TH, &headers).is_some());
        assert!(Table::find_by_headers(TABLE_TH_TD, &headers).is_some());
        assert!(Table::find_by_headers(HTML_TWO_TABLES, &headers).is_some());
        // Header order in the query does not have to match the document.
        let headers = ["Age", "Name"];
        assert!(Table::find_by_headers(TABLE_TH, &headers).is_some());
        assert!(Table::find_by_headers(TABLE_TH_TD, &headers).is_some());
        assert!(Table::find_by_headers(HTML_TWO_TABLES, &headers).is_some());
    }

    #[test]
    fn test_find_first_incomplete_fragment() {
        assert!(Table::find_first(HTML_TABLE_FRAGMENT).is_some());
    }

    #[test]
    fn test_headers_empty() {
        let empty = HashMap::new();
        assert_eq!(&empty, Table::find_first(TABLE_TD).unwrap().headers());
        assert_eq!(&empty, Table::find_first(TABLE_TD_TD).unwrap().headers());
    }

    #[test]
    fn test_headers_nonempty() {
        let mut headers = HashMap::new();
        headers.insert("Name".to_string(), 0);
        headers.insert("Age".to_string(), 1);
        assert_eq!(&headers, Table::find_first(TABLE_TH).unwrap().headers());
        assert_eq!(&headers, Table::find_first(TABLE_TH_TD).unwrap().headers());
        assert_eq!(&headers, Table::find_first(TABLE_TH_TH).unwrap().headers());
        headers.insert("Extra".to_string(), 2);
        assert_eq!(
            &headers,
            Table::find_first(TABLE_COMPLEX).unwrap().headers()
        );
    }

    #[test]
    fn test_iter_empty() {
        assert_eq!(0, Table::find_first(TABLE_EMPTY).unwrap().iter().count());
        assert_eq!(0, Table::find_first(TABLE_TH).unwrap().iter().count());
    }

    #[test]
    fn test_iter_nonempty() {
        assert_eq!(1, Table::find_first(TABLE_TD).unwrap().iter().count());
        assert_eq!(1, Table::find_first(TABLE_TH_TD).unwrap().iter().count());
        assert_eq!(2, Table::find_first(TABLE_TD_TD).unwrap().iter().count());
        assert_eq!(1, Table::find_first(TABLE_TH_TH).unwrap().iter().count());
        assert_eq!(4, Table::find_first(TABLE_COMPLEX).unwrap().iter().count());
    }

    #[test]
    fn test_row_is_empty() {
        let table = Table::find_first(TABLE_TD).unwrap();
        assert_eq!(
            vec![false],
            table.iter().map(|r| r.is_empty()).collect::<Vec<_>>()
        );
        let table = Table::find_first(TABLE_COMPLEX).unwrap();
        assert_eq!(
            vec![false, false, true, false],
            table.iter().map(|r| r.is_empty()).collect::<Vec<_>>()
        );
    }

    #[test]
    fn test_row_len() {
        let table = Table::find_first(TABLE_TD).unwrap();
        assert_eq!(vec![2], table.iter().map(|r| r.len()).collect::<Vec<_>>());
        let table = Table::find_first(TABLE_COMPLEX).unwrap();
        assert_eq!(
            vec![2, 3, 0, 4],
            table.iter().map(|r| r.len()).collect::<Vec<_>>()
        );
    }

    #[test]
    fn test_row_get_without_headers() {
        let table = Table::find_first(TABLE_TD).unwrap();
        let mut iter = table.iter();
        let row = iter.next().unwrap();
        assert_eq!(None, row.get(""));
        assert_eq!(None, row.get("foo"));
        assert_eq!(None, row.get("Name"));
        assert_eq!(None, row.get("Age"));
        assert_eq!(None, iter.next());
    }

    #[test]
    fn test_row_get_with_headers() {
        let table = Table::find_first(TABLE_TH_TD).unwrap();
        let mut iter = table.iter();
        let row = iter.next().unwrap();
        assert_eq!(None, row.get(""));
        assert_eq!(None, row.get("foo"));
        assert_eq!(Some("John"), row.get("Name"));
        assert_eq!(Some("20"), row.get("Age"));
        assert_eq!(None, iter.next());
    }

    #[test]
    fn test_row_get_complex() {
        let table = Table::find_first(TABLE_COMPLEX).unwrap();
        let mut iter = table.iter();
        let row = iter.next().unwrap();
        assert_eq!(Some("John"), row.get("Name"));
        assert_eq!(Some("20"), row.get("Age"));
        assert_eq!(None, row.get("Extra"));
        let row = iter.next().unwrap();
        assert_eq!(Some("May"), row.get("Name"));
        assert_eq!(Some("30"), row.get("Age"));
        assert_eq!(Some("foo"), row.get("Extra"));
        let row = iter.next().unwrap();
        assert_eq!(None, row.get("Name"));
        assert_eq!(None, row.get("Age"));
        assert_eq!(None, row.get("Extra"));
        let row = iter.next().unwrap();
        assert_eq!(Some("a"), row.get("Name"));
        assert_eq!(Some("b"), row.get("Age"));
        assert_eq!(Some("c"), row.get("Extra"));
        assert_eq!(None, iter.next());
    }

    #[test]
    fn test_row_as_slice_without_headers() {
        let table = Table::find_first(TABLE_TD).unwrap();
        let mut iter = table.iter();
        assert_eq!(&["Name", "Age"], iter.next().unwrap().as_slice());
        assert_eq!(None, iter.next());
    }

    #[test]
    fn test_row_as_slice_with_headers() {
        let table = Table::find_first(TABLE_TH_TD).unwrap();
        let mut iter = table.iter();
        assert_eq!(&["John", "20"], iter.next().unwrap().as_slice());
        assert_eq!(None, iter.next());
    }

    #[test]
    fn test_row_as_slice_complex() {
        let table = Table::find_first(TABLE_COMPLEX).unwrap();
        let mut iter = table.iter();
        let empty: [&str; 0] = [];
        assert_eq!(&["John", "20"], iter.next().unwrap().as_slice());
        assert_eq!(&["May", "30", "foo"], iter.next().unwrap().as_slice());
        assert_eq!(&empty, iter.next().unwrap().as_slice());
        assert_eq!(&["a", "b", "c", "d"], iter.next().unwrap().as_slice());
        assert_eq!(None, iter.next());
    }

    #[test]
    fn test_row_iter_simple() {
        let table = Table::find_first(TABLE_TD).unwrap();
        let row = table.iter().next().unwrap();
        let mut iter = row.iter();
        assert_eq!(Some("Name"), iter.next().map(String::as_str));
        assert_eq!(Some("Age"), iter.next().map(String::as_str));
        assert_eq!(None, iter.next());
    }

    #[test]
    fn test_row_iter_complex() {
        let table = Table::find_first(TABLE_COMPLEX).unwrap();
        let mut table_iter = table.iter();
        let row = table_iter.next().unwrap();
        let mut iter = row.iter();
        assert_eq!(Some("John"), iter.next().map(String::as_str));
        assert_eq!(Some("20"), iter.next().map(String::as_str));
        assert_eq!(None, iter.next());
        let row = table_iter.next().unwrap();
        let mut iter = row.iter();
        assert_eq!(Some("May"), iter.next().map(String::as_str));
        assert_eq!(Some("30"), iter.next().map(String::as_str));
        assert_eq!(Some("foo"), iter.next().map(String::as_str));
        assert_eq!(None, iter.next());
        let row = table_iter.next().unwrap();
        let mut iter = row.iter();
        assert_eq!(None, iter.next());
        let row = table_iter.next().unwrap();
        let mut iter = row.iter();
        assert_eq!(Some("a"), iter.next().map(String::as_str));
        assert_eq!(Some("b"), iter.next().map(String::as_str));
        assert_eq!(Some("c"), iter.next().map(String::as_str));
        assert_eq!(Some("d"), iter.next().map(String::as_str));
        assert_eq!(None, iter.next());
    }
}