html2md 0.2.8

Library and binary to convert simple HTML documents into Markdown
Documentation
use super::TagHandler;
use super::StructuredPrinter;

use std::cmp;

use html5ever::rcdom::{Handle,NodeData};

/// Tag handler that renders an HTML table as a Markdown table.
///
/// This is a unit struct with no fields: everything needed for rendering is
/// derived from the element passed to `handle`.
#[derive(Default)]
pub(super) struct TableHandler;

impl TagHandler for TableHandler {

    /// Renders the table rooted at `tag` as a Markdown (pipe) table and appends it to `printer`.
    ///
    /// The output consists of a header row, a delimiter row and one row per `tr` that
    /// contains at least one `td`. Every row is padded to the same number of columns.
    ///
    /// * Rows are gathered with `find_children`, which walks all descendants, so `tr`
    ///   elements of nested tables end up in the same flat row list.
    /// * The header is taken from the first row that has `th` children. When no such row
    ///   exists an empty header row is still emitted, because a pipe table needs a header
    ///   line above its delimiter row.
    /// * Column alignment is read from the `align` attribute of the matching header cell
    ///   (`left`, `center`, `right`, compared ASCII case-insensitively).
    /// * Nothing is printed when the table has no rows or none of its rows has a cell.
    ///
    /// ### Arguments
    /// `tag` - handle of the table element being converted
    ///
    /// `printer` - output sink; receives two newlines followed by the table markup
    fn handle(&mut self, tag: &Handle, printer: &mut StructuredPrinter) {
        // Narrowest column we ever emit: an aligned delimiter such as `:-:` needs three characters.
        const MIN_COLUMN_WIDTH: usize = 3;

        let td_matcher = |cell: Handle| tag_name(&cell) == "td";
        let th_matcher = |cell: Handle| tag_name(&cell) == "th";
        let any_matcher = |cell: Handle| {
            let name = tag_name(&cell);
            name == "td" || name == "th"
        };

        let rows = find_children(tag, "tr");

        // The row with the most cells determines the number of columns.
        let column_count = rows
            .iter()
            .map(|row| collect_children(row, any_matcher).len())
            .max()
            .unwrap_or(0);
        if column_count == 0 {
            // no rows at all, or only rows without any cells: nothing to render
            return;
        }

        // Detect the width of every column. Widths are measured in bytes (`str::len`).
        // Cells are rendered below by their position among `th`-only (header) or `td`-only
        // (body) siblings, so measure under those indexings as well as the combined one;
        // otherwise a row mixing `th` and `td` could put text into a column narrower than it.
        let mut column_widths = vec![MIN_COLUMN_WIDTH; column_count];
        for row in &rows {
            let groupings = [
                collect_children(row, any_matcher),
                collect_children(row, th_matcher),
                collect_children(row, td_matcher),
            ];
            for cells in groupings.iter() {
                for (width, cell) in column_widths.iter_mut().zip(cells.iter()) {
                    *width = cmp::max(*width, to_text(cell).len());
                }
            }
        }

        // Header cells come from the first row that has any `th`; empty when there is none.
        let header_cells: Vec<Handle> = rows
            .iter()
            .map(|row| collect_children(row, th_matcher))
            .find(|cells| !cells.is_empty())
            .unwrap_or_default();

        let mut table_markup = String::new();

        // Header row. Always emitted; missing cells are rendered as blank padding.
        table_markup.push('|');
        for (index, &width) in column_widths.iter().enumerate() {
            table_markup.push_str(&pad_cell_text(&header_cells.get(index), width));
            table_markup.push('|');
        }
        table_markup.push('\n');

        // Header-body delimiter row, carrying the alignment markers.
        table_markup.push('|');
        for (index, &width) in column_widths.iter().enumerate() {
            // try to extract the alignment from the header cell of this column
            let mut alignment = String::new();
            if let Some(header_cell) = header_cells.get(index) {
                if let NodeData::Element { ref attrs, .. } = header_cell.data {
                    let attrs = attrs.borrow();
                    alignment = attrs
                        .iter()
                        .find(|attr| attr.name.local.to_string() == "align")
                        .map(|attr| attr.value.to_string().to_ascii_lowercase())
                        .unwrap_or_default();
                }
            }

            // `width >= MIN_COLUMN_WIDTH` holds here, so the subtractions cannot underflow.
            match alignment.as_str() {
                "left" => {
                    table_markup.push(':');
                    table_markup.push_str(&"-".repeat(width - 1));
                }
                "center" => {
                    table_markup.push(':');
                    table_markup.push_str(&"-".repeat(width - 2));
                    table_markup.push(':');
                }
                "right" => {
                    table_markup.push_str(&"-".repeat(width - 1));
                    table_markup.push(':');
                }
                // unknown or missing alignment: plain divider
                _ => table_markup.push_str(&"-".repeat(width)),
            }
            table_markup.push('|');
        }
        table_markup.push('\n');

        // Body rows: only rows that contain at least one `td`.
        for row in &rows {
            let cells = collect_children(row, td_matcher);
            if cells.is_empty() {
                // header-only (or empty) row, already handled above
                continue;
            }

            table_markup.push('|');
            // fill every column, even if this row doesn't have enough cells
            for (index, &width) in column_widths.iter().enumerate() {
                table_markup.push_str(&pad_cell_text(&cells.get(index), width));
                table_markup.push('|');
            }
            table_markup.push('\n');
        }

        printer.insert_newline();
        printer.insert_newline();
        printer.append_str(&table_markup);
    }

    /// Does nothing: the whole table is written by `handle`.
    fn after_handle(&mut self, _printer: &mut StructuredPrinter) {}

    /// Always returns `true`: `handle` reads the table's descendants itself.
    fn skip_descendants(&self) -> bool {
        true
    }
}

/// Pads cell text from right and left so it looks centered inside the table cell.
///
/// The text length is counted in characters rather than bytes, so multi-byte text is
/// padded to the same visible width as ASCII text. When the padding cannot be split
/// evenly the extra space goes to the right. Text that is already as wide as, or wider
/// than, the column is returned unpadded instead of underflowing the width computation.
/// A missing cell yields `column_width` spaces.
/// ### Arguments
/// `tag` - optional reference to currently processed handle, text is extracted from here
///
/// `column_width` - precomputed column width to compute padding length from
fn pad_cell_text(tag: &Option<&Handle>, column_width: usize) -> String {
    // a missing cell is rendered exactly like a cell with empty text
    let text = match tag {
        Some(cell) => to_text(cell),
        None => String::new(),
    };

    // saturate: the text may be longer than the column it is placed in
    let padding = column_width.saturating_sub(text.chars().count());
    let left = padding / 2;
    let right = padding - left;

    let mut result = String::with_capacity(text.len() + padding);
    result.push_str(&" ".repeat(left));
    result.push_str(&text);
    result.push_str(&" ".repeat(right));
    result
}

/// Returns the local name of `tag` when it is an element node.
/// Any other kind of node yields an empty string.
fn tag_name(tag: &Handle) -> String {
    if let NodeData::Element { ref name, .. } = tag.data {
        name.local.to_string()
    } else {
        String::new()
    }
}

/// Collects every descendant of `tag` whose tag name equals `name`.
/// Both direct children and deeper descendants are returned, each matching
/// node appearing before the matches found beneath it.
fn find_children(tag: &Handle, name: &str) -> Vec<Handle> {
    /// Depth-first walk that appends matches below `node` to `found`.
    fn gather(node: &Handle, name: &str, found: &mut Vec<Handle>) {
        for child in node.children.borrow().iter() {
            if tag_name(child) == name {
                found.push(child.clone());
            }
            gather(child, name, found);
        }
    }

    let mut found: Vec<Handle> = Vec::new();
    gather(tag, name, &mut found);
    found
}

/// Collect direct children that satisfy the predicate
/// This doesn't include descendants
fn collect_children<P>(tag: &Handle, predicate: P) -> Vec<Handle>
where P: Fn(Handle) -> bool {
    let mut result: Vec<Handle> = vec![];
    let children = tag.children.borrow();
    for child in children.iter() {
        if predicate(child.clone()) {
            result.push(child.clone());
        }
    }

    return result;
}

/// Flattens `tag` into plain text.
/// The node's own text (if it is a text node) comes first, followed by the text of its
/// children in the order they appear, gathered recursively. Each text node is trimmed
/// on its own before being concatenated.
fn to_text(tag: &Handle) -> String {
    let mut text = String::new();
    if let NodeData::Text { ref contents } = tag.data {
        text.push_str(contents.borrow().trim());
    }
    for child in tag.children.borrow().iter() {
        text.push_str(&to_text(child));
    }
    text
}