html2md/
tables.rs

1use super::StructuredPrinter;
2use super::TagHandler;
3use super::{clean_markdown, walk};
4
5use std::{cmp, collections::HashMap};
6
7use markup5ever_rcdom::{Handle, NodeData};
8
9#[derive(Default)]
10pub struct TableHandler;
11
12impl TagHandler for TableHandler {
13    fn handle(&mut self, tag: &Handle, printer: &mut StructuredPrinter) {
14        let mut table_markup = String::new();
15        let any_matcher = |cell: &Handle| {
16            let name = tag_name(cell);
17            name == "td" || name == "th"
18        };
19
20        // detect cell width, counts
21        let column_count: usize;
22        let mut column_widths: Vec<usize>;
23        let rows = find_children(tag, "tr");
24        {
25            // detect row count
26            let most_big_row = rows.iter().max_by(|left, right| {
27                collect_children(left, any_matcher)
28                    .len()
29                    .cmp(&collect_children(right, any_matcher).len())
30            });
31            if most_big_row.is_none() {
32                // we don't have rows with content at all
33                return;
34            }
35            // have rows with content, set column count
36            column_count = collect_children(most_big_row.unwrap(), any_matcher).len();
37            column_widths = vec![3; column_count];
38
39            // detect max column width
40            for row in &rows {
41                let cells = collect_children(row, any_matcher);
42                #[allow(clippy::needless_range_loop)]
43                for index in 0..column_count {
44                    // from regular rows
45                    if let Some(cell) = cells.get(index) {
46                        let text = to_text(cell);
47                        column_widths[index] = cmp::max(column_widths[index], text.chars().count());
48                    }
49                }
50            }
51        }
52
53        // header row must always be present
54        for (idx, row) in rows.iter().enumerate() {
55            table_markup.push('|');
56            let cells = collect_children(row, any_matcher);
57            #[allow(clippy::needless_range_loop)]
58            for index in 0..column_count {
59                // we need to fill all cells in a column, even if some rows don't have enough
60                let padded_cell_text = pad_cell_text(&cells.get(index), column_widths[index]);
61                table_markup.push_str(&padded_cell_text);
62                table_markup.push('|');
63            }
64            table_markup.push('\n');
65
66            if idx == 0 {
67                // first row is a header row
68                // add header-body divider row
69                table_markup.push('|');
70                #[allow(clippy::needless_range_loop)]
71                for index in 0..column_count {
72                    let width = column_widths[index];
73                    if width < 3 {
74                        // no point in aligning, just post as-is
75                        table_markup.push_str(&"-".repeat(width));
76                        table_markup.push('|');
77                        continue;
78                    }
79
80                    // try to detect alignment
81                    let mut alignment = String::new();
82                    if let Some(header_cell) = cells.get(index) {
83                        // we have a header, try to extract alignment from it
84                        alignment = match header_cell.data {
85                            NodeData::Element { ref attrs, .. } => {
86                                let attrs = attrs.borrow();
87                                let align_attr = attrs
88                                    .iter()
89                                    .find(|attr| attr.name.local.to_string() == "align");
90                                align_attr
91                                    .map(|attr| attr.value.to_string())
92                                    .unwrap_or_default()
93                            }
94                            _ => String::new(),
95                        };
96                    }
97
98                    // push lines according to alignment, fallback to default behaviour
99                    match alignment.as_ref() {
100                        "left" => {
101                            table_markup.push(':');
102                            table_markup.push_str(&"-".repeat(width - 1));
103                        }
104                        "center" => {
105                            table_markup.push(':');
106                            table_markup.push_str(&"-".repeat(width - 2));
107                            table_markup.push(':');
108                        }
109                        "right" => {
110                            table_markup.push_str(&"-".repeat(width - 1));
111                            table_markup.push(':');
112                        }
113                        _ => table_markup.push_str(&"-".repeat(width)),
114                    }
115                    table_markup.push('|');
116                }
117                table_markup.push('\n');
118            }
119        }
120
121        printer.insert_newline();
122        printer.insert_newline();
123        printer.append_str(&table_markup);
124    }
125
126    fn after_handle(&mut self, _printer: &mut StructuredPrinter) {}
127
128    fn skip_descendants(&self) -> bool {
129        true
130    }
131}
132
133/// Pads cell text from right and left so it looks centered inside the table cell
134/// ### Arguments
135/// `tag` - optional reference to currently processed handle, text is extracted from here
136///
137/// `column_width` - precomputed column width to compute padding length from
138fn pad_cell_text(tag: &Option<&Handle>, column_width: usize) -> String {
139    let mut result = String::new();
140    if let Some(cell) = tag {
141        // have header at specified position
142        let text = to_text(cell);
143        // compute difference between width and text length
144        let len_diff = column_width - text.chars().count();
145        if len_diff > 0 {
146            // should pad
147            if len_diff > 1 {
148                // should pad from both sides
149                let pad_len = len_diff / 2;
150                let remainder = len_diff % 2;
151                result.push_str(&" ".repeat(pad_len));
152                result.push_str(&text);
153                result.push_str(&" ".repeat(pad_len + remainder));
154            } else {
155                // it's just one space, add at the end
156                result.push_str(&text);
157                result.push(' ');
158            }
159        } else {
160            // shouldn't pad, text fills whole cell
161            result.push_str(&text);
162        }
163    } else {
164        // no text in this cell, fill cell with spaces
165        let pad_len = column_width;
166        result.push_str(&" ".repeat(pad_len));
167    }
168
169    result
170}
171
172/// Extracts tag name from passed tag
173/// Returns empty string if it's not an html element
174fn tag_name(tag: &Handle) -> String {
175    match tag.data {
176        NodeData::Element { ref name, .. } => name.local.to_string(),
177        _ => String::new(),
178    }
179}
180
181/// Find descendants of this tag with tag name `name`
182/// This includes both direct children and descendants
183fn find_children(tag: &Handle, name: &str) -> Vec<Handle> {
184    let mut result: Vec<Handle> = vec![];
185    let children = tag.children.borrow();
186    for child in children.iter() {
187        if tag_name(child) == name {
188            result.push(child.clone());
189        }
190
191        let mut descendants = find_children(child, name);
192        result.append(&mut descendants);
193    }
194
195    result
196}
197
198/// Collect direct children that satisfy the predicate
199/// This doesn't include descendants
200fn collect_children<P>(tag: &Handle, predicate: P) -> Vec<Handle>
201where
202    P: Fn(&Handle) -> bool,
203{
204    let mut result: Vec<Handle> = vec![];
205    let children = tag.children.borrow();
206    for child in children.iter() {
207        let candidate = child.clone();
208        if predicate(&candidate) {
209            result.push(candidate);
210        }
211    }
212
213    result
214}
215
216/// Convert html tag to text. This collects all tag children in correct order where they're observed
217/// and concatenates their text, recursively.
218fn to_text(tag: &Handle) -> String {
219    let mut printer = StructuredPrinter::default();
220    walk(tag, &mut printer, &HashMap::default());
221
222    let result = clean_markdown(&printer.data);
223    result.replace("\n", "<br/>")
224}