html2md/
tables.rs

1use crate::markup5ever_rcdom;
2
3use super::StructuredPrinter;
4use super::TagHandler;
5use super::{clean_markdown, walk};
6
7use std::{cmp, collections::HashMap};
8
9use markup5ever_rcdom::{Handle, NodeData};
10
11#[derive(Default)]
12pub struct TableHandler;
13
14impl TagHandler for TableHandler {
15    fn handle(&mut self, tag: &Handle, printer: &mut StructuredPrinter) {
16        let mut table_markup = String::new();
17        let any_matcher = |cell: &Handle| {
18            let name = tag_name(cell);
19            name == "td" || name == "th"
20        };
21
22        // detect cell width, counts
23        let column_count: usize;
24        let mut column_widths: Vec<usize>;
25        let rows = find_children(tag, "tr");
26        {
27            // detect row count
28            let most_big_row = rows.iter().max_by(|left, right| {
29                collect_children(left, any_matcher)
30                    .len()
31                    .cmp(&collect_children(right, any_matcher).len())
32            });
33            if most_big_row.is_none() {
34                // we don't have rows with content at all
35                return;
36            }
37            // have rows with content, set column count
38            column_count = collect_children(most_big_row.unwrap(), any_matcher).len();
39            column_widths = vec![3; column_count];
40
41            // detect max column width
42            for row in &rows {
43                let cells = collect_children(row, any_matcher);
44                #[allow(clippy::needless_range_loop)]
45                for index in 0..column_count {
46                    // from regular rows
47                    if let Some(cell) = cells.get(index) {
48                        let text = to_text(cell);
49                        column_widths[index] = cmp::max(column_widths[index], text.chars().count());
50                    }
51                }
52            }
53        }
54
55        // header row must always be present
56        for (idx, row) in rows.iter().enumerate() {
57            table_markup.push('|');
58            let cells = collect_children(row, any_matcher);
59            #[allow(clippy::needless_range_loop)]
60            for index in 0..column_count {
61                // we need to fill all cells in a column, even if some rows don't have enough
62                let padded_cell_text = pad_cell_text(&cells.get(index), column_widths[index]);
63                table_markup.push_str(&padded_cell_text);
64                table_markup.push('|');
65            }
66            table_markup.push('\n');
67
68            if idx == 0 {
69                // first row is a header row
70                // add header-body divider row
71                table_markup.push('|');
72                #[allow(clippy::needless_range_loop)]
73                for index in 0..column_count {
74                    let width = column_widths[index];
75                    if width < 3 {
76                        // no point in aligning, just post as-is
77                        table_markup.push_str(&"-".repeat(width));
78                        table_markup.push('|');
79                        continue;
80                    }
81
82                    // try to detect alignment
83                    let mut alignment = String::new();
84                    if let Some(header_cell) = cells.get(index) {
85                        // we have a header, try to extract alignment from it
86                        alignment = match header_cell.data {
87                            NodeData::Element { ref attrs, .. } => {
88                                let attrs = attrs.borrow();
89                                let align_attr = attrs
90                                    .iter()
91                                    .find(|attr| attr.name.local.to_string() == "align");
92                                align_attr
93                                    .map(|attr| attr.value.to_string())
94                                    .unwrap_or_default()
95                            }
96                            _ => String::new(),
97                        };
98                    }
99
100                    // push lines according to alignment, fallback to default behaviour
101                    match alignment.as_ref() {
102                        "left" => {
103                            table_markup.push(':');
104                            table_markup.push_str(&"-".repeat(width - 1));
105                        }
106                        "center" => {
107                            table_markup.push(':');
108                            table_markup.push_str(&"-".repeat(width - 2));
109                            table_markup.push(':');
110                        }
111                        "right" => {
112                            table_markup.push_str(&"-".repeat(width - 1));
113                            table_markup.push(':');
114                        }
115                        _ => table_markup.push_str(&"-".repeat(width)),
116                    }
117                    table_markup.push('|');
118                }
119                table_markup.push('\n');
120            }
121        }
122
123        printer.insert_newline();
124        printer.insert_newline();
125        printer.append_str(&table_markup);
126    }
127
128    fn after_handle(&mut self, _printer: &mut StructuredPrinter) {}
129
130    fn skip_descendants(&self) -> bool {
131        true
132    }
133}
134
135/// Pads cell text from right and left so it looks centered inside the table cell
136/// ### Arguments
137/// `tag` - optional reference to currently processed handle, text is extracted from here
138///
139/// `column_width` - precomputed column width to compute padding length from
140fn pad_cell_text(tag: &Option<&Handle>, column_width: usize) -> String {
141    let mut result = String::new();
142    if let Some(cell) = tag {
143        // have header at specified position
144        let text = to_text(cell);
145        // compute difference between width and text length
146        let len_diff = column_width - text.chars().count();
147        if len_diff > 0 {
148            // should pad
149            if len_diff > 1 {
150                // should pad from both sides
151                let pad_len = len_diff / 2;
152                let remainder = len_diff % 2;
153                result.push_str(&" ".repeat(pad_len));
154                result.push_str(&text);
155                result.push_str(&" ".repeat(pad_len + remainder));
156            } else {
157                // it's just one space, add at the end
158                result.push_str(&text);
159                result.push(' ');
160            }
161        } else {
162            // shouldn't pad, text fills whole cell
163            result.push_str(&text);
164        }
165    } else {
166        // no text in this cell, fill cell with spaces
167        let pad_len = column_width;
168        result.push_str(&" ".repeat(pad_len));
169    }
170
171    result
172}
173
174/// Extracts tag name from passed tag
175/// Returns empty string if it's not an html element
176fn tag_name(tag: &Handle) -> String {
177    match tag.data {
178        NodeData::Element { ref name, .. } => name.local.to_string(),
179        _ => String::new(),
180    }
181}
182
183/// Find descendants of this tag with tag name `name`
184/// This includes both direct children and descendants
185fn find_children(tag: &Handle, name: &str) -> Vec<Handle> {
186    let mut result: Vec<Handle> = vec![];
187    let children = tag.children.borrow();
188    for child in children.iter() {
189        if tag_name(child) == name {
190            result.push(child.clone());
191        }
192
193        let mut descendants = find_children(child, name);
194        result.append(&mut descendants);
195    }
196
197    result
198}
199
200/// Collect direct children that satisfy the predicate
201/// This doesn't include descendants
202fn collect_children<P>(tag: &Handle, predicate: P) -> Vec<Handle>
203where
204    P: Fn(&Handle) -> bool,
205{
206    let mut result: Vec<Handle> = vec![];
207    let children = tag.children.borrow();
208    for child in children.iter() {
209        let candidate = child.clone();
210        if predicate(&candidate) {
211            result.push(candidate);
212        }
213    }
214
215    result
216}
217
218/// Convert html tag to text. This collects all tag children in correct order where they're observed
219/// and concatenates their text, recursively.
220fn to_text(tag: &Handle) -> String {
221    let mut printer = StructuredPrinter::default();
222    walk(tag, &mut printer, &HashMap::default());
223
224    let result = clean_markdown(&printer.data);
225    result.replace("\n", "<br/>")
226}