Skip to main content

panache_parser/
grid_layout.rs

1//! Shared 2D geometry pass for grid tables.
2//!
3//! A grid table's logical cells are rectangles over a canonical column/row
4//! grid, and a spanning cell's content is *non-contiguous* in the byte stream
5//! (a rowspan cell's text is interleaved with other cells' bytes and separator
6//! lines). A rowan CST node covers a single contiguous range, so the cell
7//! tiling cannot be represented as CST nodes and must be recovered by a 2D pass
8//! downstream of the parser. This module is the single home for that pass,
9//! consumed by both the pandoc-native projector (`pandoc_ast::grid_table`) and
10//! the formatter's spanning-grid engine, so the geometry is computed one way.
11//!
12//! The algorithm mirrors pandoc's `gridtables`: build a padded char grid,
13//! take the canonical column boundaries as the union of `+` positions across
14//! every "sep-style" line and the canonical row boundaries as those lines'
15//! indices, then detect each cell as the smallest valid bounding rectangle.
16//! Positions are **character** indices (matching pandoc, which lays grid tables
17//! out on the source character grid), not display columns.
18
19use std::collections::BTreeSet;
20
/// One laid-out cell of a grid table over the canonical (row band × fine
/// column) grid. `content` is the cell's interior text with one leading pad
/// space stripped per line, trailing whitespace trimmed, and leading/trailing
/// blank lines dropped, joined with `\n`.
///
/// The cell covers row bands `start_row..start_row + row_span` and fine
/// columns `start_col..start_col + col_span`; both indices are positions in
/// the canonical grid (see [`GridLayout`]), not line or character offsets.
#[derive(Debug, Clone)]
pub struct GridCellRect {
    /// Index of the first row band the cell occupies (the band that starts at
    /// the separator line holding the cell's top-left `+`).
    pub start_row: usize,
    /// Index of the first fine column the cell occupies (the column that
    /// starts at the cell's top-left `+`).
    pub start_col: usize,
    /// Number of row bands the cell spans; at least 1 for cells produced by
    /// `analyze_grid`.
    pub row_span: usize,
    /// Number of fine columns the cell spans; at least 1 for cells produced by
    /// `analyze_grid`.
    pub col_span: usize,
    /// Cleaned-up interior text of the cell; empty when the cell has no
    /// non-blank line.
    pub content: String,
}
33
/// Canonical geometry of a grid table plus its detected cells.
///
/// Produced by `analyze_grid`, which only returns a layout when there are at
/// least two column boundaries and at least two separator lines, so both
/// `cols_pos` and `row_seps` hold two or more entries.
#[derive(Debug, Clone)]
pub struct GridLayout {
    /// Character columns of the canonical vertical boundaries (the union of
    /// `+` positions across all sep-style lines), in ascending order without
    /// duplicates. `cols_pos.len() - 1` fine columns.
    pub cols_pos: Vec<usize>,
    /// Indices into the input `lines` of the sep-style lines (canonical row
    /// boundaries), in ascending order. `row_seps.len() - 1` row bands.
    pub row_seps: Vec<usize>,
    /// Detected cells, in the order their top-left corners are visited: row
    /// band by row band, and left to right within a band. Grid positions
    /// whose corner is missing or for which no closed rectangle is found
    /// contribute no entry.
    pub cells: Vec<GridCellRect>,
}
46
47/// Analyze a grid table's lines into its canonical geometry and cell tiling.
48///
49/// `lines` must already be dedented to the table's own left edge (no container
50/// indent). Returns `None` when the input doesn't form a grid (fewer than two
51/// column boundaries or fewer than two separator lines).
52#[allow(clippy::needless_range_loop)]
53pub fn analyze_grid(lines: &[&str]) -> Option<GridLayout> {
54    if lines.is_empty() {
55        return None;
56    }
57
58    // Pad lines into a 2D char grid.
59    let max_width = lines.iter().map(|l| l.chars().count()).max().unwrap_or(0);
60    let grid: Vec<Vec<char>> = lines
61        .iter()
62        .map(|l| {
63            let mut chars: Vec<char> = l.chars().collect();
64            chars.resize(max_width, ' ');
65            chars
66        })
67        .collect();
68    let nlines = grid.len();
69
70    // A line is "sep-style" if it contains at least one `+` and no chars
71    // outside `+`/`-`/`=`/`:`/`|`/` `. Partial separators (lines mixing `|`
72    // and `+`) qualify; content lines do not.
73    let is_sep_line: Vec<bool> = grid
74        .iter()
75        .map(|row| {
76            row.contains(&'+')
77                && row
78                    .iter()
79                    .all(|&c| matches!(c, '+' | '-' | '=' | ':' | '|' | ' '))
80        })
81        .collect();
82
83    // Canonical column boundaries: union of `+` columns across all sep-style lines.
84    let mut col_set: BTreeSet<usize> = BTreeSet::new();
85    for (i, row) in grid.iter().enumerate() {
86        if !is_sep_line[i] {
87            continue;
88        }
89        for (j, &c) in row.iter().enumerate() {
90            if c == '+' {
91                col_set.insert(j);
92            }
93        }
94    }
95    let cols_pos: Vec<usize> = col_set.into_iter().collect();
96    if cols_pos.len() < 2 {
97        return None;
98    }
99    let ncols = cols_pos.len() - 1;
100
101    // Canonical row boundaries: line indices of sep-style lines.
102    let row_seps: Vec<usize> = (0..nlines).filter(|&i| is_sep_line[i]).collect();
103    if row_seps.len() < 2 {
104        return None;
105    }
106    let nrows = row_seps.len() - 1;
107
108    // Detect cells.
109    let mut occupied = vec![vec![false; ncols]; nrows];
110    let mut cells: Vec<GridCellRect> = Vec::new();
111    for sr in 0..nrows {
112        for sc in 0..ncols {
113            if occupied[sr][sc] {
114                continue;
115            }
116            let i = row_seps[sr];
117            let j = cols_pos[sc];
118            if grid[i][j] != '+' {
119                // No corner here — the canonical column is missing on this
120                // sep line, meaning the cell that owns this position must
121                // have been emitted earlier and `occupied` should already be
122                // set. If not, the table is malformed; skip.
123                continue;
124            }
125            let Some((er, ec, content)) = find_grid_cell(&grid, i, j, sr, sc, &cols_pos, &row_seps)
126            else {
127                continue;
128            };
129            for r in sr..er {
130                for c in sc..ec {
131                    occupied[r][c] = true;
132                }
133            }
134            cells.push(GridCellRect {
135                start_row: sr,
136                start_col: sc,
137                row_span: er - sr,
138                col_span: ec - sc,
139                content,
140            });
141        }
142    }
143
144    Some(GridLayout {
145        cols_pos,
146        row_seps,
147        cells,
148    })
149}
150
/// Locate the cell whose top-left `+` sits at char-grid position `(i, j)`.
///
/// `(sr, sc)` are the canonical row-band / fine-column indices of that corner,
/// i.e. `row_seps[sr] == i` and `cols_pos[sc] == j` for a well-formed call.
/// Candidate right boundaries are tried from the nearest canonical column
/// outwards and, for each, candidate bottom boundaries from the nearest
/// separator line downwards; the first closed rectangle wins.
///
/// Returns `(end_row_idx, end_col_idx, content_text)`: the cell covers
/// canonical rows `sr..end_row_idx` and canonical columns `sc..end_col_idx`.
/// `content_text` is the interior text with one leading pad space stripped
/// per line, trailing whitespace trimmed, leading/trailing blank lines
/// dropped, and the remaining lines joined with `\n`. Returns `None` when no
/// closed rectangle starts at this corner.
///
/// The grid is indexed directly, so the coordinates in `cols_pos` and
/// `row_seps` are expected to lie inside `grid`.
// `ec` / `er` are both slice indices and part of the return value, so the
// loops stay as index ranges.
#[allow(clippy::needless_range_loop)]
fn find_grid_cell(
    grid: &[Vec<char>],
    i: usize,
    j: usize,
    sr: usize,
    sc: usize,
    cols_pos: &[usize],
    row_seps: &[usize],
) -> Option<(usize, usize, String)> {
    let last_band = row_seps.len() - 1;
    let last_col = cols_pos.len() - 1;

    // Characters allowed on a horizontal border (intermediate `+`s included)
    // and on a vertical border respectively.
    let is_hborder = |ch: char| matches!(ch, '-' | '=' | ':' | '+');
    let is_vborder = |ch: char| matches!(ch, '|' | '+');

    for ec in (sc + 1)..=last_col {
        let right = cols_pos[ec];

        // True when `line` is unbroken border strictly between the candidate
        // left and right boundaries.
        let hline_spans = |line: usize| (j + 1..right).all(|col| is_hborder(grid[line][col]));

        // The top edge ran into a `|` or a space: the cell cannot reach this
        // boundary nor any boundary further right.
        if !hline_spans(i) {
            break;
        }

        for er in (sr + 1)..=last_band {
            let bottom = row_seps[er];

            // True when column `col` is unbroken border strictly between the
            // top line and the candidate bottom line.
            let vline_spans =
                |col: usize| (i + 1..bottom).all(|line| is_vborder(grid[line][col]));

            // A broken left edge only gets worse further down, so stop
            // growing downwards for this right boundary.
            if !vline_spans(j) {
                break;
            }

            // The rectangle is closed when the right and bottom edges are
            // unbroken, both bottom corners are `+`, and no interior line
            // cuts the rectangle from its left to its right boundary.
            let closed = vline_spans(right)
                && hline_spans(bottom)
                && grid[bottom][j] == '+'
                && grid[bottom][right] == '+'
                && !(i + 1..bottom).any(|line| {
                    grid[line][j] == '+' && grid[line][right] == '+' && hline_spans(line)
                });
            if !closed {
                continue;
            }

            // Interior text: per line, the chars strictly between the left
            // and right boundaries, minus one pad space and trailing blanks.
            let text_rows: Vec<String> = (i + 1..bottom)
                .map(|line| {
                    let raw: String = grid[line][j + 1..right].iter().collect();
                    let unpadded = raw.strip_prefix(' ').unwrap_or(&raw);
                    unpadded.trim_end().to_string()
                })
                .collect();

            // Keep only the span from the first to the last non-blank line.
            let content = match text_rows.iter().position(|row| !row.is_empty()) {
                None => String::new(),
                Some(first) => {
                    let last = text_rows
                        .iter()
                        .rposition(|row| !row.is_empty())
                        .unwrap_or(first);
                    text_rows[first..=last].join("\n")
                }
            };
            return Some((er, ec, content));
        }
    }
    None
}