panache_parser/grid_layout.rs
1//! Shared 2D geometry pass for grid tables.
2//!
3//! A grid table's logical cells are rectangles over a canonical column/row
4//! grid, and a spanning cell's content is *non-contiguous* in the byte stream
5//! (a rowspan cell's text is interleaved with other cells' bytes and separator
6//! lines). A rowan CST node covers a single contiguous range, so the cell
7//! tiling cannot be represented as CST nodes and must be recovered by a 2D pass
8//! downstream of the parser. This module is the single home for that pass,
9//! consumed by both the pandoc-native projector (`pandoc_ast::grid_table`) and
10//! the formatter's spanning-grid engine, so the geometry is computed one way.
11//!
12//! The algorithm mirrors pandoc's `gridtables`: build a padded char grid,
13//! take the canonical column boundaries as the union of `+` positions across
14//! every "sep-style" line and the canonical row boundaries as those lines'
15//! indices, then detect each cell as the smallest valid bounding rectangle.
16//! Positions are **character** indices (matching pandoc, which lays grid tables
17//! out on the source character grid), not display columns.
18
19use std::collections::BTreeSet;
20
/// One laid-out cell of a grid table over the canonical (row band × fine
/// column) grid.
///
/// The cell covers row bands `start_row..start_row + row_span` and fine
/// columns `start_col..start_col + col_span`. `content` is the cell's
/// interior text with one leading pad space stripped per line, trailing
/// whitespace trimmed, and leading/trailing blank lines dropped, joined with
/// `\n`.
///
/// Cells compare by value (`PartialEq`/`Eq`), so layouts can be checked with
/// `assert_eq!` in tests and deduplicated by callers.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GridCellRect {
    /// Index of the first row band the cell occupies.
    pub start_row: usize,
    /// Index of the first fine column the cell occupies.
    pub start_col: usize,
    /// Number of row bands the cell spans (at least 1 for a detected cell).
    pub row_span: usize,
    /// Number of fine columns the cell spans (at least 1 for a detected cell).
    pub col_span: usize,
    /// Cleaned interior text of the cell, lines joined with `\n`.
    pub content: String,
}

/// Canonical geometry of a grid table plus its detected cells.
///
/// Compares by value (`PartialEq`/`Eq`): two layouts are equal when their
/// boundaries and their cells (in detection order) are equal.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GridLayout {
    /// Character columns of the canonical vertical boundaries (the union of
    /// `+` positions across all sep-style lines). `cols_pos.len() - 1` fine
    /// columns.
    pub cols_pos: Vec<usize>,
    /// Indices into the input `lines` of the sep-style lines (canonical row
    /// boundaries). `row_seps.len() - 1` row bands.
    pub row_seps: Vec<usize>,
    /// Detected cells, in row-major order of their top-left corner.
    pub cells: Vec<GridCellRect>,
}
46
47/// Analyze a grid table's lines into its canonical geometry and cell tiling.
48///
49/// `lines` must already be dedented to the table's own left edge (no container
50/// indent). Returns `None` when the input doesn't form a grid (fewer than two
51/// column boundaries or fewer than two separator lines).
52#[allow(clippy::needless_range_loop)]
53pub fn analyze_grid(lines: &[&str]) -> Option<GridLayout> {
54 if lines.is_empty() {
55 return None;
56 }
57
58 // Pad lines into a 2D char grid.
59 let max_width = lines.iter().map(|l| l.chars().count()).max().unwrap_or(0);
60 let grid: Vec<Vec<char>> = lines
61 .iter()
62 .map(|l| {
63 let mut chars: Vec<char> = l.chars().collect();
64 chars.resize(max_width, ' ');
65 chars
66 })
67 .collect();
68 let nlines = grid.len();
69
70 // A line is "sep-style" if it contains at least one `+` and no chars
71 // outside `+`/`-`/`=`/`:`/`|`/` `. Partial separators (lines mixing `|`
72 // and `+`) qualify; content lines do not.
73 let is_sep_line: Vec<bool> = grid
74 .iter()
75 .map(|row| {
76 row.contains(&'+')
77 && row
78 .iter()
79 .all(|&c| matches!(c, '+' | '-' | '=' | ':' | '|' | ' '))
80 })
81 .collect();
82
83 // Canonical column boundaries: union of `+` columns across all sep-style lines.
84 let mut col_set: BTreeSet<usize> = BTreeSet::new();
85 for (i, row) in grid.iter().enumerate() {
86 if !is_sep_line[i] {
87 continue;
88 }
89 for (j, &c) in row.iter().enumerate() {
90 if c == '+' {
91 col_set.insert(j);
92 }
93 }
94 }
95 let cols_pos: Vec<usize> = col_set.into_iter().collect();
96 if cols_pos.len() < 2 {
97 return None;
98 }
99 let ncols = cols_pos.len() - 1;
100
101 // Canonical row boundaries: line indices of sep-style lines.
102 let row_seps: Vec<usize> = (0..nlines).filter(|&i| is_sep_line[i]).collect();
103 if row_seps.len() < 2 {
104 return None;
105 }
106 let nrows = row_seps.len() - 1;
107
108 // Detect cells.
109 let mut occupied = vec![vec![false; ncols]; nrows];
110 let mut cells: Vec<GridCellRect> = Vec::new();
111 for sr in 0..nrows {
112 for sc in 0..ncols {
113 if occupied[sr][sc] {
114 continue;
115 }
116 let i = row_seps[sr];
117 let j = cols_pos[sc];
118 if grid[i][j] != '+' {
119 // No corner here — the canonical column is missing on this
120 // sep line, meaning the cell that owns this position must
121 // have been emitted earlier and `occupied` should already be
122 // set. If not, the table is malformed; skip.
123 continue;
124 }
125 let Some((er, ec, content)) = find_grid_cell(&grid, i, j, sr, sc, &cols_pos, &row_seps)
126 else {
127 continue;
128 };
129 for r in sr..er {
130 for c in sc..ec {
131 occupied[r][c] = true;
132 }
133 }
134 cells.push(GridCellRect {
135 start_row: sr,
136 start_col: sc,
137 row_span: er - sr,
138 col_span: ec - sc,
139 content,
140 });
141 }
142 }
143
144 Some(GridLayout {
145 cols_pos,
146 row_seps,
147 cells,
148 })
149}
150
/// Locate the smallest valid cell whose top-left corner sits at char-grid
/// position `(i, j)`.
///
/// `sr` / `sc` are the canonical row-band / fine-column indices of that
/// corner. Candidate rectangles are tried narrowest first, then shortest
/// first; the first one whose four edges are intact, whose bottom corners
/// are `+`, and which is not cut in two by an interior separator wins.
///
/// Returns `(end_row_idx, end_col_idx, content_text)`: the cell occupies
/// canonical rows `sr..end_row_idx` and canonical columns
/// `sc..end_col_idx`. The content is the interior text with one leading pad
/// space stripped per line, trailing whitespace trimmed, leading/trailing
/// blank lines dropped, and the remaining lines joined with `\n`. Returns
/// `None` when no rectangle starting at this corner is valid.
// The range loops are kept on purpose: `ec` / `er` are both lookup indices
// into the boundary tables and part of the returned value.
#[allow(clippy::needless_range_loop)]
fn find_grid_cell(
    grid: &[Vec<char>],
    i: usize,
    j: usize,
    sr: usize,
    sc: usize,
    cols_pos: &[usize],
    row_seps: &[usize],
) -> Option<(usize, usize, String)> {
    // Characters allowed along a horizontal edge (intermediate `+`s are OK).
    fn is_rule(ch: char) -> bool {
        matches!(ch, '-' | '=' | ':' | '+')
    }
    // Characters allowed along a vertical edge.
    fn is_wall(ch: char) -> bool {
        matches!(ch, '|' | '+')
    }

    let last_band = row_seps.len() - 1;
    let last_col = cols_pos.len() - 1;

    for ec in (sc + 1)..=last_col {
        let right = cols_pos[ec];
        // True when `line` is a horizontal rule strictly between the cell's
        // left and right boundary columns.
        let rule_between = |line: usize| (j + 1..right).all(|col| is_rule(grid[line][col]));

        if !rule_between(i) {
            // The top edge hit a `|` or a space: no wider candidate can be
            // valid either.
            break;
        }

        for er in (sr + 1)..=last_band {
            let bottom = row_seps[er];
            // True when column `col` is an unbroken wall strictly between the
            // top and bottom separator lines.
            let wall_down = |col: usize| (i + 1..bottom).all(|row| is_wall(grid[row][col]));

            if !wall_down(j) {
                // A broken left edge stays broken for every taller candidate.
                break;
            }

            let closed = wall_down(right)
                && rule_between(bottom)
                && grid[bottom][j] == '+'
                && grid[bottom][right] == '+';
            if !closed {
                continue;
            }

            // Reject the rectangle if some interior line is itself a full
            // separator across it (`+` at both sides, rule chars between):
            // that line splits it into separate cells.
            let split_inside = (i + 1..bottom)
                .any(|m| grid[m][j] == '+' && grid[m][right] == '+' && rule_between(m));
            if split_inside {
                continue;
            }

            // Interior text, one entry per line: drop the single pad space
            // after the left wall, then any trailing whitespace.
            let body: Vec<String> = (i + 1..bottom)
                .map(|row| {
                    let raw: String = grid[row][j + 1..right].iter().collect();
                    raw.strip_prefix(' ').unwrap_or(&raw).trim_end().to_string()
                })
                .collect();

            // Keep only the span from the first to the last non-empty line.
            let content = match body.iter().position(|text| !text.is_empty()) {
                Some(first) => {
                    let last = body
                        .iter()
                        .rposition(|text| !text.is_empty())
                        .unwrap_or(first);
                    body[first..=last].join("\n")
                }
                None => String::new(),
            };
            return Some((er, ec, content));
        }
    }
    None
}