use crate::{ColumnStyle, HorizontalAlignment, Table, VerticalAlignment};
/// One piece of input produced by splitting on the cell separator.
struct CellPart {
    /// Text of the piece. The escape-aware splitter stores it with `\<separator>`
    /// already collapsed to the separator; the CSV reader stores the decoded field.
    content: String,
    /// Byte offset where the piece begins. The line splitters report it relative
    /// to the line they were given; the CSV reader adds the caller's base offset.
    start: usize,
}
/// Splits `line` on a single-character `separator`, honouring backslash escapes.
///
/// A backslash immediately followed by the separator yields a literal separator
/// in the current piece (the backslash is dropped). Any other backslash is kept
/// as-is. Each returned part records the byte offset in `line` at which it
/// starts; a trailing part is always emitted, even when it is empty.
fn split_escaped(line: &str, separator: char) -> Vec<CellPart> {
    let mut pieces = Vec::new();
    let mut buffer = String::new();
    let mut piece_start = 0;
    let mut cursor = line.char_indices().peekable();

    while let Some((offset, ch)) = cursor.next() {
        match ch {
            // The backslash check comes first, so a backslash never acts as a
            // separator itself.
            '\\' => {
                let escapes_separator =
                    matches!(cursor.peek(), Some(&(_, next)) if next == separator);
                if escapes_separator {
                    // Consume the escaped separator and keep it as content.
                    cursor.next();
                    buffer.push(separator);
                } else {
                    buffer.push('\\');
                }
            }
            _ if ch == separator => {
                pieces.push(CellPart {
                    content: std::mem::take(&mut buffer),
                    start: piece_start,
                });
                // The next piece begins right after the separator's bytes.
                piece_start = offset + ch.len_utf8();
            }
            _ => buffer.push(ch),
        }
    }

    pieces.push(CellPart {
        content: buffer,
        start: piece_start,
    });
    pieces
}
/// Parses `text` as CSV and returns, per record, each field with its position.
///
/// The reader is configured without a header row and in flexible mode, so
/// records may carry differing numbers of fields. Records the reader fails to
/// parse are skipped. Every field's `start` is the byte offset of its content
/// inside `text` (after an opening quote, if any) plus `base_offset`.
fn parse_csv_table(text: &str, base_offset: usize) -> Vec<Vec<CellPart>> {
    let raw = text.as_bytes();
    let mut reader = csv::ReaderBuilder::new()
        .has_headers(false)
        .flexible(true)
        .from_reader(raw);

    let rows = reader
        .records()
        .filter_map(Result::ok)
        .map(|record| {
            // Start scanning at the record's own byte position; fall back to 0
            // when the position is unavailable or does not fit in `usize`.
            let mut cursor = record
                .position()
                .map_or(0, |p| usize::try_from(p.byte()).unwrap_or(0));

            record
                .iter()
                .map(|field| {
                    let (content_start, next) = find_csv_field_position(raw, cursor, field);
                    cursor = next;
                    CellPart {
                        content: field.to_string(),
                        start: base_offset + content_start,
                    }
                })
                .collect()
        })
        .collect();
    rows
}
/// Locates one CSV field in the raw bytes, starting at `start`.
///
/// Returns `(content_start, next_field_start)`. For a quoted field the content
/// starts just after the opening quote; for an unquoted field it starts at
/// `start`. When `start` is past the end of `text`, both values are `start`.
fn find_csv_field_position(text: &[u8], start: usize, expected_content: &str) -> (usize, usize) {
    let (content_start, field_end) = match text.get(start) {
        None => return (start, start),
        Some(b'"') => (start + 1, find_closing_quote(text, start + 1)),
        Some(_) => (
            start,
            find_unquoted_field_end(text, start, expected_content.len()),
        ),
    };
    (content_start, skip_to_next_field(text, field_end))
}
/// Returns the index of the quote that closes a quoted CSV field.
///
/// Scanning begins at `start` (the first byte after the opening quote). A
/// doubled quote (`""`) is treated as an escaped quote and skipped. If no
/// closing quote exists, the length of `text` is returned.
fn find_closing_quote(text: &[u8], start: usize) -> usize {
    let mut cursor = start;
    loop {
        match (text.get(cursor), text.get(cursor + 1)) {
            (None, _) => return text.len(),
            // Two quotes in a row: an escaped quote inside the field.
            (Some(b'"'), Some(b'"')) => cursor += 2,
            (Some(b'"'), _) => return cursor,
            (Some(_), _) => cursor += 1,
        }
    }
}
/// Returns the index just past an unquoted CSV field that begins at `start`.
///
/// The field ends at the first `,`, `\n` or `\r`, after `content_len` bytes, or
/// at the end of `text`, whichever comes first. When `start` lies beyond the
/// end of `text`, the length of `text` is returned.
fn find_unquoted_field_end(text: &[u8], start: usize, content_len: usize) -> usize {
    let Some(rest) = text.get(start..) else {
        return text.len();
    };
    let consumed = rest
        .iter()
        .take(content_len)
        .take_while(|&&byte| !matches!(byte, b',' | b'\n' | b'\r'))
        .count();
    start + consumed
}
/// Advances from the end of a field to the start of the next one.
///
/// A quote sitting at `pos` (the closing quote of a quoted field) is stepped
/// over first. The result is the index just past the next `,`, `\n`, `\r` or
/// `\r\n`; if none is found, the index where scanning ran out of input.
fn skip_to_next_field(text: &[u8], pos: usize) -> usize {
    let mut cursor = if text.get(pos) == Some(&b'"') {
        pos + 1
    } else {
        pos
    };

    loop {
        match text.get(cursor) {
            None => return cursor,
            Some(b',' | b'\n') => return cursor + 1,
            Some(b'\r') => {
                // Treat CRLF as a single terminator.
                let width = if text.get(cursor + 1) == Some(&b'\n') { 2 } else { 1 };
                return cursor + width;
            }
            Some(_) => cursor += 1,
        }
    }
}
/// Reports whether `separator` selects CSV parsing, i.e. is exactly `","`.
fn is_csv_format(separator: &str) -> bool {
    matches!(separator, ",")
}
/// Splits `line` into parts on `separator`.
///
/// * An empty separator yields the whole line as a single part at offset 0.
/// * A separator made of exactly one character is split with backslash-escape
///   support (see `split_escaped`).
/// * A longer separator is split on each literal occurrence, without escapes.
///
/// The single-character test counts characters rather than bytes, so a
/// non-ASCII separator such as `¦` gets the same escape handling as `|`.
fn split_line(line: &str, separator: &str) -> Vec<CellPart> {
    let mut sep_chars = separator.chars();
    match (sep_chars.next(), sep_chars.next()) {
        (None, _) => vec![CellPart {
            content: line.to_string(),
            start: 0,
        }],
        (Some(sep_char), None) => split_escaped(line, sep_char),
        (Some(_), Some(_)) => split_multi_char(line, separator),
    }
}
/// Splits `line` on every literal occurrence of a multi-character `separator`.
///
/// No escape processing is performed. Each part records the byte offset in
/// `line` where it begins; the text after the last separator is always
/// returned as a final part, even when empty.
fn split_multi_char(line: &str, separator: &str) -> Vec<CellPart> {
    let mut offset = 0;
    line.split(separator)
        .map(|piece| {
            let part = CellPart {
                content: piece.to_string(),
                start: offset,
            };
            // Skip over this piece and the separator that followed it.
            offset += piece.len() + separator.len();
            part
        })
        .collect()
}
/// Where in a row the text handed to [`CellSpecifier::parse`] came from.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum ParseContext {
    /// The text preceding the first separator on a line. In this context an
    /// alignment-only or style-only specifier is accepted.
    FirstPart,
    /// The text of a cell itself. Only specifiers carrying a span or
    /// duplication operator (`+` / `*`) are accepted here.
    InlineContent,
}
/// Formatting instructions parsed from a cell specifier.
#[derive(Debug, Clone, Copy)]
pub(crate) struct CellSpecifier {
    /// Number of columns the cell spans (1 when unspecified).
    pub colspan: usize,
    /// Number of rows the cell spans (1 when unspecified).
    pub rowspan: usize,
    /// Horizontal alignment, if one was given.
    pub halign: Option<HorizontalAlignment>,
    /// Vertical alignment, if one was given.
    pub valign: Option<VerticalAlignment>,
    /// Cell style selected by a one-letter style code, if any.
    pub style: Option<ColumnStyle>,
    /// `true` when the specifier used the `*` (duplication) operator.
    pub is_duplication: bool,
    /// Duplication factor for a `*` specifier; 1 otherwise.
    pub duplication_count: usize,
}
impl Default for CellSpecifier {
    /// A specifier for a plain cell: one column, one row, no alignment, no
    /// style, and no duplication.
    fn default() -> Self {
        // Spans and the duplication factor all default to a single unit.
        const SINGLE: usize = 1;
        Self {
            halign: None,
            valign: None,
            style: None,
            colspan: SINGLE,
            rowspan: SINGLE,
            is_duplication: false,
            duplication_count: SINGLE,
        }
    }
}
/// Maps a one-letter style code to its [`ColumnStyle`].
///
/// Recognised codes are `a`, `d`, `e`, `h`, `l`, `m` and `s`; any other byte
/// yields `None`.
fn parse_style_byte(byte: u8) -> Option<ColumnStyle> {
    let style = match byte {
        b'a' => ColumnStyle::AsciiDoc,
        b'd' => ColumnStyle::Default,
        b'e' => ColumnStyle::Emphasis,
        b'h' => ColumnStyle::Header,
        b'l' => ColumnStyle::Literal,
        b'm' => ColumnStyle::Monospace,
        b's' => ColumnStyle::Strong,
        _ => return None,
    };
    Some(style)
}
impl CellSpecifier {
#[must_use]
pub fn parse(content: &str, mode: ParseContext) -> (Self, usize) {
let bytes = content.as_bytes();
let mut pos = 0;
let (halign, valign, align_end) = Self::parse_alignments(bytes, pos);
pos = align_end;
let (colspan, colspan_end) = Self::parse_number(content, bytes, pos);
pos = colspan_end;
let (rowspan, rowspan_end) = Self::parse_rowspan(content, bytes, pos);
pos = rowspan_end;
Self::build_result(bytes, pos, colspan, rowspan, halign, valign, mode)
}
fn parse_alignments(
bytes: &[u8],
mut pos: usize,
) -> (
Option<HorizontalAlignment>,
Option<VerticalAlignment>,
usize,
) {
let mut halign: Option<HorizontalAlignment> = None;
let mut valign: Option<VerticalAlignment> = None;
loop {
match bytes.get(pos) {
Some(b'<') => {
halign = Some(HorizontalAlignment::Left);
pos += 1;
}
Some(b'^') => {
halign = Some(HorizontalAlignment::Center);
pos += 1;
}
Some(b'>') => {
halign = Some(HorizontalAlignment::Right);
pos += 1;
}
Some(b'.') => {
match bytes.get(pos + 1) {
Some(b'<') => {
valign = Some(VerticalAlignment::Top);
pos += 2;
}
Some(b'^') => {
valign = Some(VerticalAlignment::Middle);
pos += 2;
}
Some(b'>') => {
valign = Some(VerticalAlignment::Bottom);
pos += 2;
}
_ => break, }
}
_ => break,
}
}
(halign, valign, pos)
}
fn parse_number(content: &str, bytes: &[u8], mut pos: usize) -> (Option<usize>, usize) {
let start = pos;
while bytes.get(pos).is_some_and(u8::is_ascii_digit) {
pos += 1;
}
let value = if pos > start {
content
.get(start..pos)
.and_then(|s| s.parse::<usize>().ok())
} else {
None
};
(value, pos)
}
fn parse_rowspan(content: &str, bytes: &[u8], mut pos: usize) -> (Option<usize>, usize) {
if bytes.get(pos) != Some(&b'.') {
return (None, pos);
}
let dot_pos = pos;
pos += 1;
let start = pos;
while bytes.get(pos).is_some_and(u8::is_ascii_digit) {
pos += 1;
}
if pos > start {
let value = content
.get(start..pos)
.and_then(|s| s.parse::<usize>().ok());
(value, pos)
} else {
(None, dot_pos)
}
}
fn build_result(
bytes: &[u8],
mut pos: usize,
colspan: Option<usize>,
rowspan: Option<usize>,
halign: Option<HorizontalAlignment>,
valign: Option<VerticalAlignment>,
context: ParseContext,
) -> (Self, usize) {
let has_span_or_dup = colspan.is_some() || rowspan.is_some();
let is_duplication = bytes.get(pos) == Some(&b'*');
let is_span = bytes.get(pos) == Some(&b'+');
if (is_span || is_duplication) && has_span_or_dup {
pos += 1;
let style = bytes.get(pos).and_then(|&b| parse_style_byte(b));
if style.is_some() {
pos += 1;
}
let spec = if is_duplication {
Self {
colspan: 1,
rowspan: 1,
halign,
valign,
style,
is_duplication: true,
duplication_count: colspan.unwrap_or(1),
}
} else {
Self {
colspan: colspan.unwrap_or(1),
rowspan: rowspan.unwrap_or(1),
halign,
valign,
style,
is_duplication: false,
duplication_count: 1,
}
};
(spec, pos)
} else if (halign.is_some() || valign.is_some()) && context == ParseContext::FirstPart {
let style = bytes.get(pos).and_then(|&b| parse_style_byte(b));
if style.is_some() {
pos += 1;
}
(
Self {
colspan: 1,
rowspan: 1,
halign,
valign,
style,
is_duplication: false,
duplication_count: 1,
},
pos,
)
} else if context == ParseContext::FirstPart {
let style = bytes.get(pos).and_then(|&b| parse_style_byte(b));
if let Some(style) = style {
pos += 1;
(
Self {
colspan: 1,
rowspan: 1,
halign: None,
valign: None,
style: Some(style),
is_duplication: false,
duplication_count: 1,
},
pos,
)
} else {
(Self::default(), 0)
}
} else {
(Self::default(), 0)
}
}
}
/// A table cell extracted from the source, with its position and specifier data.
#[derive(Debug, Clone)]
pub(crate) struct ParsedCell {
    /// Cell text with surrounding whitespace (and any specifier) removed.
    pub content: String,
    /// Byte offset at which the cell content starts.
    pub start: usize,
    /// Byte offset of the last byte of the content; equal to `start` when the
    /// content is empty.
    pub end: usize,
    /// Number of columns the cell spans.
    pub colspan: usize,
    /// Number of rows the cell spans.
    pub rowspan: usize,
    /// Horizontal alignment from the cell specifier, if any.
    pub halign: Option<HorizontalAlignment>,
    /// Vertical alignment from the cell specifier, if any.
    pub valign: Option<VerticalAlignment>,
    /// Style from the cell specifier, if any.
    pub style: Option<ColumnStyle>,
    /// `true` when the specifier used the duplication operator.
    pub is_duplication: bool,
    /// Duplication factor taken from the specifier (1 when not duplicated).
    pub duplication_count: usize,
}
/// Checks whether the first non-blank line at or after `start_idx` contains
/// `separator`.
///
/// Trailing whitespace is ignored when deciding whether a line is blank.
/// Returns `false` when no non-blank line remains.
fn detect_header_after_first_row(lines: &[&str], start_idx: usize, separator: &str) -> bool {
    lines
        .iter()
        .skip(start_idx)
        .map(|line| line.trim_end())
        .find(|line| !line.is_empty())
        .is_some_and(|line| line.contains(separator))
}
/// Decides whether `line` begins a new row by leading with a cell specifier.
///
/// Only applies to the `|` separator. The text before the first separator,
/// once trimmed, must be non-empty and must parse in its entirety as a cell
/// specifier.
fn is_new_row_start(line: &str, separator: &str) -> bool {
    if separator != "|" {
        return false;
    }
    line.split_once(separator)
        .map(|(prefix, _)| prefix.trim())
        .filter(|prefix| !prefix.is_empty())
        .is_some_and(|prefix| {
            let (_, consumed) = CellSpecifier::parse(prefix, ParseContext::FirstPart);
            consumed > 0 && consumed == prefix.len()
        })
}
/// Appends separator-free lines to the last cell of the last row.
///
/// Starting at `lines[*i]`, every line that is neither blank nor contains
/// `separator` is added (after a newline, if the cell already has text) to the
/// most recent cell, and that cell's `end` is moved to the last byte of the
/// appended text. `*i` and `*current_offset` are advanced past each consumed
/// line. Lines are still consumed when there is no cell to attach them to.
fn handle_cross_row_continuation(
    lines: &[&str],
    i: &mut usize,
    current_offset: &mut usize,
    rows: &mut [Vec<ParsedCell>],
    separator: &str,
) {
    loop {
        let Some(&raw_line) = lines.get(*i) else {
            return;
        };
        let text = raw_line.trim_end();
        if text.is_empty() || text.contains(separator) {
            return;
        }

        if let Some(cell) = rows.last_mut().and_then(|row| row.last_mut()) {
            if !cell.content.is_empty() {
                cell.content.push('\n');
            }
            cell.content.push_str(text);
            cell.end = *current_offset + text.len().saturating_sub(1);
        }

        // Advance by the untrimmed length plus the line terminator.
        *current_offset += raw_line.len() + 1;
        *i += 1;
    }
}
impl Table {
    /// Parses the body of a table into rows of cells with source positions.
    ///
    /// * When `separator` is `","` the text is handed to the CSV parser.
    /// * Otherwise lines are grouped into rows: a line containing more than one
    ///   separator is a complete row on its own; any other line starts a row
    ///   that absorbs the following lines until a blank line or a line that
    ///   starts with a cell specifier.
    /// * When `ncols` is known and a multi-line row follows a row that has
    ///   fewer than `ncols` columns, its cells are appended to that row.
    /// * After blank lines, lines without a separator are appended to the last
    ///   cell parsed so far.
    ///
    /// `has_header` is set to `false` when the first line is blank, and to
    /// `true` when the first complete row is followed by a blank line and the
    /// next non-blank line contains the separator. `base_offset` is the byte
    /// offset of `text` within the enclosing document; all cell positions are
    /// reported relative to that document.
    ///
    /// NOTE(review): offsets advance by `len + 1` per line, i.e. they assume a
    /// one-byte line terminator, while `str::lines` also strips `\r\n`.
    /// Confirm whether CRLF input can reach this function.
    pub(crate) fn parse_rows_with_positions(
        text: &str,
        separator: &str,
        has_header: &mut bool,
        base_offset: usize,
        ncols: Option<usize>,
    ) -> Vec<Vec<ParsedCell>> {
        if is_csv_format(separator) {
            return Self::parse_csv_rows_with_positions(text, has_header, base_offset);
        }

        let mut rows: Vec<Vec<ParsedCell>> = Vec::new();
        let mut current_offset = base_offset;
        let lines: Vec<&str> = text.lines().collect();
        let mut i = 0;

        tracing::debug!(
            ?has_header,
            ?ncols,
            total_lines = lines.len(),
            "Starting table parsing"
        );

        while let Some(&line_ref) = lines.get(i) {
            let line = line_ref.trim_end();
            tracing::trace!(i, ?line, is_empty = line.is_empty(), "Processing line");

            // A blank first line rules out an implicit header.
            if i == 0 && line.is_empty() {
                *has_header = false;
                // Advance by the raw length: a whitespace-only line still
                // occupies its bytes in the source.
                current_offset += line_ref.len() + 1;
                i += 1;
                continue;
            }

            let mut row_lines = Vec::new();
            let row_start_offset = current_offset;

            // More than one separator on a line means the whole row is here.
            let is_single_line_row = line.matches(separator).count() > 1;
            if is_single_line_row {
                row_lines.push(line);
                current_offset += line_ref.len() + 1;
                i += 1;
            } else {
                // Gather lines until a blank line or the start of another row.
                while let Some(&current_line) = lines.get(i) {
                    let trimmed = current_line.trim_end();
                    if trimmed.is_empty() {
                        break;
                    }
                    if !row_lines.is_empty() && is_new_row_start(trimmed, separator) {
                        break;
                    }
                    row_lines.push(trimmed);
                    current_offset += current_line.len() + 1;
                    i += 1;
                }
            }

            if !row_lines.is_empty() {
                let columns =
                    Self::parse_row_with_positions(&row_lines, separator, row_start_offset);

                if !is_single_line_row
                    && let Some(expected_cols) = ncols
                    && let Some(last_row) = rows.last_mut()
                {
                    // Cells written one per line: keep filling the previous
                    // row until it reaches the expected column count.
                    let last_row_cols: usize = last_row.iter().map(|c| c.colspan).sum();
                    if last_row_cols < expected_cols {
                        last_row.extend(columns);
                        tracing::trace!(
                            last_row_cols,
                            expected_cols,
                            "Merged cells into incomplete row"
                        );
                    } else {
                        rows.push(columns);
                    }
                } else {
                    rows.push(columns);
                }
            }

            // Implicit header: exactly one complete row so far, followed by a
            // blank line and then more table content.
            let first_row_col_count: usize = rows
                .first()
                .map_or(0, |r| r.iter().map(|c| c.colspan).sum());
            let first_row_complete = ncols.is_none_or(|n| first_row_col_count >= n);
            if rows.len() == 1
                && first_row_complete
                && let Some(&next_line) = lines.get(i)
                && next_line.trim_end().is_empty()
                && detect_header_after_first_row(&lines, i, separator)
            {
                tracing::debug!("Detected table header via blank line after first row");
                *has_header = true;
            }

            // Skip blank lines between rows.
            let mut skipped_blank_line = false;
            while let Some(&empty_line) = lines.get(i) {
                if !empty_line.trim_end().is_empty() {
                    break;
                }
                skipped_blank_line = true;
                current_offset += empty_line.len() + 1;
                i += 1;
            }

            // Text after a blank line that has no separator continues the
            // previous cell.
            if skipped_blank_line {
                handle_cross_row_continuation(
                    &lines,
                    &mut i,
                    &mut current_offset,
                    &mut rows,
                    separator,
                );
            }
        }

        rows
    }

    /// Parses CSV table content into rows of cells with source positions.
    ///
    /// `has_header` is set to `true` when the second line of `text` is blank.
    /// Cell content is trimmed, and `start`/`end` point at the trimmed text
    /// (the leading whitespace that was trimmed off is skipped). Records
    /// without any field are dropped.
    ///
    /// NOTE(review): `end` is derived from the decoded content length, so it
    /// may not match the source span for quoted fields containing `""`.
    fn parse_csv_rows_with_positions(
        text: &str,
        has_header: &mut bool,
        base_offset: usize,
    ) -> Vec<Vec<ParsedCell>> {
        if text
            .lines()
            .nth(1)
            .is_some_and(|line| line.trim().is_empty())
        {
            *has_header = true;
        }

        let mut rows = Vec::new();
        for csv_row in parse_csv_table(text, base_offset) {
            let mut cells = Vec::new();
            for part in csv_row {
                let content = part.content.trim();
                // `part.start` points at the untrimmed field; skip the
                // whitespace that `trim` removed from the front.
                let leading_ws = part.content.len() - part.content.trim_start().len();
                let start = part.start + leading_ws;
                let end = if content.is_empty() {
                    start
                } else {
                    start + content.len().saturating_sub(1)
                };
                cells.push(ParsedCell {
                    content: content.to_string(),
                    start,
                    end,
                    colspan: 1,
                    rowspan: 1,
                    halign: None,
                    valign: None,
                    style: None,
                    is_duplication: false,
                    duplication_count: 1,
                });
            }
            if !cells.is_empty() {
                rows.push(cells);
            }
        }
        rows
    }

    /// Parses the lines belonging to one row into cells.
    ///
    /// A line without `separator` is appended to the previous cell. Otherwise
    /// the line is split on the separator; for the `|` and `!` separators the
    /// text before the first separator is not a cell but an optional
    /// specifier for the cell that follows. Each remaining part becomes a
    /// cell, optionally led by an inline span/duplication specifier.
    ///
    /// `row_start_offset` is the document offset of the first line.
    ///
    /// NOTE(review): per-line offsets advance by the length of the lines as
    /// passed in; the caller passes end-trimmed lines, so trailing whitespace
    /// on an earlier line of a multi-line row is not accounted for. Likewise
    /// cell lengths use the unescaped content. Confirm whether this matters.
    fn parse_row_with_positions(
        row_lines: &[&str],
        separator: &str,
        row_start_offset: usize,
    ) -> Vec<ParsedCell> {
        let mut columns: Vec<ParsedCell> = Vec::new();
        let mut current_offset = row_start_offset;

        for &line in row_lines {
            // Continuation line: belongs to the cell parsed last.
            if !line.contains(separator) {
                if let Some(last_cell) = columns.last_mut() {
                    if !last_cell.content.is_empty() {
                        last_cell.content.push('\n');
                    }
                    last_cell.content.push_str(line);
                    last_cell.end = current_offset + line.len().saturating_sub(1);
                }
                current_offset += line.len() + 1;
                continue;
            }

            let parts = split_line(line, separator);
            let mut pending_spec: Option<CellSpecifier> = None;

            for (i, part) in parts.iter().enumerate() {
                // Text before the first `|`/`!` is a specifier, never a cell.
                if i == 0 && matches!(separator, "|" | "!") {
                    let trimmed = part.content.trim();
                    if !trimmed.is_empty() {
                        let (spec, spec_len) =
                            CellSpecifier::parse(trimmed, ParseContext::FirstPart);
                        // Only accept it when the whole prefix is a specifier.
                        if spec_len > 0 && spec_len == trimmed.len() {
                            pending_spec = Some(spec);
                        }
                    }
                    continue;
                }

                let cell_content_trimmed = part.content.trim();
                let (spec, spec_offset) = if let Some(pending) = pending_spec.take() {
                    (pending, 0)
                } else {
                    CellSpecifier::parse(cell_content_trimmed, ParseContext::InlineContent)
                };

                // Text left once an inline specifier has been removed.
                let after_spec = if spec_offset > 0 {
                    cell_content_trimmed.get(spec_offset..).unwrap_or("")
                } else {
                    cell_content_trimmed
                };
                let cell_content = if spec_offset > 0 {
                    after_spec.trim_start()
                } else {
                    after_spec
                };

                let leading_ws = part.content.len() - part.content.trim_start().len();
                let post_spec_ws = if spec_offset > 0 {
                    after_spec.len() - after_spec.trim_start().len()
                } else {
                    0
                };
                let content_start_offset = leading_ws + spec_offset + post_spec_ws;
                let cell_start = current_offset + part.start + content_start_offset;
                let cell_end = if cell_content.is_empty() {
                    cell_start
                } else {
                    cell_start + cell_content.len().saturating_sub(1)
                };

                columns.push(ParsedCell {
                    content: cell_content.to_string(),
                    start: cell_start,
                    end: cell_end,
                    colspan: spec.colspan,
                    rowspan: spec.rowspan,
                    halign: spec.halign,
                    valign: spec.valign,
                    style: spec.style,
                    is_duplication: spec.is_duplication,
                    duplication_count: spec.duplication_count,
                });
            }

            current_offset += line.len() + 1;
        }

        columns
    }
}
#[cfg(test)]
#[allow(clippy::panic, clippy::indexing_slicing)]
mod tests {
    use super::*;

    /// Splits `line` on `separator` and returns only the text of each part.
    fn contents(line: &str, separator: char) -> Vec<String> {
        split_escaped(line, separator)
            .into_iter()
            .map(|part| part.content)
            .collect()
    }

    #[test]
    fn split_escaped_psv_no_escapes() {
        assert_eq!(
            contents("| cell1 | cell2 |", '|'),
            ["", " cell1 ", " cell2 ", ""]
        );
    }

    #[test]
    fn split_escaped_psv_with_escape() {
        assert_eq!(
            contents(r"| cell with \| pipe | normal |", '|'),
            ["", " cell with | pipe ", " normal ", ""]
        );
    }

    #[test]
    fn split_escaped_dsv_no_escapes() {
        assert_eq!(
            contents("cell1:cell2:cell3", ':'),
            ["cell1", "cell2", "cell3"]
        );
    }

    #[test]
    fn split_escaped_dsv_with_escape() {
        assert_eq!(
            contents(r"cell with \: colon:normal", ':'),
            ["cell with : colon", "normal"]
        );
    }

    #[test]
    fn split_escaped_backslash_not_before_separator() {
        // A backslash that does not precede the separator is kept verbatim.
        assert_eq!(
            contents(r"cell\n with backslash|next", '|'),
            [r"cell\n with backslash", "next"]
        );
    }

    #[test]
    fn split_escaped_multiple_escapes() {
        assert_eq!(
            contents(r"\|start\|middle\|end", '|'),
            ["|start|middle|end"]
        );
    }

    #[test]
    fn split_escaped_positions_tracked() {
        // Each part starts one byte after the preceding separator.
        let starts: Vec<usize> = split_escaped("ab|cd|ef", '|')
            .iter()
            .map(|part| part.start)
            .collect();
        assert_eq!(starts, [0, 3, 6]);
    }
}