table_extractor/parser/
markdown.rs

1use crate::error::Result;
2use crate::{Parser, Table};
3
4pub struct MarkdownParser;
5
6impl Parser for MarkdownParser {
7    fn parse(&self, input: &str) -> Result<Table> {
8        let lines: Vec<&str> = input.lines().collect();
9
10        if lines.is_empty() {
11            return Ok(Table::new(vec![], vec![]));
12        }
13
14        let mut headers = Vec::new();
15        let mut rows = Vec::new();
16        let mut found_separator = false;
17
18        for line in lines {
19            let trimmed = line.trim();
20
21            // Skip empty lines
22            if trimmed.is_empty() {
23                continue;
24            }
25
26            // Check if this is a separator line (contains only |, -, :, and whitespace)
27            if is_separator_line(trimmed) {
28                found_separator = true;
29                continue;
30            }
31
32            // Parse the row
33            let cells = parse_markdown_row(trimmed);
34
35            if !found_separator && headers.is_empty() {
36                // First row is the header
37                headers = cells;
38            } else if found_separator {
39                // Data rows come after the separator
40                rows.push(cells);
41            }
42        }
43
44        Table::new_validated(headers, rows)
45    }
46}
47
/// Returns `true` when `line` looks like the delimiter row of a Markdown table.
///
/// A delimiter row consists solely of `|`, `-`, `:` and whitespace, and contains at least one
/// `-` and at least one `|` (for example `|---|:---:|`).
fn is_separator_line(line: &str) -> bool {
    // Any whitespace is accepted, not just the space character: a tab-aligned delimiter row
    // would otherwise be rejected and the table body would never be recognised.
    line.contains('-')
        && line.contains('|')
        && line
            .chars()
            .all(|c| matches!(c, '|' | '-' | ':') || c.is_whitespace())
}
54
/// Splits one Markdown table row into its cells, with surrounding whitespace removed from each.
///
/// At most one leading and one trailing `|` are treated as the row's outer borders. Every
/// remaining `|` separates two cells, so empty cells at either edge (as in `|a||`) are kept
/// and the row retains its column count.
fn parse_markdown_row(line: &str) -> Vec<String> {
    let line = line.trim();

    // Strip a single border pipe on each side. `trim_start_matches`/`trim_end_matches` would
    // remove every consecutive pipe and silently drop empty edge cells.
    let inner = line.strip_prefix('|').unwrap_or(line);
    let inner = inner.strip_suffix('|').unwrap_or(inner);

    inner.split('|').map(|cell| cell.trim().to_string()).collect()
}
71
#[cfg(test)]
mod tests {
    use super::*;

    /// A well-formed table yields its header row and every data row, with cell padding removed.
    #[test]
    fn test_parse_markdown() {
        let markdown = r#"| API Metric Name | MongoDB Slice | Position |
|-----------------|---------------|----------|
| sessions        | ACQUISITION   | Index 0  |
| newUsers        | ACQUISITION   | Index 1  |"#;
        let expected_rows = [
            vec!["sessions", "ACQUISITION", "Index 0"],
            vec!["newUsers", "ACQUISITION", "Index 1"],
        ];

        let table = MarkdownParser.parse(markdown).unwrap();

        assert_eq!(
            table.headers(),
            &["API Metric Name", "MongoDB Slice", "Position"]
        );
        assert_eq!(table.rows().len(), expected_rows.len());
        for (idx, expected) in expected_rows.iter().enumerate() {
            assert_eq!(table.rows()[idx], *expected);
        }
    }
}