table_extractor/parser/
markdown.rs

1use crate::error::Result;
2use crate::{Parser, Table};
3
/// Parser for pipe-delimited Markdown tables.
///
/// The type carries no state; all behaviour lives in its `Parser`
/// implementation, so values are free to copy and to default-construct.
#[derive(Debug, Clone, Copy, Default)]
pub struct MarkdownParser;
5
6impl Parser for MarkdownParser {
7    fn parse(&self, input: &str) -> Result<Table> {
8        let lines: Vec<&str> = input.lines().collect();
9
10        if lines.is_empty() {
11            return Ok(Table::new(vec![], vec![]));
12        }
13
14        let mut headers = Vec::new();
15        let mut rows = Vec::new();
16        let mut found_separator = false;
17
18        for line in lines {
19            let trimmed = line.trim();
20
21            // Skip empty lines
22            if trimmed.is_empty() {
23                continue;
24            }
25
26            // Check if this is a separator line (contains only |, -, :, and whitespace)
27            if is_separator_line(trimmed) {
28                found_separator = true;
29                continue;
30            }
31
32            // Parse the row
33            let cells = parse_markdown_row(trimmed);
34
35            if !found_separator && headers.is_empty() {
36                // First row is the header
37                headers = cells;
38            } else if found_separator {
39                // Data rows come after the separator
40                rows.push(cells);
41            }
42        }
43
44        Table::new_validated(headers, rows)
45    }
46}
47
/// Returns `true` when `line` is a Markdown table delimiter row.
///
/// A delimiter row must contain at least one `|`, and after removing one
/// optional border pipe on each side, every `|`-separated cell must consist
/// of one or more `-` with an optional leading and/or trailing `:` (the
/// alignment markers). Whitespace around each cell, including tabs, is
/// ignored.
///
/// Validating cell by cell (rather than checking the character set of the
/// whole line) keeps rows such as `| - | |` or `| :-:- | --- |` from being
/// mistaken for a delimiter row.
fn is_separator_line(line: &str) -> bool {
    /// A single delimiter cell: `-+` with optional `:` on either end.
    fn is_delimiter_cell(cell: &str) -> bool {
        let dashes = cell.strip_prefix(':').unwrap_or(cell);
        let dashes = dashes.strip_suffix(':').unwrap_or(dashes);
        !dashes.is_empty() && dashes.bytes().all(|b| b == b'-')
    }

    let line = line.trim();

    // A bare run of dashes is not a table delimiter row; a pipe must be present.
    if !line.contains('|') {
        return false;
    }

    // Only one border pipe per side is decoration; anything further is a cell boundary.
    let inner = line.strip_prefix('|').unwrap_or(line);
    let inner = inner.strip_suffix('|').unwrap_or(inner);

    inner.split('|').all(|cell| is_delimiter_cell(cell.trim()))
}
54
/// Splits one Markdown table row into its trimmed cell strings.
///
/// Surrounding whitespace is ignored and at most one border pipe is removed
/// from each end, so empty cells adjacent to the border (`|| a |`, `| a ||`)
/// are preserved instead of being swallowed. A pipe escaped as `\|` is kept
/// as a literal `|` inside the cell rather than treated as a cell boundary;
/// any other backslash sequence is passed through unchanged.
///
/// The result always contains at least one cell (an empty line yields a
/// single empty string).
fn parse_markdown_row(line: &str) -> Vec<String> {
    let line = line.trim();
    // Strip only ONE leading border pipe; further pipes delimit empty cells.
    let body = line.strip_prefix('|').unwrap_or(line);

    let mut cells = Vec::new();
    let mut cell = String::new();
    // True when the most recent character was an unescaped `|`, i.e. the row
    // ended on its trailing border and there is no further cell to emit.
    let mut closed_by_pipe = false;
    let mut chars = body.chars();

    while let Some(ch) = chars.next() {
        closed_by_pipe = false;
        match ch {
            // Consume the escaped character together with the backslash so
            // that `\\|` is an escaped backslash followed by a real delimiter.
            '\\' => match chars.next() {
                Some('|') => cell.push('|'),
                Some(other) => {
                    cell.push('\\');
                    cell.push(other);
                }
                None => cell.push('\\'),
            },
            '|' => {
                cells.push(cell.trim().to_string());
                cell.clear();
                closed_by_pipe = true;
            }
            other => cell.push(other),
        }
    }

    if !closed_by_pipe {
        cells.push(cell.trim().to_string());
    }

    cells
}
65
#[cfg(test)]
mod tests {
    use super::*;

    /// A three-column table with a header, a separator and two data rows
    /// round-trips into the expected headers and row contents.
    #[test]
    fn test_parse_markdown() {
        let markdown = r#"| API Metric Name | MongoDB Slice | Position |
|-----------------|---------------|----------|
| sessions        | ACQUISITION   | Index 0  |
| newUsers        | ACQUISITION   | Index 1  |"#;

        let parsed = MarkdownParser.parse(markdown).unwrap();

        let expected_headers = vec!["API Metric Name", "MongoDB Slice", "Position"];
        assert_eq!(parsed.headers, expected_headers);

        assert_eq!(parsed.rows.len(), 2);

        let first_row = vec!["sessions", "ACQUISITION", "Index 0"];
        let second_row = vec!["newUsers", "ACQUISITION", "Index 1"];
        assert_eq!(parsed.rows[0], first_row);
        assert_eq!(parsed.rows[1], second_row);
    }
}