table_extractor/
detector.rs

1use crate::Format;
2use regex::Regex;
3use std::sync::OnceLock;
4
/// Upper bound on how many leading lines of the input are inspected when
/// guessing the table format.
const FORMAT_DETECTION_LINE_LIMIT: usize = 10;
7
8// Compile regexes once at startup for performance
9// These are used for format auto-detection
10static MYSQL_BORDER: OnceLock<Regex> = OnceLock::new();
11static POSTGRES_SEP: OnceLock<Regex> = OnceLock::new();
12static MARKDOWN_SEP: OnceLock<Regex> = OnceLock::new();
13
14fn get_mysql_border() -> &'static Regex {
15    MYSQL_BORDER.get_or_init(|| Regex::new(r"^\+[-+]+\+$").expect("Invalid MySQL border regex"))
16}
17
18fn get_postgres_sep() -> &'static Regex {
19    POSTGRES_SEP.get_or_init(|| {
20        Regex::new(r"^\s*-+(\+-+)+\s*$").expect("Invalid PostgreSQL separator regex")
21    })
22}
23
24fn get_markdown_sep() -> &'static Regex {
25    MARKDOWN_SEP.get_or_init(|| {
26        Regex::new(r"^\s*\|(?:\s*:?\s*-+\s*:?\s*\|)+").expect("Invalid Markdown separator regex")
27    })
28}
29
30/// Detects the table format from input text
31pub fn detect_format(input: &str) -> Format {
32    let lines: Vec<&str> = input.lines().take(FORMAT_DETECTION_LINE_LIMIT).collect();
33
34    if lines.is_empty() {
35        return Format::CSV; // Default
36    }
37
38    // Check for MySQL format: +---+ or +----+ borders
39    if is_mysql_format(&lines) {
40        return Format::MySQL;
41    }
42
43    // Check for PostgreSQL format: dashes and pipes as separator
44    if is_postgres_format(&lines) {
45        return Format::PostgreSQL;
46    }
47
48    // Check for Markdown format: |---|---| pattern
49    if is_markdown_format(&lines) {
50        return Format::Markdown;
51    }
52
53    // Check for TSV: contains tabs
54    if is_tsv_format(&lines) {
55        return Format::TSV;
56    }
57
58    // Default to CSV
59    Format::CSV
60}
61
62fn is_mysql_format(lines: &[&str]) -> bool {
63    // MySQL tables have border lines like +----+----+
64    lines
65        .iter()
66        .any(|line| get_mysql_border().is_match(line.trim()))
67}
68
69fn is_postgres_format(lines: &[&str]) -> bool {
70    // PostgreSQL has a separator line like ----+----+----
71    // Usually on the second line
72    if lines.len() < 2 {
73        return false;
74    }
75
76    lines.iter().any(|line| get_postgres_sep().is_match(line))
77}
78
79fn is_markdown_format(lines: &[&str]) -> bool {
80    // Markdown tables have separator lines like |---|---|
81    lines.iter().any(|line| get_markdown_sep().is_match(line))
82}
83
/// Reports whether `lines` look like tab-separated values.
///
/// At least one line must contain a tab. The input is then rejected if it looks
/// like a structured table instead:
///
/// * any line that, once trimmed, both starts and ends with `|`
///   (Markdown/MySQL-style rows), or
/// * any line that, once trimmed, is a border/separator made up solely of `-`
///   and `+` with at least one `+` (e.g. `----+----` or `+----+----+`).
///
/// A `+` that merely appears inside cell data (`C++`, `+1 555 0100`, `a+b`)
/// does not disqualify the input; only a dedicated separator line does.
fn is_tsv_format(lines: &[&str]) -> bool {
    // TSV must contain at least one tab.
    if !lines.iter().any(|line| line.contains('\t')) {
        return false;
    }

    // Rows wrapped in pipes (|col|col|) indicate a pipe-delimited table.
    let looks_like_markdown = lines.iter().any(|line| {
        let line = line.trim();
        line.starts_with('|') && line.ends_with('|')
    });

    // Only a line consisting entirely of dashes and plus signs counts as a
    // table separator. Checking for a bare `+` anywhere would misclassify TSV
    // data that happens to contain a plus sign.
    let has_separator_line = lines.iter().any(|line| {
        let line = line.trim();
        line.contains('+') && line.chars().all(|c| c == '-' || c == '+')
    });

    !looks_like_markdown && !has_separator_line
}
103
#[cfg(test)]
mod tests {
    //! Unit tests for format detection and for the separator regexes.

    use super::*;
    use std::time::Instant;

    #[test]
    fn test_detect_mysql() {
        let input = r#"+----+-------+
| id | name  |
+----+-------+
|  1 | Alice |
+----+-------+"#;
        assert_eq!(detect_format(input), Format::MySQL);
    }

    #[test]
    fn test_detect_postgres() {
        let input = r#" id | name
----+-------
  1 | Alice
  2 | Bob"#;
        assert_eq!(detect_format(input), Format::PostgreSQL);
    }

    #[test]
    fn test_detect_markdown() {
        let input = r#"| id | name  |
|----|-------|
| 1  | Alice |
| 2  | Bob   |"#;
        assert_eq!(detect_format(input), Format::Markdown);
    }

    #[test]
    fn test_detect_tsv() {
        let input = "id\tname\n1\tAlice\n2\tBob";
        assert_eq!(detect_format(input), Format::TSV);
    }

    #[test]
    fn test_detect_csv() {
        let input = "id,name\n1,Alice\n2,Bob";
        assert_eq!(detect_format(input), Format::CSV);
    }

    #[test]
    fn test_detect_tsv_with_pipes_in_data() {
        // TSV should be detected even if data contains pipe characters
        let input = "id\tname\tdesc\n1\tAlice\tUses | pipes\n2\tBob\tNormal text";
        assert_eq!(detect_format(input), Format::TSV);
    }

    // ReDoS regression tests - ensure patterns reject the attack vectors and
    // still accept valid separators.
    #[test]
    fn test_postgres_sep_redos_protection() {
        // Previously vulnerable pattern: r"^[\s\-]+\+[\s\-\+]+$"
        // Attack vector: many spaces/dashes followed by non-matching char
        let attack_string = format!("{} X", " -".repeat(100));

        // This should complete quickly (not hang)
        let result = get_postgres_sep().is_match(&attack_string);
        assert!(
            !result,
            "Attack string should not match valid PostgreSQL separator"
        );

        // Valid PostgreSQL separators should still match
        assert!(get_postgres_sep().is_match("----+----+----"));
        assert!(get_postgres_sep().is_match("  ----+-------  "));
        assert!(get_postgres_sep().is_match("-+-"));
    }

    #[test]
    fn test_markdown_sep_redos_protection() {
        // Previously vulnerable pattern: r"^\s*\|[\s:-]*-[\s:-]*\|"
        // Attack vector: pipe followed by many spaces/colons/dashes without final pipe
        let attack_string = format!("|{} X", " :-".repeat(100));

        // This should complete quickly (not hang)
        let result = get_markdown_sep().is_match(&attack_string);
        assert!(
            !result,
            "Attack string should not match valid Markdown separator"
        );

        // Valid Markdown separators should still match
        assert!(get_markdown_sep().is_match("|---|---|"));
        assert!(get_markdown_sep().is_match("|:---|:---:|"));
        assert!(get_markdown_sep().is_match("| --- | --- |"));
        assert!(get_markdown_sep().is_match("|:-|:-:|"));
    }

    #[test]
    fn test_mysql_border_edge_cases() {
        // Ensure MySQL border regex is robust
        assert!(get_mysql_border().is_match("+--+"));
        assert!(get_mysql_border().is_match("+----+----+"));
        assert!(get_mysql_border().is_match("+-+"));

        // Should not match invalid patterns
        assert!(!get_mysql_border().is_match("+ - +"));
        assert!(!get_mysql_border().is_match("++"));
        assert!(!get_mysql_border().is_match("----"));
    }

    #[test]
    fn test_postgres_format_detection_with_various_separators() {
        // Test detection with different valid PostgreSQL separator styles
        let input1 = " id | name\n----+-------\n  1 | Alice";
        assert_eq!(detect_format(input1), Format::PostgreSQL);

        let input2 = " id | name\n--+--\n  1 | Alice";
        assert_eq!(detect_format(input2), Format::PostgreSQL);

        let input3 = " id | name | age\n----+-------+-----\n  1 | Alice | 30";
        assert_eq!(detect_format(input3), Format::PostgreSQL);
    }

    #[test]
    fn test_markdown_format_detection_with_alignment() {
        // Test detection with different Markdown alignment styles
        let input1 = "| id | name |\n|---|---|\n| 1 | Alice |";
        assert_eq!(detect_format(input1), Format::Markdown);

        let input2 = "| id | name |\n|:---|:---:|\n| 1 | Alice |";
        assert_eq!(detect_format(input2), Format::Markdown);

        let input3 = "| id | name |\n| :--- | ---: |\n| 1 | Alice |";
        assert_eq!(detect_format(input3), Format::Markdown);
    }

    #[test]
    fn test_no_catastrophic_backtracking_large_input() {
        // Create a large string that would cause catastrophic backtracking
        // with the old vulnerable patterns
        let large_attack = format!("{}X", " -".repeat(1000));

        // Fetch the regex before starting the clock. The accessor compiles the
        // pattern lazily on first use, and that one-off compilation must not be
        // counted as matching time when this happens to be the first test to
        // touch it.
        let separator = get_postgres_sep();

        // This should complete quickly
        let start = Instant::now();
        let _ = separator.is_match(&large_attack);
        let duration = start.elapsed();

        // Should complete in milliseconds, not seconds
        assert!(
            duration.as_millis() < 100,
            "Regex matching took too long: {:?} - possible ReDoS vulnerability",
            duration
        );
    }
}