table_extractor/
detector.rs

1use crate::Format;
2use regex::Regex;
3use std::sync::OnceLock;
4
5// Compile regexes once at startup for performance
6// These are used for format auto-detection
7static MYSQL_BORDER: OnceLock<Regex> = OnceLock::new();
8static POSTGRES_SEP: OnceLock<Regex> = OnceLock::new();
9static MARKDOWN_SEP: OnceLock<Regex> = OnceLock::new();
10
11fn get_mysql_border() -> &'static Regex {
12    MYSQL_BORDER.get_or_init(|| Regex::new(r"^\+[-+]+\+$").expect("Invalid MySQL border regex"))
13}
14
15fn get_postgres_sep() -> &'static Regex {
16    POSTGRES_SEP.get_or_init(|| {
17        Regex::new(r"^\s*-+(\+-+)+\s*$").expect("Invalid PostgreSQL separator regex")
18    })
19}
20
21fn get_markdown_sep() -> &'static Regex {
22    MARKDOWN_SEP.get_or_init(|| {
23        Regex::new(r"^\s*\|(?:\s*:?\s*-+\s*:?\s*\|)+").expect("Invalid Markdown separator regex")
24    })
25}
26
27/// Detects the table format from input text
28pub fn detect_format(input: &str) -> Format {
29    let lines: Vec<&str> = input.lines().take(10).collect();
30
31    if lines.is_empty() {
32        return Format::CSV; // Default
33    }
34
35    // Check for MySQL format: +---+ or +----+ borders
36    if is_mysql_format(&lines) {
37        return Format::MySQL;
38    }
39
40    // Check for PostgreSQL format: dashes and pipes as separator
41    if is_postgres_format(&lines) {
42        return Format::PostgreSQL;
43    }
44
45    // Check for Markdown format: |---|---| pattern
46    if is_markdown_format(&lines) {
47        return Format::Markdown;
48    }
49
50    // Check for TSV: contains tabs
51    if is_tsv_format(&lines) {
52        return Format::TSV;
53    }
54
55    // Default to CSV
56    Format::CSV
57}
58
59fn is_mysql_format(lines: &[&str]) -> bool {
60    // MySQL tables have border lines like +----+----+
61    lines
62        .iter()
63        .any(|line| get_mysql_border().is_match(line.trim()))
64}
65
66fn is_postgres_format(lines: &[&str]) -> bool {
67    // PostgreSQL has a separator line like ----+----+----
68    // Usually on the second line
69    if lines.len() < 2 {
70        return false;
71    }
72
73    lines.iter().any(|line| get_postgres_sep().is_match(line))
74}
75
76fn is_markdown_format(lines: &[&str]) -> bool {
77    // Markdown tables have separator lines like |---|---|
78    lines.iter().any(|line| get_markdown_sep().is_match(line))
79}
80
/// Returns `true` when the sampled lines look like tab-separated values.
///
/// The sample qualifies when at least one line contains a tab and nothing in
/// it looks like a drawn table:
///
/// * no line is wrapped in pipes (`| a | b |`), which is how Markdown and
///   MySQL rows are written;
/// * no line is a ruler made only of `-` and `+` (for example `----+----` or
///   `+----+`), which is how PostgreSQL and MySQL separate header and body.
///
/// A `+` or `|` inside ordinary cell data (`C++`, `+1-555-0100`,
/// `Uses | pipes`) does not disqualify the input.
fn is_tsv_format(lines: &[&str]) -> bool {
    // Without a single tab there is nothing to split on.
    if !lines.iter().any(|line| line.contains('\t')) {
        return false;
    }

    let has_pipe_wrapped_row = lines.iter().any(|line| {
        let trimmed = line.trim();
        trimmed.starts_with('|') && trimmed.ends_with('|')
    });

    // Only a whole line of dashes and pluses counts as a ruler; checking for
    // a bare '+' anywhere would reject legitimate TSV data containing one.
    let has_ruler_line = lines.iter().any(|line| {
        let trimmed = line.trim();
        trimmed.contains('+') && trimmed.chars().all(|c| matches!(c, '+' | '-'))
    });

    !has_pipe_wrapped_row && !has_ruler_line
}
100
#[cfg(test)]
mod tests {
    //! Unit tests for format detection and for the individual separator
    //! regexes, including regression tests for patterns that were previously
    //! reported as ReDoS-prone.

    use super::*;

    /// A bordered MySQL table is recognised from its `+----+` lines.
    #[test]
    fn test_detect_mysql() {
        let input = r#"+----+-------+
| id | name  |
+----+-------+
|  1 | Alice |
+----+-------+"#;
        assert_eq!(detect_format(input), Format::MySQL);
    }

    /// A psql-style table is recognised from its `----+----` separator.
    #[test]
    fn test_detect_postgres() {
        let input = r#" id | name
----+-------
  1 | Alice
  2 | Bob"#;
        assert_eq!(detect_format(input), Format::PostgreSQL);
    }

    /// A Markdown table is recognised from its `|----|----|` separator row.
    #[test]
    fn test_detect_markdown() {
        let input = r#"| id | name  |
|----|-------|
| 1  | Alice |
| 2  | Bob   |"#;
        assert_eq!(detect_format(input), Format::Markdown);
    }

    /// Plain tab-separated input is reported as TSV.
    #[test]
    fn test_detect_tsv() {
        let input = "id\tname\n1\tAlice\n2\tBob";
        assert_eq!(detect_format(input), Format::TSV);
    }

    /// Input matching none of the other formats falls back to CSV.
    #[test]
    fn test_detect_csv() {
        let input = "id,name\n1,Alice\n2,Bob";
        assert_eq!(detect_format(input), Format::CSV);
    }

    #[test]
    fn test_detect_tsv_with_pipes_in_data() {
        // TSV should be detected even if data contains pipe characters
        let input = "id\tname\tdesc\n1\tAlice\tUses | pipes\n2\tBob\tNormal text";
        assert_eq!(detect_format(input), Format::TSV);
    }

    // ReDoS regression tests - ensure patterns complete quickly even with attack vectors
    #[test]
    fn test_postgres_sep_redos_protection() {
        // Previously vulnerable pattern: r"^[\s\-]+\+[\s\-\+]+$"
        // Attack vector: many spaces/dashes followed by non-matching char
        let attack_string = format!("{} X", " -".repeat(100));

        // This should complete quickly (not hang)
        let result = get_postgres_sep().is_match(&attack_string);
        assert!(
            !result,
            "Attack string should not match valid PostgreSQL separator"
        );

        // Valid PostgreSQL separators should still match
        assert!(get_postgres_sep().is_match("----+----+----"));
        assert!(get_postgres_sep().is_match("  ----+-------  "));
        assert!(get_postgres_sep().is_match("-+-"));
    }

    #[test]
    fn test_markdown_sep_redos_protection() {
        // Previously vulnerable pattern: r"^\s*\|[\s:-]*-[\s:-]*\|"
        // Attack vector: pipe followed by many spaces/colons/dashes without final pipe
        let attack_string = format!("|{} X", " :-".repeat(100));

        // This should complete quickly (not hang)
        let result = get_markdown_sep().is_match(&attack_string);
        assert!(
            !result,
            "Attack string should not match valid Markdown separator"
        );

        // Valid Markdown separators should still match
        assert!(get_markdown_sep().is_match("|---|---|"));
        assert!(get_markdown_sep().is_match("|:---|:---:|"));
        assert!(get_markdown_sep().is_match("| --- | --- |"));
        assert!(get_markdown_sep().is_match("|:-|:-:|"));
    }

    #[test]
    fn test_mysql_border_edge_cases() {
        // Ensure MySQL border regex is robust
        assert!(get_mysql_border().is_match("+--+"));
        assert!(get_mysql_border().is_match("+----+----+"));
        assert!(get_mysql_border().is_match("+-+"));

        // Should not match invalid patterns
        assert!(!get_mysql_border().is_match("+ - +"));
        assert!(!get_mysql_border().is_match("++"));
        assert!(!get_mysql_border().is_match("----"));
    }

    #[test]
    fn test_postgres_format_detection_with_various_separators() {
        // Test detection with different valid PostgreSQL separator styles
        let input1 = " id | name\n----+-------\n  1 | Alice";
        assert_eq!(detect_format(input1), Format::PostgreSQL);

        let input2 = " id | name\n--+--\n  1 | Alice";
        assert_eq!(detect_format(input2), Format::PostgreSQL);

        let input3 = " id | name | age\n----+-------+-----\n  1 | Alice | 30";
        assert_eq!(detect_format(input3), Format::PostgreSQL);
    }

    #[test]
    fn test_markdown_format_detection_with_alignment() {
        // Test detection with different Markdown alignment styles
        let input1 = "| id | name |\n|---|---|\n| 1 | Alice |";
        assert_eq!(detect_format(input1), Format::Markdown);

        let input2 = "| id | name |\n|:---|:---:|\n| 1 | Alice |";
        assert_eq!(detect_format(input2), Format::Markdown);

        let input3 = "| id | name |\n| :--- | ---: |\n| 1 | Alice |";
        assert_eq!(detect_format(input3), Format::Markdown);
    }

    #[test]
    fn test_no_catastrophic_backtracking_large_input() {
        use std::time::Instant;

        // Create a large string that would cause catastrophic backtracking
        // with the old vulnerable patterns
        let large_attack = format!("{}X", " -".repeat(1000));

        // Fetch the regex before starting the clock: the first call compiles
        // the pattern, and that one-time cost must not be charged to the
        // match being timed (it would make the result depend on test order).
        let separator = get_postgres_sep();

        // This should complete quickly
        let start = Instant::now();
        let _ = separator.is_match(&large_attack);
        let duration = start.elapsed();

        // Should complete in milliseconds, not seconds
        assert!(
            duration.as_millis() < 100,
            "Regex matching took too long: {:?} - possible ReDoS vulnerability",
            duration
        );
    }
}