table_extractor/
detector.rs1use crate::Format;
2use regex::Regex;
3use std::sync::OnceLock;
4
/// Maximum number of leading input lines that `detect_format` inspects.
const FORMAT_DETECTION_LINE_LIMIT: usize = 10;

// Lazily compiled regexes. Each cell starts empty and is filled on first use
// by the accessor function of the matching name (`get_mysql_border`,
// `get_postgres_sep`, `get_markdown_sep`).

/// Cache for the MySQL border-line regex.
static MYSQL_BORDER: OnceLock<Regex> = OnceLock::new();
/// Cache for the PostgreSQL header-separator regex.
static POSTGRES_SEP: OnceLock<Regex> = OnceLock::new();
/// Cache for the Markdown header-separator regex.
static MARKDOWN_SEP: OnceLock<Regex> = OnceLock::new();
13
14fn get_mysql_border() -> &'static Regex {
15 MYSQL_BORDER.get_or_init(|| Regex::new(r"^\+[-+]+\+$").expect("Invalid MySQL border regex"))
16}
17
18fn get_postgres_sep() -> &'static Regex {
19 POSTGRES_SEP.get_or_init(|| {
20 Regex::new(r"^\s*-+(\+-+)+\s*$").expect("Invalid PostgreSQL separator regex")
21 })
22}
23
24fn get_markdown_sep() -> &'static Regex {
25 MARKDOWN_SEP.get_or_init(|| {
26 Regex::new(r"^\s*\|(?:\s*:?\s*-+\s*:?\s*\|)+").expect("Invalid Markdown separator regex")
27 })
28}
29
30pub fn detect_format(input: &str) -> Format {
32 let lines: Vec<&str> = input.lines().take(FORMAT_DETECTION_LINE_LIMIT).collect();
33
34 if lines.is_empty() {
35 return Format::CSV; }
37
38 if is_mysql_format(&lines) {
40 return Format::MySQL;
41 }
42
43 if is_postgres_format(&lines) {
45 return Format::PostgreSQL;
46 }
47
48 if is_markdown_format(&lines) {
50 return Format::Markdown;
51 }
52
53 if is_tsv_format(&lines) {
55 return Format::TSV;
56 }
57
58 Format::CSV
60}
61
62fn is_mysql_format(lines: &[&str]) -> bool {
63 lines
65 .iter()
66 .any(|line| get_mysql_border().is_match(line.trim()))
67}
68
69fn is_postgres_format(lines: &[&str]) -> bool {
70 if lines.len() < 2 {
73 return false;
74 }
75
76 lines.iter().any(|line| get_postgres_sep().is_match(line))
77}
78
79fn is_markdown_format(lines: &[&str]) -> bool {
80 lines.iter().any(|line| get_markdown_sep().is_match(line))
82}
83
/// Reports whether `lines` look like tab-separated values.
///
/// The input qualifies when at least one line contains a tab and no line
/// looks like a leftover of a drawn table:
///
/// * a row wrapped in pipes (`| a | b |`), or
/// * a border line made up solely of `+` and `-` with at least one `+`
///   (`+----+----`).
///
/// A `+` that merely appears inside a data cell (`+1-555-0100`, `C++`,
/// `+5%`) does not disqualify the input; only border-like lines do.
fn is_tsv_format(lines: &[&str]) -> bool {
    if !lines.iter().any(|line| line.contains('\t')) {
        return false;
    }

    let has_table_artifact = lines.iter().any(|line| {
        let trimmed = line.trim();

        let pipe_wrapped = trimmed.starts_with('|') && trimmed.ends_with('|');
        // Require a '+' so that an all-dash or empty line is not mistaken
        // for a border.
        let border_like =
            trimmed.contains('+') && trimmed.chars().all(|c| matches!(c, '+' | '-'));

        pipe_wrapped || border_like
    });

    !has_table_artifact
}
103
// Unit tests for format detection and for the individual separator regexes.
#[cfg(test)]
mod tests {
    use super::*;

    /// A table framed by `+---+` border lines is detected as MySQL.
    #[test]
    fn test_detect_mysql() {
        let input = r#"+----+-------+
| id | name |
+----+-------+
| 1 | Alice |
+----+-------+"#;
        assert_eq!(detect_format(input), Format::MySQL);
    }

    /// A header followed by a `----+----` separator is detected as PostgreSQL.
    #[test]
    fn test_detect_postgres() {
        let input = r#" id | name
----+-------
 1 | Alice
 2 | Bob"#;
        assert_eq!(detect_format(input), Format::PostgreSQL);
    }

    /// A pipe table with a `|----|` separator row is detected as Markdown.
    #[test]
    fn test_detect_markdown() {
        let input = r#"| id | name |
|----|-------|
| 1 | Alice |
| 2 | Bob |"#;
        assert_eq!(detect_format(input), Format::Markdown);
    }

    /// Tab-delimited rows are detected as TSV.
    #[test]
    fn test_detect_tsv() {
        let input = "id\tname\n1\tAlice\n2\tBob";
        assert_eq!(detect_format(input), Format::TSV);
    }

    /// Comma-delimited rows fall through every check and end up as CSV.
    #[test]
    fn test_detect_csv() {
        let input = "id,name\n1,Alice\n2,Bob";
        assert_eq!(detect_format(input), Format::CSV);
    }

    /// A pipe inside a TSV cell must not flip the result away from TSV.
    #[test]
    fn test_detect_tsv_with_pipes_in_data() {
        let input = "id\tname\tdesc\n1\tAlice\tUses | pipes\n2\tBob\tNormal text";

        assert_eq!(detect_format(input), Format::TSV);
    }

    /// The PostgreSQL separator regex rejects a long near-miss string and
    /// still accepts well-formed separators.
    #[test]
    fn test_postgres_sep_redos_protection() {
        // 100 repetitions of " -" followed by " X": dashes and spaces only
        // until the final character, which makes the line invalid.
        let attack_string = format!("{} X", " -".repeat(100));

        let result = get_postgres_sep().is_match(&attack_string);

        assert!(
            !result,
            "Attack string should not match valid PostgreSQL separator"
        );

        // Well-formed separators, including surrounding whitespace and the
        // shortest possible form.
        assert!(get_postgres_sep().is_match("----+----+----"));

        assert!(get_postgres_sep().is_match(" ----+------- "));
        assert!(get_postgres_sep().is_match("-+-"));
    }

    /// The Markdown separator regex rejects a long near-miss string and still
    /// accepts well-formed separator rows.
    #[test]
    fn test_markdown_sep_redos_protection() {
        // A leading pipe, 100 repetitions of " :-", then " X"; no cell is
        // ever closed by a `|`.
        let attack_string = format!("|{} X", " :-".repeat(100));

        let result = get_markdown_sep().is_match(&attack_string);

        assert!(
            !result,
            "Attack string should not match valid Markdown separator"
        );

        // Plain, aligned, padded and minimal separator rows.
        assert!(get_markdown_sep().is_match("|---|---|"));

        assert!(get_markdown_sep().is_match("|:---|:---:|"));
        assert!(get_markdown_sep().is_match("| --- | --- |"));
        assert!(get_markdown_sep().is_match("|:-|:-:|"));
    }

    /// Boundary cases for the MySQL border regex.
    #[test]
    fn test_mysql_border_edge_cases() {
        // Accepted: `+`, at least one `-`/`+`, then `+`.
        assert!(get_mysql_border().is_match("+--+"));

        assert!(get_mysql_border().is_match("+----+----+"));
        assert!(get_mysql_border().is_match("+-+"));

        // Rejected: embedded spaces, nothing between the outer `+`s, and no
        // `+` at all.
        assert!(!get_mysql_border().is_match("+ - +"));

        assert!(!get_mysql_border().is_match("++"));
        assert!(!get_mysql_border().is_match("----"));
    }

    /// PostgreSQL detection works for separators of different widths and
    /// column counts.
    #[test]
    fn test_postgres_format_detection_with_various_separators() {
        let input1 = " id | name\n----+-------\n 1 | Alice";

        assert_eq!(detect_format(input1), Format::PostgreSQL);

        // Shortest dash runs on both sides of the `+`.
        let input2 = " id | name\n--+--\n 1 | Alice";
        assert_eq!(detect_format(input2), Format::PostgreSQL);

        // Three columns.
        let input3 = " id | name | age\n----+-------+-----\n 1 | Alice | 30";
        assert_eq!(detect_format(input3), Format::PostgreSQL);
    }

    /// Markdown detection works with and without alignment colons and
    /// padding in the separator row.
    #[test]
    fn test_markdown_format_detection_with_alignment() {
        let input1 = "| id | name |\n|---|---|\n| 1 | Alice |";

        assert_eq!(detect_format(input1), Format::Markdown);

        let input2 = "| id | name |\n|:---|:---:|\n| 1 | Alice |";
        assert_eq!(detect_format(input2), Format::Markdown);

        let input3 = "| id | name |\n| :--- | ---: |\n| 1 | Alice |";
        assert_eq!(detect_format(input3), Format::Markdown);
    }

    /// Matching a 1000-repetition near-miss string against the PostgreSQL
    /// separator regex must finish within 100 ms of wall-clock time.
    #[test]
    fn test_no_catastrophic_backtracking_large_input() {
        let large_attack = format!("{}X", " -".repeat(1000));

        use std::time::Instant;

        let start = Instant::now();
        // Only the elapsed time matters here; the match result is discarded.
        let _ = get_postgres_sep().is_match(&large_attack);
        let duration = start.elapsed();

        assert!(
            duration.as_millis() < 100,
            "Regex matching took too long: {:?} - possible ReDoS vulnerability",
            duration
        );
    }
}