table_extractor/
detector.rs1use crate::Format;
2use regex::Regex;
3use std::sync::OnceLock;
4
/// Cache for the MySQL border regex; filled on first use by `get_mysql_border`.
5static MYSQL_BORDER: OnceLock<Regex> = OnceLock::new();
/// Cache for the PostgreSQL separator regex; filled on first use by `get_postgres_sep`.
8static POSTGRES_SEP: OnceLock<Regex> = OnceLock::new();
/// Cache for the Markdown separator regex; filled on first use by `get_markdown_sep`.
9static MARKDOWN_SEP: OnceLock<Regex> = OnceLock::new();
10
11fn get_mysql_border() -> &'static Regex {
12 MYSQL_BORDER.get_or_init(|| Regex::new(r"^\+[-+]+\+$").expect("Invalid MySQL border regex"))
13}
14
15fn get_postgres_sep() -> &'static Regex {
16 POSTGRES_SEP.get_or_init(|| {
17 Regex::new(r"^\s*-+(\+-+)+\s*$").expect("Invalid PostgreSQL separator regex")
18 })
19}
20
21fn get_markdown_sep() -> &'static Regex {
22 MARKDOWN_SEP.get_or_init(|| {
23 Regex::new(r"^\s*\|(?:\s*:?\s*-+\s*:?\s*\|)+").expect("Invalid Markdown separator regex")
24 })
25}
26
27pub fn detect_format(input: &str) -> Format {
29 let lines: Vec<&str> = input.lines().take(10).collect();
30
31 if lines.is_empty() {
32 return Format::CSV; }
34
35 if is_mysql_format(&lines) {
37 return Format::MySQL;
38 }
39
40 if is_postgres_format(&lines) {
42 return Format::PostgreSQL;
43 }
44
45 if is_markdown_format(&lines) {
47 return Format::Markdown;
48 }
49
50 if is_tsv_format(&lines) {
52 return Format::TSV;
53 }
54
55 Format::CSV
57}
58
59fn is_mysql_format(lines: &[&str]) -> bool {
60 lines
62 .iter()
63 .any(|line| get_mysql_border().is_match(line.trim()))
64}
65
66fn is_postgres_format(lines: &[&str]) -> bool {
67 if lines.len() < 2 {
70 return false;
71 }
72
73 lines.iter().any(|line| get_postgres_sep().is_match(line))
74}
75
76fn is_markdown_format(lines: &[&str]) -> bool {
77 lines.iter().any(|line| get_markdown_sep().is_match(line))
79}
80
/// Reports whether the sampled lines look like tab-separated values.
///
/// The sample is considered TSV when at least one line contains a tab and
/// no line looks like part of another table format:
///
/// * a pipe-framed row (trimmed line starts and ends with `|`), or
/// * a border/separator row (trimmed line consists solely of `+` and `-`
///   and contains at least one `+`), as drawn by MySQL and PostgreSQL.
///
/// A `+` inside ordinary cell data (timezone offsets such as `+02:00`, phone
/// numbers, `1e+5`, `C++`) does not disqualify the sample; only structural
/// border rows do.
fn is_tsv_format(lines: &[&str]) -> bool {
    // Without a single tab there is nothing tab-separated to detect.
    if !lines.iter().any(|line| line.contains('\t')) {
        return false;
    }

    let looks_like_other_table = lines.iter().any(|line| {
        let trimmed = line.trim();

        let pipe_framed = trimmed.starts_with('|') && trimmed.ends_with('|');

        // Only rows made up entirely of `+` / `-` count as table borders, so
        // that a stray `+` in a data cell is not mistaken for one.
        let border_row =
            trimmed.contains('+') && trimmed.chars().all(|c| matches!(c, '+' | '-'));

        pipe_framed || border_row
    });

    !looks_like_other_table
}
100
// Unit tests for `detect_format` and for the three separator regexes.
101#[cfg(test)]
102mod tests {
103 use super::*;
104
 // A table framed by `+---+` border rows is classified as MySQL.
105 #[test]
106 fn test_detect_mysql() {
107 let input = r#"+----+-------+
108| id | name |
109+----+-------+
110| 1 | Alice |
111+----+-------+"#;
112 assert_eq!(detect_format(input), Format::MySQL);
113 }
114
 // A header followed by a `----+----` separator is classified as PostgreSQL.
115 #[test]
116 fn test_detect_postgres() {
117 let input = r#" id | name
118----+-------
119 1 | Alice
120 2 | Bob"#;
121 assert_eq!(detect_format(input), Format::PostgreSQL);
122 }
123
 // A pipe table with a `|----|` separator row is classified as Markdown.
124 #[test]
125 fn test_detect_markdown() {
126 let input = r#"| id | name |
127|----|-------|
128| 1 | Alice |
129| 2 | Bob |"#;
130 assert_eq!(detect_format(input), Format::Markdown);
131 }
132
 // Tab-delimited rows with no table decoration are classified as TSV.
133 #[test]
134 fn test_detect_tsv() {
135 let input = "id\tname\n1\tAlice\n2\tBob";
136 assert_eq!(detect_format(input), Format::TSV);
137 }
138
 // Comma-delimited rows fall through every check and end up as CSV.
139 #[test]
140 fn test_detect_csv() {
141 let input = "id,name\n1,Alice\n2,Bob";
142 assert_eq!(detect_format(input), Format::CSV);
143 }
144
 // A `|` inside a cell must not make tab-delimited data look like Markdown.
145 #[test]
146 fn test_detect_tsv_with_pipes_in_data() {
147 let input = "id\tname\tdesc\n1\tAlice\tUses | pipes\n2\tBob\tNormal text";
149 assert_eq!(detect_format(input), Format::TSV);
150 }
151
 // The PostgreSQL separator regex rejects a long near-miss input and still
 // accepts well-formed separators.
152 #[test]
154 fn test_postgres_sep_redos_protection() {
 // " -" repeated 100 times followed by " X": the trailing `X` cannot
 // satisfy the end-anchored pattern.
155 let attack_string = format!("{} X", " -".repeat(100));
158
159 let result = get_postgres_sep().is_match(&attack_string);
161 assert!(
162 !result,
163 "Attack string should not match valid PostgreSQL separator"
164 );
165
 // Well-formed separators, with and without surrounding whitespace.
166 assert!(get_postgres_sep().is_match("----+----+----"));
168 assert!(get_postgres_sep().is_match(" ----+------- "));
169 assert!(get_postgres_sep().is_match("-+-"));
170 }
171
 // The Markdown separator regex rejects a long near-miss input and still
 // accepts well-formed separators.
172 #[test]
173 fn test_markdown_sep_redos_protection() {
 // Only the leading `|` is present; no cell is ever closed by a pipe.
174 let attack_string = format!("|{} X", " :-".repeat(100));
177
178 let result = get_markdown_sep().is_match(&attack_string);
180 assert!(
181 !result,
182 "Attack string should not match valid Markdown separator"
183 );
184
 // Plain, aligned and padded separator rows.
185 assert!(get_markdown_sep().is_match("|---|---|"));
187 assert!(get_markdown_sep().is_match("|:---|:---:|"));
188 assert!(get_markdown_sep().is_match("| --- | --- |"));
189 assert!(get_markdown_sep().is_match("|:-|:-:|"));
190 }
191
 // Accept/reject boundaries of the MySQL border regex.
192 #[test]
193 fn test_mysql_border_edge_cases() {
194 assert!(get_mysql_border().is_match("+--+"));
196 assert!(get_mysql_border().is_match("+----+----+"));
197 assert!(get_mysql_border().is_match("+-+"));
198
 // Spaces, a missing interior, or missing `+` ends are all rejected.
199 assert!(!get_mysql_border().is_match("+ - +"));
201 assert!(!get_mysql_border().is_match("++"));
202 assert!(!get_mysql_border().is_match("----"));
203 }
204
 // PostgreSQL detection with separators of differing widths and column counts.
205 #[test]
206 fn test_postgres_format_detection_with_various_separators() {
207 let input1 = " id | name\n----+-------\n 1 | Alice";
209 assert_eq!(detect_format(input1), Format::PostgreSQL);
210
211 let input2 = " id | name\n--+--\n 1 | Alice";
212 assert_eq!(detect_format(input2), Format::PostgreSQL);
213
214 let input3 = " id | name | age\n----+-------+-----\n 1 | Alice | 30";
215 assert_eq!(detect_format(input3), Format::PostgreSQL);
216 }
217
 // Markdown detection with plain, colon-aligned and padded separator rows.
218 #[test]
219 fn test_markdown_format_detection_with_alignment() {
220 let input1 = "| id | name |\n|---|---|\n| 1 | Alice |";
222 assert_eq!(detect_format(input1), Format::Markdown);
223
224 let input2 = "| id | name |\n|:---|:---:|\n| 1 | Alice |";
225 assert_eq!(detect_format(input2), Format::Markdown);
226
227 let input3 = "| id | name |\n| :--- | ---: |\n| 1 | Alice |";
228 assert_eq!(detect_format(input3), Format::Markdown);
229 }
230
 // Matching a 1000-repetition near-miss input must finish in under 100 ms.
 // NOTE(review): this is a wall-clock assertion and may be sensitive to
 // machine load — confirm the threshold is acceptable for CI.
231 #[test]
232 fn test_no_catastrophic_backtracking_large_input() {
233 let large_attack = format!("{}X", " -".repeat(1000));
236
237 use std::time::Instant;
239 let start = Instant::now();
 // Only the elapsed time matters here; the match result is discarded.
240 let _ = get_postgres_sep().is_match(&large_attack);
241 let duration = start.elapsed();
242
243 assert!(
245 duration.as_millis() < 100,
246 "Regex matching took too long: {:?} - possible ReDoS vulnerability",
247 duration
248 );
249 }
250}