Skip to main content

pick/
detector.rs

1use crate::cli::InputFormat;
2
3pub fn detect_format(input: &str) -> InputFormat {
4    let trimmed = input.trim();
5
6    if trimmed.is_empty() {
7        return InputFormat::Text;
8    }
9
10    // JSON: starts with {
11    if trimmed.starts_with('{') {
12        return InputFormat::Json;
13    }
14
15    // Starts with [ — could be JSON array or TOML section header
16    if trimmed.starts_with('[') {
17        // Check if first line looks like a TOML section: [word] or [[word]]
18        let first_line = trimmed.lines().next().unwrap_or("").trim();
19        let is_toml_section = (first_line.starts_with("[[")
20            && first_line.ends_with("]]")
21            && first_line.len() > 4
22            && first_line[2..first_line.len() - 2]
23                .chars()
24                .all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-' || c == '.'))
25            || (first_line.starts_with('[')
26                && first_line.ends_with(']')
27                && !first_line.starts_with("[[")
28                && first_line[1..first_line.len() - 1]
29                    .chars()
30                    .all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-' || c == '.'));
31
32        if !is_toml_section {
33            return InputFormat::Json;
34        }
35        // Otherwise fall through to TOML detection
36    }
37
38    let lines: Vec<&str> = trimmed.lines().collect();
39
40    // HTTP headers: lines with "Key: Value" pattern
41    if looks_like_headers(&lines) {
42        return InputFormat::Headers;
43    }
44
45    // TOML: has [section] headers or key = value with TOML conventions
46    if looks_like_toml(&lines) {
47        return InputFormat::Toml;
48    }
49
50    // logfmt: multiple key=value pairs per line
51    if looks_like_logfmt(&lines) {
52        return InputFormat::Logfmt;
53    }
54
55    // .env: KEY=value with uppercase keys
56    if looks_like_env(&lines) {
57        return InputFormat::Env;
58    }
59
60    // CSV: consistent delimiters across rows
61    if looks_like_csv(&lines) {
62        return InputFormat::Csv;
63    }
64
65    // YAML: key: value patterns or --- document separator
66    if looks_like_yaml(&lines) {
67        return InputFormat::Yaml;
68    }
69
70    InputFormat::Text
71}
72
/// Heuristically decides whether `lines` look like a block of HTTP headers.
///
/// An optional leading `HTTP/...` status line is skipped and blank lines are
/// ignored. At least two lines must remain, and more than 70% of them must
/// have a non-empty key before the first `:` made only of ASCII letters and
/// hyphens. To tell headers apart from YAML `key: value` pairs, the block
/// must additionally either contain a hyphen somewhere before a colon, or
/// have more than 70% of its lines start with an ASCII uppercase letter.
fn looks_like_headers(lines: &[&str]) -> bool {
    // Text before the first ':' on the line, if the line has one.
    fn key_before_colon(line: &str) -> Option<&str> {
        line.split_once(':').map(|(key, _)| key)
    }

    // Fewer than two lines can never qualify; otherwise drop a status line.
    let body = match lines {
        [] | [_] => return false,
        [status, rest @ ..] if status.starts_with("HTTP/") => rest,
        all => all,
    };

    let candidates: Vec<&str> = body
        .iter()
        .copied()
        .filter(|line| !line.trim().is_empty())
        .collect();
    if candidates.len() < 2 {
        return false;
    }
    let total = candidates.len() as f64;

    let well_formed = candidates
        .iter()
        .filter(|line| {
            key_before_colon(line).is_some_and(|key| {
                !key.is_empty() && key.chars().all(|c| c == '-' || c.is_ascii_alphabetic())
            })
        })
        .count();
    if well_formed as f64 / total <= 0.7 {
        return false;
    }

    // A hyphenated key is enough on its own to rule out plain YAML.
    let any_hyphenated = candidates
        .iter()
        .any(|line| key_before_colon(line).is_some_and(|key| key.contains('-')));
    if any_hyphenated {
        return true;
    }

    let capitalised = candidates
        .iter()
        .filter(|line| line.starts_with(|c: char| c.is_ascii_uppercase()))
        .count();
    capitalised as f64 / total > 0.7
}
125
126fn looks_like_toml(lines: &[&str]) -> bool {
127    let has_section = lines.iter().any(|l| {
128        let t = l.trim();
129        (t.starts_with('[') && t.ends_with(']') && !t.starts_with("[["))
130            || (t.starts_with("[[") && t.ends_with("]]"))
131    });
132
133    let has_toml_kv = lines.iter().any(|l| {
134        let t = l.trim();
135        // TOML uses "key = value" (with spaces around =)
136        if let Some(eq_pos) = t.find(" = ") {
137            let key = &t[..eq_pos];
138            !key.is_empty()
139                && key
140                    .chars()
141                    .all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-' || c == '.')
142        } else {
143            false
144        }
145    });
146
147    has_section || (has_toml_kv && !looks_like_env(lines))
148}
149
/// Heuristically decides whether `lines` look like logfmt.
///
/// Blank lines are ignored. Every remaining line must contain at least two
/// whitespace-separated tokens that include an `=`, and there must be at
/// least one such line.
fn looks_like_logfmt(lines: &[&str]) -> bool {
    let mut saw_content = false;

    for raw in lines {
        let line = raw.trim();
        if line.is_empty() {
            continue;
        }
        saw_content = true;

        let pair_tokens = line
            .split_whitespace()
            .filter(|token| token.contains('='))
            .count();
        // A single `key=value` per line is left to the .env/TOML checks.
        if pair_tokens < 2 {
            return false;
        }
    }

    saw_content
}
170
/// Heuristically decides whether `lines` look like a `.env` file.
///
/// Blank lines and `#` comments are ignored. Of the remaining lines, more
/// than 70% must be `KEY=value` assignments (optionally prefixed with
/// `export `) whose key starts with an ASCII uppercase letter or `_` and
/// contains only ASCII alphanumerics and `_`.
fn looks_like_env(lines: &[&str]) -> bool {
    // True for `KEY=...` / `export KEY=...` with an env-style key.
    fn is_assignment(entry: &str) -> bool {
        let entry = entry.strip_prefix("export ").unwrap_or(entry);
        match entry.split_once('=') {
            // `starts_with` also rejects the empty key; the character check
            // rejects spaces along with every other non-identifier character.
            Some((key, _)) => {
                key.starts_with(|c: char| c == '_' || c.is_ascii_uppercase())
                    && key.chars().all(|c| c == '_' || c.is_ascii_alphanumeric())
            }
            None => false,
        }
    }

    let mut total = 0usize;
    let mut assignments = 0usize;

    for line in lines.iter().map(|raw| raw.trim()) {
        if line.is_empty() || line.starts_with('#') {
            continue;
        }
        total += 1;
        if is_assignment(line) {
            assignments += 1;
        }
    }

    total > 0 && assignments as f64 / total as f64 > 0.7
}
204
/// Heuristically decides whether `lines` look like delimited tabular data.
///
/// Blank lines are ignored and each line is trimmed first. At least two rows
/// are required, and every row must contain the same non-zero number of
/// commas or, failing that, the same non-zero number of tabs.
fn looks_like_csv(lines: &[&str]) -> bool {
    let rows: Vec<&str> = lines
        .iter()
        .map(|raw| raw.trim())
        .filter(|row| !row.is_empty())
        .collect();

    // The first row sets the expected delimiter count for all the others.
    let (header, rest) = match rows.split_first() {
        Some((header, rest)) if !rest.is_empty() => (header, rest),
        _ => return false,
    };

    // Commas are tried before tabs.
    [',', '\t'].iter().any(|&delimiter| {
        let expected = header.matches(delimiter).count();
        expected >= 1
            && rest
                .iter()
                .all(|row| row.matches(delimiter).count() == expected)
    })
}
228
/// Heuristically decides whether `lines` look like YAML.
///
/// A first line of `---` (after trimming) is accepted immediately.
/// Otherwise blank lines and `#` comments are ignored, and more than half of
/// the remaining lines must be either a `- ` list item or contain a colon
/// that is followed by a space or ends the line (`key: value` / `key:`).
///
/// Returns `false` for empty input and for input with no content lines.
fn looks_like_yaml(lines: &[&str]) -> bool {
    match lines.first() {
        None => return false,
        Some(first) if first.trim() == "---" => return true,
        Some(_) => {}
    }

    let mut content = 0usize;
    let mut yaml_like = 0usize;

    for line in lines.iter().map(|raw| raw.trim()) {
        // Comments carry no structural signal, so they are left out of the
        // ratio entirely rather than counted as non-YAML lines; otherwise a
        // commented header could push a real YAML file below the threshold.
        if line.is_empty() || line.starts_with('#') {
            continue;
        }
        content += 1;

        // Tested independently of the colon check so that items whose value
        // contains a colon (e.g. `- http://host`) still count.
        let is_list_item = line.starts_with("- ");
        let is_mapping = line
            .split_once(':')
            .is_some_and(|(_, rest)| rest.is_empty() || rest.starts_with(' '));

        if is_list_item || is_mapping {
            yaml_like += 1;
        }
    }

    content > 0 && yaml_like as f64 / content as f64 > 0.5
}
260
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `detect_format` classifies `input` as `expected`.
    ///
    /// `#[track_caller]` keeps failure locations pointing at the calling test.
    #[track_caller]
    fn assert_detects(input: &str, expected: InputFormat) {
        assert_eq!(detect_format(input), expected);
    }

    // --- JSON ---

    #[test]
    fn detect_json_object() {
        assert_detects("{\"a\": 1}", InputFormat::Json);
    }

    #[test]
    fn detect_json_array() {
        assert_detects("[1, 2, 3]", InputFormat::Json);
    }

    #[test]
    fn detect_json_whitespace() {
        assert_detects("  \n  {\"key\": \"val\"}  ", InputFormat::Json);
    }

    // --- YAML ---

    #[test]
    fn detect_yaml_document() {
        assert_detects("---\nname: Alice\nage: 30", InputFormat::Yaml);
    }

    #[test]
    fn detect_yaml_kv() {
        assert_detects("name: Alice\nage: 30\ncity: NYC", InputFormat::Yaml);
    }

    // --- TOML ---

    #[test]
    fn detect_toml_with_section() {
        assert_detects(
            "[package]\nname = \"pick\"\nversion = \"0.1.0\"",
            InputFormat::Toml,
        );
    }

    #[test]
    fn detect_toml_array_of_tables() {
        assert_detects(
            "[[items]]\nname = \"a\"\n\n[[items]]\nname = \"b\"",
            InputFormat::Toml,
        );
    }

    // --- .env ---

    #[test]
    fn detect_env() {
        assert_detects(
            "DATABASE_URL=postgres://localhost/db\nPORT=3000\nDEBUG=true",
            InputFormat::Env,
        );
    }

    #[test]
    fn detect_env_with_export() {
        assert_detects(
            "export DATABASE_URL=postgres://localhost/db\nexport PORT=3000",
            InputFormat::Env,
        );
    }

    #[test]
    fn detect_env_with_comments() {
        assert_detects(
            "# Database config\nDATABASE_URL=postgres://localhost/db\nPORT=3000",
            InputFormat::Env,
        );
    }

    // --- HTTP headers ---

    #[test]
    fn detect_headers() {
        assert_detects(
            "Content-Type: application/json\nX-Request-Id: abc123\nCache-Control: no-cache",
            InputFormat::Headers,
        );
    }

    #[test]
    fn detect_headers_with_status() {
        assert_detects(
            "HTTP/1.1 200 OK\nContent-Type: text/html\nContent-Length: 1234",
            InputFormat::Headers,
        );
    }

    // --- logfmt ---

    #[test]
    fn detect_logfmt() {
        assert_detects(
            "level=info msg=\"request handled\" duration=0.5s status=200",
            InputFormat::Logfmt,
        );
    }

    #[test]
    fn detect_logfmt_multiline() {
        assert_detects(
            "level=info msg=hello ts=123\nlevel=error msg=fail ts=456",
            InputFormat::Logfmt,
        );
    }

    // --- CSV / TSV ---

    #[test]
    fn detect_csv() {
        assert_detects("name,age,city\nAlice,30,NYC\nBob,25,LA", InputFormat::Csv);
    }

    #[test]
    fn detect_tsv() {
        assert_detects(
            "name\tage\tcity\nAlice\t30\tNYC\nBob\t25\tLA",
            InputFormat::Csv,
        );
    }

    // --- Fallbacks ---

    #[test]
    fn detect_empty_input() {
        assert_detects("", InputFormat::Text);
        assert_detects("   \n  ", InputFormat::Text);
    }

    #[test]
    fn detect_plain_text() {
        assert_detects("just some random text here", InputFormat::Text);
    }
}