magellan/ingest/
detect.rs

1//! Language detection from file extensions.
2//!
3//! Table-driven language detection. No heuristics, no guessing.
//! Unknown extensions return `None`; the language is never inferred from file content.
5
6use std::path::Path;
7
/// Programming languages supported by Magellan.
///
/// Each variant lists the file extensions that [`detect_language`] maps to it.
/// The type is a plain tag: it is `Copy`, comparable with `==`, and hashable,
/// so it can be used directly as a `HashMap` / `HashSet` key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    /// Rust (`.rs`)
    Rust,
    /// Python (`.py`)
    Python,
    /// C (`.c`, `.h`)
    C,
    /// C++ (`.cpp`, `.hpp`, `.cc`, `.cxx`)
    Cpp,
    /// Java (`.java`)
    Java,
    /// JavaScript (`.js`, `.mjs`, `.cjs`)
    JavaScript,
    /// TypeScript (`.ts`, `.tsx`)
    TypeScript,
}
26
27impl Language {
28    /// Convert language to string identifier.
29    pub fn as_str(&self) -> &'static str {
30        match self {
31            Language::Rust => "rust",
32            Language::Python => "python",
33            Language::C => "c",
34            Language::Cpp => "cpp",
35            Language::Java => "java",
36            Language::JavaScript => "javascript",
37            Language::TypeScript => "typescript",
38        }
39    }
40}
41
42/// Detect programming language from file path.
43///
44/// Uses table-driven extension mapping. Returns None for unknown extensions.
45/// Never guesses or infers from file content.
46///
47/// # Examples
48///
49/// ```
50/// # use magellan::ingest::detect::{detect_language, Language};
51/// # use std::path::Path;
52/// assert_eq!(detect_language(Path::new("main.rs")), Some(Language::Rust));
53/// assert_eq!(detect_language(Path::new("script.py")), Some(Language::Python));
54/// assert_eq!(detect_language(Path::new("file.txt")), None);
55/// ```
56pub fn detect_language(path: &Path) -> Option<Language> {
57    // Get the file extension
58    let extension = path.extension()?.to_str()?;
59
60    // Table-driven mapping (case-sensitive)
61    let language = match extension {
62        // Rust
63        "rs" => Language::Rust,
64
65        // Python
66        "py" => Language::Python,
67
68        // C
69        "c" | "h" => Language::C,
70
71        // C++
72        "cpp" | "hpp" | "cc" | "cxx" => Language::Cpp,
73
74        // Java
75        "java" => Language::Java,
76
77        // JavaScript
78        "js" | "mjs" | "cjs" => Language::JavaScript,
79
80        // TypeScript
81        "ts" | "tsx" => Language::TypeScript,
82
83        // Unknown extension
84        _ => return None,
85    };
86
87    Some(language)
88}
89
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs detection on a path given as a string slice.
    fn detect(raw: &str) -> Option<Language> {
        detect_language(Path::new(raw))
    }

    /// Checks that every path in `paths` resolves to `expected`.
    fn assert_each(paths: &[&str], expected: Option<Language>) {
        for &candidate in paths {
            assert_eq!(detect(candidate), expected, "path: {:?}", candidate);
        }
    }

    #[test]
    fn test_detect_rust() {
        assert_each(&["main.rs", "lib.rs"], Some(Language::Rust));
    }

    #[test]
    fn test_detect_python() {
        assert_each(&["script.py"], Some(Language::Python));
    }

    #[test]
    fn test_detect_c() {
        assert_each(&["main.c", "header.h"], Some(Language::C));
    }

    #[test]
    fn test_detect_cpp() {
        assert_each(
            &["main.cpp", "header.hpp", "main.cc", "main.cxx"],
            Some(Language::Cpp),
        );
    }

    #[test]
    fn test_detect_java() {
        assert_each(&["Main.java"], Some(Language::Java));
    }

    #[test]
    fn test_detect_javascript() {
        assert_each(
            &["script.js", "module.mjs", "module.cjs"],
            Some(Language::JavaScript),
        );
    }

    #[test]
    fn test_detect_typescript() {
        assert_each(
            &["component.ts", "component.tsx"],
            Some(Language::TypeScript),
        );
    }

    #[test]
    fn test_unknown_extension_returns_none() {
        assert_each(&["file.unknown", "file.txt", "file.md"], None);
    }

    #[test]
    fn test_no_extension_returns_none() {
        assert_each(&["Makefile", "Dockerfile"], None);
    }

    #[test]
    fn test_empty_path_returns_none() {
        assert_eq!(detect(""), None);
    }

    #[test]
    fn test_dotfile_returns_none() {
        assert_eq!(detect(".gitignore"), None);
    }

    #[test]
    fn test_case_sensitive() {
        // Matching is exact, so upper-case extensions are not recognised.
        assert_each(&["file.RS", "file.PY"], None);
    }

    #[test]
    fn test_path_with_directory() {
        assert_eq!(detect("src/module/main.rs"), Some(Language::Rust));
    }

    #[test]
    fn test_absolute_path() {
        assert_eq!(detect("/usr/local/bin/script.py"), Some(Language::Python));
    }

    #[test]
    fn test_language_as_str() {
        let expected = [
            (Language::Rust, "rust"),
            (Language::Python, "python"),
            (Language::C, "c"),
            (Language::Cpp, "cpp"),
            (Language::Java, "java"),
            (Language::JavaScript, "javascript"),
            (Language::TypeScript, "typescript"),
        ];
        for &(language, name) in &expected {
            assert_eq!(language.as_str(), name);
        }
    }
}