// oo_ide/language.rs

//! Language identification for files.
//!
//! Provides [`LanguageId`] — a lightweight newtype over `Cow<'static, str>` —
//! and [`detect_language`], which maps a file path to a language by trying, in
//! order, the file extension, special bare file names (e.g. `Dockerfile`), and
//! finally a MIME-type fallback via `mime_guess`.
//!
//! Built-in IDs follow the LSP `languageId` convention (lowercase, e.g. "rust",
//! "cpp"). Using `Cow` avoids allocation for the built-in static strings
//! while still allowing owned `String`s for future config-driven mappings.
10
11use std::borrow::Cow;
12use std::path::Path;
13
/// Identifier naming a language, such as `"rust"`, `"cpp"` or `"python"`.
///
/// The wrapped `Cow<'static, str>` lets built-in identifiers be stored as
/// `Cow::Borrowed` static strings, which requires no allocation. Identifiers
/// created at runtime (for example from future configuration) can instead be
/// stored as `Cow::Owned`.
///
/// Equality and hashing are derived, so they compare the identifier text.
#[derive(Clone, Debug, Hash, PartialEq, Eq)]
pub struct LanguageId(pub Cow<'static, str>);
21
22impl LanguageId {
23    /// Construct from any `&'static str` or `String`.
24    pub fn new(s: impl Into<Cow<'static, str>>) -> Self {
25        Self(s.into())
26    }
27
28    /// Borrow the inner string slice.
29    pub fn as_str(&self) -> &str {
30        &self.0
31    }
32}
33
34impl std::fmt::Display for LanguageId {
35    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
36        f.write_str(&self.0)
37    }
38}
39
40impl AsRef<str> for LanguageId {
41    fn as_ref(&self) -> &str {
42        self.as_str()
43    }
44}
45
46// ---------------------------------------------------------------------------
47// Built-in mappings
48// ---------------------------------------------------------------------------
49
/// Look up the language ID for a file extension.
///
/// `ext` must already be lowercased and must not include the leading dot;
/// the comparison is exact. Returns `None` when the extension has no
/// built-in mapping.
fn lang_from_ext(ext: &str) -> Option<&'static str> {
    Some(match ext {
        // Systems / compiled
        "rs" => "rust",
        "c" | "h" => "c",
        "cpp" | "cc" | "cxx" | "hpp" | "hxx" | "h++" => "cpp",
        "go" => "go",
        "zig" => "zig",
        "nim" => "nim",
        "swift" => "swift",
        // JVM / .NET
        "java" => "java",
        "kt" | "kts" => "kotlin",
        "cs" => "csharp",
        // Scripting
        "py" | "pyw" | "pyi" => "python",
        "rb" => "ruby",
        "lua" => "lua",
        "r" => "r",
        "dart" => "dart",
        // Functional
        "ex" | "exs" => "elixir",
        "hs" => "haskell",
        "ml" | "mli" => "ocaml",
        "clj" | "cljs" => "clojure",
        "erl" | "hrl" => "erlang",
        // Web
        "ts" => "typescript",
        "tsx" => "typescriptreact",
        "js" | "mjs" | "cjs" => "javascript",
        "jsx" => "javascriptreact",
        "html" | "htm" => "html",
        "css" => "css",
        "scss" => "scss",
        "svelte" => "svelte",
        "vue" => "vue",
        // Markup / data
        "toml" => "toml",
        "yaml" | "yml" => "yaml",
        "json" | "jsonc" => "json",
        "md" | "markdown" => "markdown",
        "xml" => "xml",
        "sql" => "sql",
        // Shell
        "sh" | "bash" | "zsh" => "shellscript",
        "fish" => "fish",
        // Infrastructure
        "tf" | "tfvars" => "terraform",
        "dockerfile" => "dockerfile",
        // No built-in mapping for this extension.
        _ => return None,
    })
}
95
/// Map a MIME type string to a language ID.
///
/// Everything from the first `;` onward (parameters such as
/// `; charset=utf-8`) is discarded and surrounding whitespace is trimmed.
/// The remaining `type/subtype` is compared ASCII case-insensitively, since
/// media type and subtype names are case-insensitive (RFC 2045 §5.1).
///
/// Returns `None` for MIME types without a built-in mapping.
fn lang_from_mime(mime: &str) -> Option<&'static str> {
    // `split` always yields at least one item, so the `unwrap_or` fallback
    // is only a safeguard. Lowercasing makes "Text/HTML" match "text/html".
    let essence = mime
        .split(';')
        .next()
        .unwrap_or(mime)
        .trim()
        .to_ascii_lowercase();

    let id = match essence.as_str() {
        // Rust
        "text/rust" | "text/x-rust" => "rust",
        // Python
        "text/x-python" | "text/x-python3" | "application/x-python-code" => "python",
        // Web
        "application/json" | "text/json" => "json",
        "text/html" => "html",
        "text/css" => "css",
        "text/javascript" | "application/javascript" | "application/x-javascript" => {
            "javascript"
        }
        "text/typescript" | "application/typescript" => "typescript",
        // Markup / data
        "text/markdown" | "text/x-markdown" => "markdown",
        "text/x-yaml" | "application/yaml" | "application/x-yaml" => "yaml",
        "application/toml" => "toml",
        "text/xml" | "application/xml" => "xml",
        // Shell
        "text/x-sh" | "application/x-sh" => "shellscript",
        // SQL
        "text/x-sql" | "application/sql" => "sql",
        _ => return None,
    };
    Some(id)
}
125
126// ---------------------------------------------------------------------------
127// Public API
128// ---------------------------------------------------------------------------
129
130/// Detect the language of a file from its path.
131///
132/// Detection order:
133/// 1. File extension (case-insensitive).
134/// 2. MIME type guessed from the path via `mime_guess` (fallback).
135///
136/// Returns `None` for unknown file types.
137///
138/// # Examples
139/// ```ignore
140/// use std::path::Path;
141/// use oo_ide::language::detect_language;
142/// assert_eq!(detect_language(Path::new("main.rs")).unwrap().as_str(), "rust");
143/// assert!(detect_language(Path::new("unknown.xyz")).is_none());
144/// ```
145pub fn detect_language(path: &Path) -> Option<LanguageId> {
146    // 1. Extension-based lookup.
147    if let Some(id) = path
148        .extension()
149        .and_then(|e| e.to_str())
150        .map(|e| e.to_ascii_lowercase())
151        .as_deref()
152        .and_then(lang_from_ext)
153    {
154        return Some(LanguageId(Cow::Borrowed(id)));
155    }
156
157    // 2. Special-case bare file names without extensions (e.g. "Dockerfile").
158    if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
159        let lower = name.to_ascii_lowercase();
160        if let Some(id) = lang_from_ext(&lower) {
161            return Some(LanguageId(Cow::Borrowed(id)));
162        }
163    }
164
165    // 3. MIME-type fallback.
166    let mime = mime_guess::from_path(path).first_raw()?;
167    let id = lang_from_mime(mime)?;
168    Some(LanguageId(Cow::Borrowed(id)))
169}
170
171// ---------------------------------------------------------------------------
172// Tests
173// ---------------------------------------------------------------------------
174
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    /// Asserts that detecting the language of `path` yields `expected`.
    fn assert_detects(path: &str, expected: Option<&str>) {
        let detected = detect_language(Path::new(path));
        assert_eq!(
            detected.as_ref().map(LanguageId::as_str),
            expected,
            "path: {}",
            path
        );
    }

    #[test]
    fn rust_extension() {
        assert_detects("main.rs", Some("rust"));
    }

    #[test]
    fn cpp_extensions() {
        for &path in &["foo.cpp", "foo.hpp", "foo.cc"] {
            assert_detects(path, Some("cpp"));
        }
    }

    #[test]
    fn python_extension() {
        for &path in &["script.py", "types.pyi"] {
            assert_detects(path, Some("python"));
        }
    }

    #[test]
    fn toml_extension() {
        assert_detects("Cargo.toml", Some("toml"));
    }

    #[test]
    fn yaml_extensions() {
        for &path in &["config.yaml", "config.yml"] {
            assert_detects(path, Some("yaml"));
        }
    }

    #[test]
    fn unknown_extension_returns_none() {
        for &path in &["archive.tar", "image.png", "binary.exe", "no_extension"] {
            assert_detects(path, None);
        }
    }

    #[test]
    fn case_insensitive_extension() {
        assert_detects("main.RS", Some("rust"));
        assert_detects("main.Py", Some("python"));
    }

    #[test]
    fn dockerfile_bare_name() {
        assert_detects("Dockerfile", Some("dockerfile"));
    }

    #[test]
    fn language_id_display() {
        let rust = LanguageId::new("rust");
        assert_eq!(rust.to_string(), "rust");
        assert_eq!(rust.as_str(), "rust");
    }

    #[test]
    fn language_id_owned() {
        let custom = LanguageId::new(String::from("my-lang"));
        assert_eq!(custom.as_str(), "my-lang");
    }

    #[test]
    fn language_id_equality() {
        let rust = LanguageId::new("rust");
        assert_eq!(rust, LanguageId::new("rust"));
        assert_ne!(rust, LanguageId::new("python"));
    }
}