Skip to main content

provenant/utils/
language.rs

1use std::collections::HashSet;
2use std::path::Path;
3
4use content_inspector::{ContentType, inspect};
5use file_identify::tags_from_filename;
6
7fn is_utf8_text(content_type: ContentType) -> bool {
8    content_type == ContentType::UTF_8 || content_type == ContentType::UTF_8_BOM
9}
10
11pub fn detect_language(path: &Path, content: &[u8]) -> Option<String> {
12    let inspected = inspect(content);
13
14    if let Some(language) = detect_shebang_language(content) {
15        return Some(language);
16    }
17
18    if let Some(language) = detect_file_identify_language(path) {
19        return Some(language);
20    }
21
22    if let Some(language) = detect_repo_special_file_name_language(path) {
23        return Some(language);
24    }
25
26    if let Some(language) = detect_manual_extension_language(path) {
27        return Some(language);
28    }
29
30    if is_utf8_text(inspected) {
31        let text_sample = String::from_utf8_lossy(&content[..std::cmp::min(content.len(), 1000)]);
32
33        if text_sample.contains("<?php") {
34            return Some("PHP".to_string());
35        } else if text_sample.contains("<html") || text_sample.contains("<!DOCTYPE html") {
36            return Some("HTML".to_string());
37        } else if text_sample.contains("plugins {")
38            || (text_sample.contains("dependencies {") && text_sample.contains("repositories {"))
39        {
40            return Some("Groovy".to_string());
41        } else if text_sample.contains("import React") || text_sample.contains("import {") {
42            return Some("JavaScript/TypeScript".to_string());
43        } else if text_sample.contains("def ") && text_sample.contains(':') {
44            return Some("Python".to_string());
45        } else if text_sample.contains("package ")
46            && text_sample.contains("import ")
47            && text_sample.contains('{')
48        {
49            return Some("Go".to_string());
50        }
51    }
52
53    None
54}
55
/// Detects the language of a script from its `#!` interpreter line.
///
/// Only the first line of `content` is examined, case-insensitively. The text
/// after `#!` is split on whitespace; option-like words (leading `-`) and
/// `NAME=value` assignments are skipped, and every remaining word is reduced to
/// its final path component. The first word that names a known interpreter
/// decides the result, so launcher forms such as `#!/usr/bin/env -S deno run`
/// resolve to the real interpreter.
///
/// Matching on the interpreter name instead of the raw line means directory
/// names cannot influence the result (`/home/ubuntu/...` contains `bun`), and
/// a bare `sh` is recognised without a preceding `/` (`#!/usr/bin/env sh`).
///
/// Returns `None` when `content` does not start with `#!` or when no word on
/// the line names a known interpreter.
fn detect_shebang_language(content: &[u8]) -> Option<String> {
    let after_marker = content.strip_prefix(b"#!")?;
    let line_len = after_marker
        .iter()
        .position(|&byte| byte == b'\n')
        .unwrap_or(after_marker.len());
    let line = String::from_utf8_lossy(&after_marker[..line_len]).to_ascii_lowercase();

    line.split_whitespace()
        // Skip launcher options (`-S`, `-i`) and environment assignments (`FOO=bar`).
        .filter(|word| !word.starts_with('-') && !word.contains('='))
        // Keep only the executable name; both `/` and `\` are treated as separators.
        .map(|word| {
            word.rsplit(|c: char| c == '/' || c == '\\')
                .next()
                .unwrap_or(word)
        })
        .find_map(shebang_interpreter_language)
        .map(str::to_string)
}

/// Maps a lowercase interpreter executable name (no directory part) to a language label.
fn shebang_interpreter_language(name: &str) -> Option<&'static str> {
    let has_any = |needles: &[&str]| needles.iter().any(|needle| name.contains(*needle));

    // Substring matching keeps versioned names such as `python3.11` or `ruby2.7` working.
    if has_any(&["python"]) {
        Some("Python")
    } else if has_any(&["node", "deno", "bun"]) {
        Some("JavaScript")
    } else if has_any(&["ruby"]) {
        Some("Ruby")
    } else if has_any(&["perl"]) {
        Some("Perl")
    } else if has_any(&["php"]) {
        Some("PHP")
    } else if has_any(&["pwsh", "powershell"]) {
        Some("PowerShell")
    } else if has_any(&["awk"]) {
        Some("Awk")
    } else if name == "sh" || has_any(&["bash", "zsh", "fish", "ksh"]) {
        Some("Shell")
    } else {
        None
    }
}
92
93fn detect_file_identify_language(path: &Path) -> Option<String> {
94    let file_name = path.file_name()?.to_str()?;
95    let tags = tags_from_filename(file_name);
96
97    map_file_identify_tags(&tags).map(str::to_string)
98}
99
/// Translates a set of file-identify tags into a display language name.
///
/// The table below is scanned top to bottom and the first row with at least
/// one tag present in `tags` wins, so row order defines priority when a file
/// carries several language tags. Returns `None` when no row matches.
fn map_file_identify_tags(tags: &HashSet<&'static str>) -> Option<&'static str> {
    // (accepted tags, language label) in priority order.
    const TAG_TO_LANGUAGE: &[(&[&str], &str)] = &[
        (&["dockerfile"], "Dockerfile"),
        (&["makefile"], "Makefile"),
        (&["rust"], "Rust"),
        (&["python"], "Python"),
        (&["javascript", "jsx"], "JavaScript"),
        (&["ts", "tsx"], "TypeScript"),
        (&["html"], "HTML"),
        (&["css"], "CSS"),
        (&["c"], "C"),
        (&["cpp"], "C++"),
        (&["java"], "Java"),
        (&["go"], "Go"),
        (&["ruby"], "Ruby"),
        (&["php"], "PHP"),
        (&["perl"], "Perl"),
        (&["swift"], "Swift"),
        (&["shell", "bash", "zsh"], "Shell"),
        (&["kotlin"], "Kotlin"),
        (&["dart"], "Dart"),
        (&["scala"], "Scala"),
        (&["csharp"], "C#"),
        (&["fsharp"], "F#"),
        (&["r"], "R"),
        (&["lua"], "Lua"),
        (&["julia"], "Julia"),
        (&["elixir"], "Elixir"),
        (&["clojure"], "Clojure"),
        (&["haskell"], "Haskell"),
        (&["erlang"], "Erlang"),
        (&["sql"], "SQL"),
        (&["tex"], "TeX"),
        (&["groovy", "gradle"], "Groovy"),
        (&["nix"], "Nix"),
        (&["zig"], "Zig"),
        (&["powershell"], "PowerShell"),
        (&["starlark"], "Starlark"),
        (&["awk"], "Awk"),
        (&["ocaml"], "OCaml"),
        (&["meson"], "Meson"),
    ];

    TAG_TO_LANGUAGE
        .iter()
        .find(|(accepted, _)| accepted.iter().any(|tag| tags.contains(*tag)))
        .map(|&(_, language)| language)
}
221
/// Recognises well-known repository file names whose language cannot be read
/// from an extension (e.g. `Gemfile`, `PKGBUILD`, `BUILD`).
///
/// The comparison uses the ASCII-lowercased final path component. A path
/// without a file name, or with a non-UTF-8 one, is treated as an empty name
/// and therefore yields `None`.
fn detect_repo_special_file_name_language(path: &Path) -> Option<String> {
    let lowered = match path.file_name().and_then(|raw| raw.to_str()) {
        Some(name) => name.to_ascii_lowercase(),
        None => String::new(),
    };

    let language = match lowered.as_str() {
        "gemfile" | "rakefile" | "podfile" | "vagrantfile" | "brewfile" => "Ruby",
        "apkbuild" | "pkgbuild" | "gradlew" => "Shell",
        "meson.build" => "Meson",
        "containerfile.core" => "Dockerfile",
        "build" | "workspace" | "buck" => "Starlark",
        "default.nix" | "flake.nix" | "shell.nix" => "Nix",
        _ => return None,
    };

    Some(language.to_string())
}
251
/// Maps a file extension to a language using a hand-maintained table.
///
/// The extension is compared after ASCII lowercasing, so `foo.S` and `foo.s`
/// are treated alike. Returns `None` when the path has no extension, the
/// extension is not valid UTF-8, or the extension is not in the table.
fn detect_manual_extension_language(path: &Path) -> Option<String> {
    let ext = path
        .extension()
        .and_then(|raw| raw.to_str())?
        .to_ascii_lowercase();

    let language = match ext.as_str() {
        "rs" => "Rust",
        "py" => "Python",
        "js" | "mjs" | "cjs" | "jsx" => "JavaScript",
        "ts" | "tsx" | "mts" | "cts" => "TypeScript",
        "html" | "htm" => "HTML",
        "css" => "CSS",
        // Plain `.h` headers are attributed to C, not C++.
        "c" | "h" => "C",
        "cpp" | "cc" | "cxx" | "hh" | "hxx" | "hpp" => "C++",
        "m" => "Objective-C",
        "mm" => "Objective-C++",
        "s" | "asm" => "GAS",
        "java" => "Java",
        "go" => "Go",
        "rb" => "Ruby",
        "php" => "PHP",
        "pl" => "Perl",
        "swift" => "Swift",
        "sql" => "SQL",
        "sh" | "bash" | "zsh" | "fish" | "ksh" => "Shell",
        "kt" | "kts" => "Kotlin",
        "dart" => "Dart",
        "scala" => "Scala",
        "cs" => "C#",
        "fs" | "fsx" => "F#",
        "r" => "R",
        "lua" => "Lua",
        "jl" => "Julia",
        "ex" | "exs" => "Elixir",
        "clj" | "cljs" | "cljc" => "Clojure",
        "hs" => "Haskell",
        "erl" => "Erlang",
        "tex" => "TeX",
        "groovy" | "gradle" | "gvy" | "gy" | "gsh" => "Groovy",
        "nix" => "Nix",
        "zig" => "Zig",
        "ps1" | "psm1" | "psd1" => "PowerShell",
        "bzl" | "bazel" | "star" | "sky" => "Starlark",
        "awk" => "Awk",
        "ml" | "mli" => "OCaml",
        _ => return None,
    };

    Some(language.to_owned())
}
301
#[cfg(test)]
mod tests {
    use super::detect_language;
    use std::path::Path;

    /// Runs `detect_language` for a file name given as a plain string.
    fn detected(file_name: &str, content: &[u8]) -> Option<String> {
        detect_language(Path::new(file_name), content)
    }

    #[test]
    fn detect_language_supports_containerfile_names() {
        for file_name in ["Containerfile", "containerfile.core"] {
            assert_eq!(
                detected(file_name, b"FROM scratch\n").as_deref(),
                Some("Dockerfile")
            );
        }
    }

    #[test]
    fn detect_language_maps_c_headers_to_c() {
        assert_eq!(detected("zlib.h", b"/* header */\n").as_deref(), Some("C"));
    }

    #[test]
    fn detect_language_maps_uppercase_s_to_gas() {
        assert_eq!(detected("gvmat64.S", b"; asm\n").as_deref(), Some("GAS"));
    }

    #[test]
    fn detect_language_handles_manifest_dsl_filenames() {
        assert_eq!(
            detected("APKBUILD", b"pkgname=demo\n").as_deref(),
            Some("Shell")
        );
        assert_eq!(
            detected("Podfile", b"source 'https://rubygems.org'\n").as_deref(),
            Some("Ruby")
        );
        assert_eq!(
            detected("meson.build", b"project('demo')\n").as_deref(),
            Some("Meson")
        );
        assert_eq!(
            detected("BUILD", b"cc_library(name = 'demo')\n").as_deref(),
            Some("Starlark")
        );
        assert_eq!(
            detected("flake.nix", b"{ inputs, ... }: {}\n").as_deref(),
            Some("Nix")
        );
    }

    #[test]
    fn detect_language_handles_common_build_extensions() {
        assert_eq!(
            detected("build.gradle", b"plugins { id 'java' }\n").as_deref(),
            Some("Groovy")
        );
        assert_eq!(
            detected("main.nix", b"{ pkgs }: pkgs.hello\n").as_deref(),
            Some("Nix")
        );
        assert_eq!(
            detected("rules.bzl", b"def _impl(ctx):\n    pass\n").as_deref(),
            Some("Starlark")
        );
        assert_eq!(
            detected("script.ps1", b"Write-Host 'hello'\n").as_deref(),
            Some("PowerShell")
        );
    }

    #[test]
    fn detect_language_omits_generic_text_fallbacks() {
        // Plain text and binary data must not be given a language label.
        assert_eq!(detected("README.txt", b"plain text\n"), None);
        assert_eq!(detected("data.bin", &[0, 159, 146, 150]), None);
    }

    #[test]
    fn detect_language_ignores_yaml_as_programming_language() {
        assert_eq!(detected("config.yaml", b"key: value\n"), None);
    }

    #[test]
    fn detect_language_keeps_extension_detection_for_non_utf8_python() {
        // The `\xe9` byte makes the content invalid UTF-8; the extension must still decide.
        let latin1_python = b"# coding: latin-1\nprint(\"caf\xe9\")\n# comment padding\n";

        assert_eq!(
            detected("script.py", latin1_python).as_deref(),
            Some("Python")
        );
    }
}