Skip to main content

provenant/utils/
language.rs

1use std::collections::HashSet;
2use std::path::Path;
3
4use file_identify::tags_from_filename;
5
6pub fn detect_language(path: &Path, content: &[u8]) -> Option<String> {
7    if let Some(language) = detect_shebang_language(content) {
8        return Some(language);
9    }
10
11    if let Some(language) = detect_file_identify_language(path) {
12        return Some(language);
13    }
14
15    if let Some(language) = detect_repo_special_file_name_language(path) {
16        return Some(language);
17    }
18
19    if let Some(language) = detect_manual_extension_language(path) {
20        return Some(language);
21    }
22
23    detect_content_hint_language(content)
24}
25
/// Guesses a language from textual markers near the start of `content`.
///
/// Only the first 1000 bytes are inspected, and they must be valid UTF-8;
/// otherwise no guess is made. Because the sample is cut at a fixed byte
/// offset, the cut may land in the middle of a multi-byte character. In that
/// case the incomplete trailing character is dropped instead of rejecting
/// the whole sample.
///
/// Markers are checked in a fixed order (PHP, HTML, Groovy,
/// JavaScript/TypeScript, Python, Go) and the first match wins. Returns
/// `None` when nothing matches or the sample is not valid UTF-8.
fn detect_content_hint_language(content: &[u8]) -> Option<String> {
    const SAMPLE_LIMIT: usize = 1000;

    let sample = &content[..std::cmp::min(content.len(), SAMPLE_LIMIT)];
    let text_sample = match std::str::from_utf8(sample) {
        Ok(text) => text,
        // `error_len() == None` means the input merely ended in the middle of
        // a character. That is only forgivable when our own truncation caused
        // it, i.e. there are more bytes in `content` beyond the sample.
        Err(err) if err.error_len().is_none() && sample.len() < content.len() => {
            std::str::from_utf8(&sample[..err.valid_up_to()]).ok()?
        }
        Err(_) => return None,
    };

    if text_sample.contains("<?php") {
        Some("PHP".to_string())
    } else if text_sample.contains("<html") || text_sample.contains("<!DOCTYPE html") {
        Some("HTML".to_string())
    } else if text_sample.contains("plugins {")
        || (text_sample.contains("dependencies {") && text_sample.contains("repositories {"))
    {
        Some("Groovy".to_string())
    } else if text_sample.contains("import React") || text_sample.contains("import {") {
        Some("JavaScript/TypeScript".to_string())
    } else if text_sample.contains("def ") && text_sample.contains(':') {
        Some("Python".to_string())
    } else if text_sample.contains("package ")
        && text_sample.contains("import ")
        && text_sample.contains('{')
    {
        Some("Go".to_string())
    } else {
        None
    }
}
51
/// Guesses a language from a `#!` interpreter line at the start of `content`.
///
/// The first line (up to the first `\n`, or the whole input when there is
/// none) is decoded lossily, lowercased, and searched for known interpreter
/// names. Checks run in a fixed order and the first match wins.
///
/// Returns `None` when `content` is two bytes or shorter, does not start
/// with `#!`, or names no known interpreter.
fn detect_shebang_language(content: &[u8]) -> Option<String> {
    if content.len() <= 2 || !content.starts_with(b"#!") {
        return None;
    }

    let shebang_end = content
        .iter()
        .position(|&b| b == b'\n')
        .unwrap_or(content.len());
    let shebang = String::from_utf8_lossy(&content[..shebang_end]).to_ascii_lowercase();

    // `#!/usr/bin/env sh` names the shell as a bare word, so the "/sh"
    // substring check alone misses it. Matching a whole whitespace-separated
    // word keeps unrelated interpreters such as `tclsh` from matching.
    let names_bare_sh = shebang.split_whitespace().any(|word| word == "sh");

    if shebang.contains("python") {
        Some("Python".to_string())
    } else if shebang.contains("node") || shebang.contains("deno") || shebang.contains("bun") {
        Some("JavaScript".to_string())
    } else if shebang.contains("ruby") {
        Some("Ruby".to_string())
    } else if shebang.contains("perl") {
        Some("Perl".to_string())
    } else if shebang.contains("php") {
        Some("PHP".to_string())
    } else if shebang.contains("pwsh") || shebang.contains("powershell") {
        Some("PowerShell".to_string())
    } else if shebang.contains("awk") {
        Some("Awk".to_string())
    } else if shebang.contains("bash")
        || shebang.contains("zsh")
        || shebang.contains("fish")
        || shebang.contains("ksh")
        || shebang.contains("/sh")
        || names_bare_sh
    {
        Some("Shell".to_string())
    } else {
        None
    }
}
88
89fn detect_file_identify_language(path: &Path) -> Option<String> {
90    let file_name = path.file_name()?.to_str()?;
91    let tags = tags_from_filename(file_name);
92
93    map_file_identify_tags(&tags).map(str::to_string)
94}
95
/// Translates a set of `file_identify` tags into a display language name.
///
/// The table below is ordered by priority: when several tags are present,
/// the entry that appears first wins. Returns `None` when no tag in the set
/// has an entry.
fn map_file_identify_tags(tags: &HashSet<&'static str>) -> Option<&'static str> {
    // (tag, language) pairs; order is significant.
    const TAG_LANGUAGES: &[(&str, &str)] = &[
        ("dockerfile", "Dockerfile"),
        ("makefile", "Makefile"),
        ("rust", "Rust"),
        ("python", "Python"),
        ("javascript", "JavaScript"),
        ("jsx", "JavaScript"),
        ("ts", "TypeScript"),
        ("tsx", "TypeScript"),
        ("html", "HTML"),
        ("css", "CSS"),
        ("c", "C"),
        ("cpp", "C++"),
        ("java", "Java"),
        ("go", "Go"),
        ("ruby", "Ruby"),
        ("php", "PHP"),
        ("perl", "Perl"),
        ("swift", "Swift"),
        ("shell", "Shell"),
        ("bash", "Shell"),
        ("zsh", "Shell"),
        ("kotlin", "Kotlin"),
        ("dart", "Dart"),
        ("scala", "Scala"),
        ("csharp", "C#"),
        ("fsharp", "F#"),
        ("r", "R"),
        ("lua", "Lua"),
        ("julia", "Julia"),
        ("elixir", "Elixir"),
        ("clojure", "Clojure"),
        ("haskell", "Haskell"),
        ("erlang", "Erlang"),
        ("sql", "SQL"),
        ("tex", "TeX"),
        ("groovy", "Groovy"),
        ("gradle", "Groovy"),
        ("nix", "Nix"),
        ("zig", "Zig"),
        ("powershell", "PowerShell"),
        ("starlark", "Starlark"),
        ("awk", "Awk"),
        ("ocaml", "OCaml"),
        ("meson", "Meson"),
    ];

    for &(tag, language) in TAG_LANGUAGES {
        if tags.contains(tag) {
            return Some(language);
        }
    }

    None
}
217
/// Recognizes well-known file names that carry no helpful extension.
///
/// The final path component is compared case-insensitively (ASCII) against a
/// fixed list of names. Returns `None` when the path has no UTF-8 file name
/// or the name is not on the list.
fn detect_repo_special_file_name_language(path: &Path) -> Option<String> {
    let lowered = path
        .file_name()
        .and_then(|os_name| os_name.to_str())
        .map(str::to_ascii_lowercase)?;

    let language = match lowered.as_str() {
        "gemfile" | "rakefile" | "podfile" | "vagrantfile" | "brewfile" => "Ruby",
        "apkbuild" | "pkgbuild" | "gradlew" => "Shell",
        "meson.build" => "Meson",
        "containerfile.core" => "Dockerfile",
        "build" | "workspace" | "buck" => "Starlark",
        "default.nix" | "flake.nix" | "shell.nix" => "Nix",
        _ => return None,
    };

    Some(language.to_string())
}
247
/// Maps a file extension to a language using a hand-maintained table.
///
/// The extension is compared case-insensitively (ASCII). Returns `None` when
/// the path has no UTF-8 extension or the extension is not in the table.
fn detect_manual_extension_language(path: &Path) -> Option<String> {
    let ext = path
        .extension()
        .and_then(|os_ext| os_ext.to_str())?
        .to_ascii_lowercase();

    let language = match ext.as_str() {
        "rs" => "Rust",
        "py" => "Python",
        "js" | "mjs" | "cjs" | "jsx" => "JavaScript",
        "ts" | "tsx" | "mts" | "cts" => "TypeScript",
        "html" | "htm" => "HTML",
        "css" => "CSS",
        "c" | "h" => "C",
        "cpp" | "cc" | "cxx" | "hh" | "hxx" | "hpp" => "C++",
        "m" => "Objective-C",
        "mm" => "Objective-C++",
        "s" | "asm" => "GAS",
        "java" => "Java",
        "go" => "Go",
        "rb" => "Ruby",
        "php" => "PHP",
        "pl" => "Perl",
        "swift" => "Swift",
        "sql" => "SQL",
        "sh" | "bash" | "zsh" | "fish" | "ksh" => "Shell",
        "kt" | "kts" => "Kotlin",
        "dart" => "Dart",
        "scala" => "Scala",
        "cs" => "C#",
        "fs" | "fsx" => "F#",
        "r" => "R",
        "lua" => "Lua",
        "jl" => "Julia",
        "ex" | "exs" => "Elixir",
        "clj" | "cljs" | "cljc" => "Clojure",
        "hs" => "Haskell",
        "erl" => "Erlang",
        "tex" => "TeX",
        "groovy" | "gradle" | "gvy" | "gy" | "gsh" => "Groovy",
        "nix" => "Nix",
        "zig" => "Zig",
        "ps1" | "psm1" | "psd1" => "PowerShell",
        "bzl" | "bazel" | "star" | "sky" => "Starlark",
        "awk" => "Awk",
        "ml" | "mli" => "OCaml",
        _ => return None,
    };

    Some(language.to_string())
}
297
#[cfg(test)]
mod tests {
    use super::detect_language;
    use std::path::Path;

    /// Asserts that `detect_language` returns `expected` for the given file
    /// name and content bytes.
    #[track_caller]
    fn expect_language(file_name: &str, content: &[u8], expected: Option<&str>) {
        let detected = detect_language(Path::new(file_name), content);
        assert_eq!(detected, expected.map(str::to_string));
    }

    #[test]
    fn detect_language_supports_containerfile_names() {
        for file_name in ["Containerfile", "containerfile.core"].iter() {
            expect_language(file_name, b"FROM scratch\n", Some("Dockerfile"));
        }
    }

    #[test]
    fn detect_language_maps_c_headers_to_c() {
        expect_language("zlib.h", b"/* header */\n", Some("C"));
    }

    #[test]
    fn detect_language_maps_uppercase_s_to_gas() {
        expect_language("gvmat64.S", b"; asm\n", Some("GAS"));
    }

    #[test]
    fn detect_language_handles_manifest_dsl_filenames() {
        expect_language("APKBUILD", b"pkgname=demo\n", Some("Shell"));
        expect_language("Podfile", b"source 'https://rubygems.org'\n", Some("Ruby"));
        expect_language("meson.build", b"project('demo')\n", Some("Meson"));
        expect_language("BUILD", b"cc_library(name = 'demo')\n", Some("Starlark"));
        expect_language("flake.nix", b"{ inputs, ... }: {}\n", Some("Nix"));
    }

    #[test]
    fn detect_language_handles_common_build_extensions() {
        expect_language("build.gradle", b"plugins { id 'java' }\n", Some("Groovy"));
        expect_language("main.nix", b"{ pkgs }: pkgs.hello\n", Some("Nix"));
        expect_language("rules.bzl", b"def _impl(ctx):\n    pass\n", Some("Starlark"));
        expect_language("script.ps1", b"Write-Host 'hello'\n", Some("PowerShell"));
    }

    #[test]
    fn detect_language_omits_generic_text_fallbacks() {
        expect_language("README.txt", b"plain text\n", None);
        expect_language("data.bin", &[0, 159, 146, 150], None);
    }

    #[test]
    fn detect_language_ignores_yaml_as_programming_language() {
        expect_language("config.yaml", b"key: value\n", None);
    }

    #[test]
    fn detect_language_keeps_extension_detection_for_non_utf8_python() {
        // 0xE9 is not valid UTF-8 on its own, so only the extension can match.
        let latin1_python = b"# coding: latin-1\nprint(\"caf\xe9\")\n# comment padding\n";

        expect_language("script.py", latin1_python, Some("Python"));
    }

    #[test]
    fn detect_language_uses_utf8_content_hints_for_extensionless_files() {
        expect_language(
            "index",
            b"<!DOCTYPE html><html><body></body></html>",
            Some("HTML"),
        );
    }

    #[test]
    fn detect_language_does_not_use_content_hints_for_invalid_utf8() {
        expect_language("index", &[0xff, b'<', b'h', b't', b'm', b'l', b'>'], None);
    }
}