Skip to main content

provenant/utils/
language.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4use std::collections::HashSet;
5use std::path::Path;
6
7use file_identify::tags_from_filename;
8
9pub fn detect_language(path: &Path, content: &[u8]) -> Option<String> {
10    if let Some(language) = detect_shebang_language(content) {
11        return Some(language);
12    }
13
14    if let Some(language) = detect_file_identify_language(path) {
15        return Some(language);
16    }
17
18    if let Some(language) = detect_repo_special_file_name_language(path) {
19        return Some(language);
20    }
21
22    if let Some(language) = detect_manual_extension_language(path) {
23        return Some(language);
24    }
25
26    detect_content_hint_language(content)
27}
28
/// Guesses a language from textual markers near the start of `content`.
///
/// Only the first 1000 bytes are inspected and they must be valid UTF-8.
/// When the 1000-byte cut lands in the middle of a multi-byte character, the
/// incomplete trailing bytes are dropped instead of rejecting the whole
/// sample. Content that is genuinely not UTF-8 yields `None`.
///
/// Markers are checked in priority order (PHP, HTML, Groovy,
/// JavaScript/TypeScript, Python, Go); the first match wins. Returns `None`
/// when nothing matches.
fn detect_content_hint_language(content: &[u8]) -> Option<String> {
    // Upper bound on the number of leading bytes scanned for hints.
    const SAMPLE_LIMIT: usize = 1000;

    let sample = &content[..content.len().min(SAMPLE_LIMIT)];
    let text_sample = match std::str::from_utf8(sample) {
        Ok(text) => text,
        // `error_len() == None` means the input ended in the middle of a
        // character. If we truncated the content ourselves, that is an
        // artifact of the cut, so keep the valid prefix.
        Err(error) if sample.len() < content.len() && error.error_len().is_none() => {
            std::str::from_utf8(&sample[..error.valid_up_to()]).ok()?
        }
        Err(_) => return None,
    };

    if text_sample.contains("<?php") {
        Some("PHP".to_string())
    } else if text_sample.contains("<html") || text_sample.contains("<!DOCTYPE html") {
        Some("HTML".to_string())
    } else if text_sample.contains("plugins {")
        || (text_sample.contains("dependencies {") && text_sample.contains("repositories {"))
    {
        // Gradle build scripts.
        Some("Groovy".to_string())
    } else if text_sample.contains("import React") || text_sample.contains("import {") {
        Some("JavaScript/TypeScript".to_string())
    } else if has_python_definition_line(text_sample) {
        Some("Python".to_string())
    } else if text_sample.contains("package ")
        && text_sample.contains("import ")
        && text_sample.contains('{')
    {
        Some("Go".to_string())
    } else {
        None
    }
}

/// Returns `true` when some line, ignoring leading whitespace, starts with
/// `def ` and also contains a `:`.
///
/// Requiring the `def ` prefix at the start of the line avoids matching text
/// such as a C `default:` label.
fn has_python_definition_line(text: &str) -> bool {
    text.lines().any(|line| {
        let trimmed = line.trim_start();
        trimmed.starts_with("def ") && trimmed.contains(':')
    })
}
61
/// Detects the language from a `#!` interpreter line at the start of `content`.
///
/// The first line (up to the first `\n`, or the whole content when there is
/// none) is lower-cased and searched for known interpreter names. Checks run
/// in a fixed priority order, so more specific shells such as `bash` or `zsh`
/// are tested before the generic `sh` fallback.
///
/// Returns `None` when the content does not start with `#!`, is too short to
/// name an interpreter, or names an interpreter that is not recognised.
fn detect_shebang_language(content: &[u8]) -> Option<String> {
    // A bare "#!" (two bytes) cannot name an interpreter.
    if content.len() <= 2 || !content.starts_with(b"#!") {
        return None;
    }

    let first_line = content
        .split(|&byte| byte == b'\n')
        .next()
        .unwrap_or(content);
    let shebang = String::from_utf8_lossy(first_line).to_ascii_lowercase();
    let mentions = |needle: &str| shebang.contains(needle);

    let language = if mentions("python") {
        "Python"
    } else if mentions("node") || mentions("deno") || mentions("bun") {
        "JavaScript"
    } else if mentions("bash") {
        "Bash"
    } else if mentions("zsh") {
        "Zsh"
    } else if mentions("fish") {
        "Fish"
    } else if mentions("ksh") {
        "Ksh"
    } else if mentions("ruby") {
        "Ruby"
    } else if mentions("perl") {
        "Perl"
    } else if mentions("php") {
        "PHP"
    } else if mentions("pwsh") || mentions("powershell") {
        "PowerShell"
    } else if mentions("awk") {
        "Awk"
    } else if mentions("/sh") || shebang.split_whitespace().any(|word| word == "sh") {
        // "/sh" covers `#!/bin/sh`; the standalone-word check covers the
        // env form `#!/usr/bin/env sh`, which contains no "/sh".
        "Shell"
    } else {
        return None;
    };

    Some(language.to_string())
}
101
102fn detect_file_identify_language(path: &Path) -> Option<String> {
103    let file_name = path.file_name()?.to_str()?;
104    let tags = tags_from_filename(file_name);
105
106    map_file_identify_tags(&tags).map(str::to_string)
107}
108
/// Translates a set of `file_identify` tags into a display language name.
///
/// The table below is scanned top to bottom and the first tag present in
/// `tags` decides the result, so row order encodes priority (for example a
/// set holding both `c` and `cpp` resolves to `"C"`). Returns `None` when no
/// listed tag is present.
fn map_file_identify_tags(tags: &HashSet<&'static str>) -> Option<&'static str> {
    // (tag, language) pairs in priority order. Several tags may share a language.
    const TAG_PRIORITY: &[(&str, &str)] = &[
        ("dockerfile", "Dockerfile"),
        ("makefile", "Makefile"),
        ("rust", "Rust"),
        ("python", "Python"),
        ("javascript", "JavaScript"),
        ("jsx", "JavaScript"),
        ("ts", "TypeScript"),
        ("tsx", "TypeScript"),
        ("html", "HTML"),
        ("css", "CSS"),
        ("c", "C"),
        ("cpp", "C++"),
        ("java", "Java"),
        ("go", "Go"),
        ("ruby", "Ruby"),
        ("php", "PHP"),
        ("perl", "Perl"),
        ("swift", "Swift"),
        ("shell", "Shell"),
        ("bash", "Shell"),
        ("zsh", "Shell"),
        ("kotlin", "Kotlin"),
        ("dart", "Dart"),
        ("scala", "Scala"),
        ("csharp", "C#"),
        ("fsharp", "F#"),
        ("r", "R"),
        ("lua", "Lua"),
        ("julia", "Julia"),
        ("elixir", "Elixir"),
        ("clojure", "Clojure"),
        ("haskell", "Haskell"),
        ("erlang", "Erlang"),
        ("sql", "SQL"),
        ("tex", "TeX"),
        ("groovy", "Groovy"),
        ("gradle", "Groovy"),
        ("nix", "Nix"),
        ("zig", "Zig"),
        ("powershell", "PowerShell"),
        ("starlark", "Starlark"),
        ("awk", "Awk"),
        ("ocaml", "OCaml"),
        ("meson", "Meson"),
    ];

    TAG_PRIORITY
        .iter()
        .find(|&&(tag, _)| tags.contains(tag))
        .map(|&(_, language)| language)
}
230
/// Recognises well-known extensionless or specially named repository files.
///
/// The final path component is compared case-insensitively (ASCII) against a
/// fixed list of names. A missing or non-UTF-8 file name is treated as the
/// empty string, which matches nothing.
fn detect_repo_special_file_name_language(path: &Path) -> Option<String> {
    let lowered = path
        .file_name()
        .and_then(|raw| raw.to_str())
        .map(str::to_ascii_lowercase)
        .unwrap_or_default();

    let language = match lowered.as_str() {
        "gemfile" | "rakefile" | "podfile" | "vagrantfile" | "brewfile" => "Ruby",
        "apkbuild" | "pkgbuild" | "gradlew" => "Shell",
        "jamfile" | "jamroot" => "Jamfile",
        "meson.build" => "Meson",
        "containerfile.core" => "Dockerfile",
        "build" | "workspace" | "buck" => "Starlark",
        "default.nix" | "flake.nix" | "shell.nix" => "Nix",
        _ => return None,
    };

    Some(language.to_string())
}
262
/// Maps a file extension to a language using a hand-maintained table.
///
/// The extension is compared case-insensitively (ASCII). Returns `None` when
/// the path has no extension, the extension is not valid UTF-8, or it is not
/// listed in the table.
fn detect_manual_extension_language(path: &Path) -> Option<String> {
    let ext = path
        .extension()
        .and_then(|raw| raw.to_str())?
        .to_ascii_lowercase();

    let language = match ext.as_str() {
        "rs" => "Rust",
        "py" => "Python",
        "js" | "mjs" | "cjs" | "jsx" => "JavaScript",
        "ts" | "tsx" | "mts" | "cts" => "TypeScript",
        "html" | "htm" => "HTML",
        "css" => "CSS",
        // Plain `.h` headers are attributed to C rather than C++.
        "c" | "h" => "C",
        "cpp" | "cc" | "cxx" | "hh" | "hxx" | "hpp" => "C++",
        "m" => "Objective-C",
        "mm" => "Objective-C++",
        "s" | "asm" => "GAS",
        "java" => "Java",
        "go" => "Go",
        "rb" => "Ruby",
        "php" => "PHP",
        "pl" => "Perl",
        "swift" => "Swift",
        "sql" => "SQL",
        "sh" => "Shell",
        "bash" => "Bash",
        "zsh" => "Zsh",
        "fish" => "Fish",
        "ksh" => "Ksh",
        "bat" | "cmd" => "Batchfile",
        "kt" | "kts" => "Kotlin",
        "dart" => "Dart",
        "scala" => "Scala",
        "cs" => "C#",
        "fs" | "fsx" => "F#",
        "r" => "R",
        "lua" => "Lua",
        "jl" => "Julia",
        "ex" | "exs" => "Elixir",
        "clj" | "cljs" | "cljc" => "Clojure",
        "hs" => "Haskell",
        "erl" => "Erlang",
        "tex" => "TeX",
        "groovy" | "gradle" | "gvy" | "gy" | "gsh" => "Groovy",
        "cmake" => "CMake",
        "nix" => "Nix",
        "zig" => "Zig",
        "ps1" | "psm1" | "psd1" => "PowerShell",
        "bzl" | "bazel" | "star" | "sky" => "Starlark",
        "awk" => "Awk",
        "ml" | "mli" => "OCaml",
        _ => return None,
    };

    Some(language.to_owned())
}
318
#[cfg(test)]
mod tests {
    use super::detect_language;
    use std::path::Path;

    /// Runs `detect_language` for a file called `name` holding `content`.
    fn detect(name: &str, content: &[u8]) -> Option<String> {
        detect_language(Path::new(name), content)
    }

    /// Builds the expected value for a successful detection.
    fn lang(name: &str) -> Option<String> {
        Some(name.to_string())
    }

    #[test]
    fn detect_language_supports_containerfile_names() {
        assert_eq!(detect("Containerfile", b"FROM scratch\n"), lang("Dockerfile"));
        assert_eq!(
            detect("containerfile.core", b"FROM scratch\n"),
            lang("Dockerfile")
        );
    }

    #[test]
    fn detect_language_maps_c_headers_to_c() {
        assert_eq!(detect("zlib.h", b"/* header */\n"), lang("C"));
    }

    #[test]
    fn detect_language_maps_uppercase_s_to_gas() {
        assert_eq!(detect("gvmat64.S", b"; asm\n"), lang("GAS"));
    }

    #[test]
    fn detect_language_handles_manifest_dsl_filenames() {
        assert_eq!(detect("APKBUILD", b"pkgname=demo\n"), lang("Shell"));
        assert_eq!(
            detect("Podfile", b"source 'https://rubygems.org'\n"),
            lang("Ruby")
        );
        assert_eq!(detect("meson.build", b"project('demo')\n"), lang("Meson"));
        assert_eq!(
            detect("BUILD", b"cc_library(name = 'demo')\n"),
            lang("Starlark")
        );
        assert_eq!(detect("flake.nix", b"{ inputs, ... }: {}\n"), lang("Nix"));
    }

    #[test]
    fn detect_language_handles_common_build_extensions() {
        assert_eq!(
            detect("build.gradle", b"plugins { id 'java' }\n"),
            lang("Groovy")
        );
        assert_eq!(
            detect("toolchain.cmake", b"set(CMAKE_CXX_STANDARD 20)\n"),
            lang("CMake")
        );
        assert_eq!(detect("main.nix", b"{ pkgs }: pkgs.hello\n"), lang("Nix"));
        assert_eq!(
            detect("rules.bzl", b"def _impl(ctx):\n    pass\n"),
            lang("Starlark")
        );
        assert_eq!(
            detect("script.ps1", b"Write-Host 'hello'\n"),
            lang("PowerShell")
        );
    }

    #[test]
    fn detect_language_maps_batch_and_ipp_extensions() {
        assert_eq!(detect("build.cmd", b"@echo off\r\n"), lang("Batchfile"));
        assert_eq!(
            detect("from_chars.ipp", b"template <class T> void parse();\n"),
            None
        );
    }

    #[test]
    fn detect_language_handles_jamfile_names() {
        assert_eq!(detect("Jamfile", b"lib boost_json ;\n"), lang("Jamfile"));
    }

    #[test]
    fn detect_language_omits_generic_text_fallbacks() {
        assert_eq!(detect("README.txt", b"plain text\n"), None);
        assert_eq!(detect("data.bin", &[0, 159, 146, 150]), None);
    }

    #[test]
    fn detect_language_ignores_yaml_as_programming_language() {
        assert_eq!(detect("config.yaml", b"key: value\n"), None);
    }

    #[test]
    fn detect_language_keeps_extension_detection_for_non_utf8_python() {
        // 0xE9 is "é" in Latin-1 and makes the content invalid UTF-8.
        let latin1_python = b"# coding: latin-1\nprint(\"caf\xe9\")\n# comment padding\n";

        assert_eq!(detect("script.py", latin1_python), lang("Python"));
    }

    #[test]
    fn detect_language_uses_utf8_content_hints_for_extensionless_files() {
        assert_eq!(
            detect("index", b"<!DOCTYPE html><html><body></body></html>"),
            lang("HTML")
        );
    }

    #[test]
    fn detect_language_does_not_infer_python_from_default_labels() {
        assert_eq!(
            detect(
                "from_chars.ipp",
                b"switch (value) {\n  default: return parse();\n}\n"
            ),
            None
        );
    }

    #[test]
    fn detect_language_does_not_use_content_hints_for_invalid_utf8() {
        assert_eq!(
            detect("index", &[0xff, b'<', b'h', b't', b'm', b'l', b'>']),
            None
        );
    }
}
488}