1use std::collections::HashSet;
2use std::path::Path;
3
4use content_inspector::{ContentType, inspect};
5use file_identify::tags_from_filename;
6
7fn is_utf8_text(content_type: ContentType) -> bool {
8 content_type == ContentType::UTF_8 || content_type == ContentType::UTF_8_BOM
9}
10
11pub fn detect_language(path: &Path, content: &[u8]) -> Option<String> {
12 let inspected = inspect(content);
13
14 if let Some(language) = detect_shebang_language(content) {
15 return Some(language);
16 }
17
18 if let Some(language) = detect_file_identify_language(path) {
19 return Some(language);
20 }
21
22 if let Some(language) = detect_repo_special_file_name_language(path) {
23 return Some(language);
24 }
25
26 if let Some(language) = detect_manual_extension_language(path) {
27 return Some(language);
28 }
29
30 if is_utf8_text(inspected) {
31 let text_sample = String::from_utf8_lossy(&content[..std::cmp::min(content.len(), 1000)]);
32
33 if text_sample.contains("<?php") {
34 return Some("PHP".to_string());
35 } else if text_sample.contains("<html") || text_sample.contains("<!DOCTYPE html") {
36 return Some("HTML".to_string());
37 } else if text_sample.contains("plugins {")
38 || (text_sample.contains("dependencies {") && text_sample.contains("repositories {"))
39 {
40 return Some("Groovy".to_string());
41 } else if text_sample.contains("import React") || text_sample.contains("import {") {
42 return Some("JavaScript/TypeScript".to_string());
43 } else if text_sample.contains("def ") && text_sample.contains(':') {
44 return Some("Python".to_string());
45 } else if text_sample.contains("package ")
46 && text_sample.contains("import ")
47 && text_sample.contains('{')
48 {
49 return Some("Go".to_string());
50 }
51 }
52
53 None
54}
55
/// Detects a scripting language from the `#!` line at the start of `content`.
///
/// The interpreter is taken from the first whitespace-separated token after
/// `#!`, reduced to its final path component and compared case-insensitively.
/// When that program is `env`, its remaining arguments are scanned instead
/// (skipping `-flags` and `VAR=value` assignments) and the first one that
/// names a known interpreter wins.
///
/// Only the interpreter's own name is examined, never the directories leading
/// to it, so a path such as `/home/ubuntu/bin/ruby` is not mistaken for `bun`
/// and `/usr/share/...` is not mistaken for `sh`.
///
/// Returns `None` when `content` does not start with `#!` or the interpreter
/// is not recognised.
fn detect_shebang_language(content: &[u8]) -> Option<String> {
    /// Final path component of an interpreter token (`/` or `\` separated).
    fn base_name(token: &str) -> &str {
        token
            .rsplit(|c: char| c == '/' || c == '\\')
            .next()
            .unwrap_or(token)
    }

    /// Maps a lower-cased interpreter name to a language label.
    fn classify(interpreter: &str) -> Option<&'static str> {
        let has = |needle: &str| interpreter.contains(needle);

        let language = if has("python") {
            "Python"
        } else if has("node") || has("deno") || interpreter == "bun" {
            "JavaScript"
        } else if has("ruby") {
            "Ruby"
        } else if has("perl") {
            "Perl"
        } else if has("php") {
            "PHP"
        } else if has("pwsh") || has("powershell") {
            "PowerShell"
        } else if has("awk") {
            "Awk"
        } else if has("bash") || has("zsh") || has("fish") || has("ksh") || interpreter == "sh" {
            "Shell"
        } else {
            return None;
        };

        Some(language)
    }

    let rest = content.strip_prefix(b"#!")?;
    let line_end = rest
        .iter()
        .position(|&b| b == b'\n')
        .unwrap_or(rest.len());
    let line = String::from_utf8_lossy(&rest[..line_end]).to_ascii_lowercase();

    // `split_whitespace` also discards an optional space after `#!` and a
    // trailing `\r` from CRLF line endings.
    let mut tokens = line.split_whitespace();
    let program = base_name(tokens.next()?);

    let language = if program == "env" {
        // `env [-flags] [VAR=value ...] interpreter [args]`
        tokens
            .filter(|token| !token.starts_with('-') && !token.contains('='))
            .find_map(|token| classify(base_name(token)))
    } else {
        classify(program)
    };

    language.map(str::to_string)
}
92
93fn detect_file_identify_language(path: &Path) -> Option<String> {
94 let file_name = path.file_name()?.to_str()?;
95 let tags = tags_from_filename(file_name);
96
97 map_file_identify_tags(&tags).map(str::to_string)
98}
99
/// Translates a set of file-identify tags into a language label.
///
/// The table is scanned from top to bottom and the first row with any tag
/// present in `tags` wins, so row order defines priority when a file carries
/// several language tags (for example `c` is preferred over `cpp`).
///
/// Returns `None` when no row matches.
fn map_file_identify_tags(tags: &HashSet<&'static str>) -> Option<&'static str> {
    // (tags that select the language, language label), in priority order.
    const TAG_LANGUAGES: &[(&[&str], &str)] = &[
        (&["dockerfile"], "Dockerfile"),
        (&["makefile"], "Makefile"),
        (&["rust"], "Rust"),
        (&["python"], "Python"),
        (&["javascript", "jsx"], "JavaScript"),
        (&["ts", "tsx"], "TypeScript"),
        (&["html"], "HTML"),
        (&["css"], "CSS"),
        (&["c"], "C"),
        (&["cpp"], "C++"),
        (&["java"], "Java"),
        (&["go"], "Go"),
        (&["ruby"], "Ruby"),
        (&["php"], "PHP"),
        (&["perl"], "Perl"),
        (&["swift"], "Swift"),
        (&["shell", "bash", "zsh"], "Shell"),
        (&["kotlin"], "Kotlin"),
        (&["dart"], "Dart"),
        (&["scala"], "Scala"),
        (&["csharp"], "C#"),
        (&["fsharp"], "F#"),
        (&["r"], "R"),
        (&["lua"], "Lua"),
        (&["julia"], "Julia"),
        (&["elixir"], "Elixir"),
        (&["clojure"], "Clojure"),
        (&["haskell"], "Haskell"),
        (&["erlang"], "Erlang"),
        (&["sql"], "SQL"),
        (&["tex"], "TeX"),
        (&["groovy", "gradle"], "Groovy"),
        (&["nix"], "Nix"),
        (&["zig"], "Zig"),
        (&["powershell"], "PowerShell"),
        (&["starlark"], "Starlark"),
        (&["awk"], "Awk"),
        (&["ocaml"], "OCaml"),
        (&["meson"], "Meson"),
    ];

    TAG_LANGUAGES
        .iter()
        .find(|(candidates, _)| candidates.iter().any(|tag| tags.contains(tag)))
        .map(|&(_, language)| language)
}
221
/// Recognises well-known repository file names whose language cannot be read
/// from an extension (for example `Gemfile`, `PKGBUILD` or `BUILD`).
///
/// The comparison is ASCII case-insensitive and uses only the final path
/// component. Returns `None` when the path has no UTF-8 file name or the name
/// is not in the table.
fn detect_repo_special_file_name_language(path: &Path) -> Option<String> {
    let lowered = path.file_name()?.to_str()?.to_ascii_lowercase();

    let language = match lowered.as_str() {
        "gemfile" | "rakefile" | "podfile" | "vagrantfile" | "brewfile" => "Ruby",
        "apkbuild" | "pkgbuild" | "gradlew" => "Shell",
        "meson.build" => "Meson",
        "containerfile.core" => "Dockerfile",
        "build" | "workspace" | "buck" => "Starlark",
        "default.nix" | "flake.nix" | "shell.nix" => "Nix",
        _ => return None,
    };

    Some(language.to_string())
}
251
/// Maps a file extension to a language label using a hand-maintained table.
///
/// The extension is compared ASCII case-insensitively. Returns `None` when
/// the path has no UTF-8 extension or the extension is not in the table.
fn detect_manual_extension_language(path: &Path) -> Option<String> {
    let ext = path
        .extension()
        .and_then(|raw| raw.to_str())?
        .to_ascii_lowercase();

    let language = match ext.as_str() {
        "rs" => "Rust",
        "py" => "Python",
        "js" | "mjs" | "cjs" | "jsx" => "JavaScript",
        "ts" | "tsx" | "mts" | "cts" => "TypeScript",
        "html" | "htm" => "HTML",
        "css" => "CSS",
        // Plain `.h` headers are attributed to C rather than C++.
        "c" | "h" => "C",
        "cpp" | "cc" | "cxx" | "hh" | "hxx" | "hpp" => "C++",
        "m" => "Objective-C",
        "mm" => "Objective-C++",
        "s" | "asm" => "GAS",
        "java" => "Java",
        "go" => "Go",
        "rb" => "Ruby",
        "php" => "PHP",
        "pl" => "Perl",
        "swift" => "Swift",
        "sql" => "SQL",
        "sh" | "bash" | "zsh" | "fish" | "ksh" => "Shell",
        "kt" | "kts" => "Kotlin",
        "dart" => "Dart",
        "scala" => "Scala",
        "cs" => "C#",
        "fs" | "fsx" => "F#",
        "r" => "R",
        "lua" => "Lua",
        "jl" => "Julia",
        "ex" | "exs" => "Elixir",
        "clj" | "cljs" | "cljc" => "Clojure",
        "hs" => "Haskell",
        "erl" => "Erlang",
        "tex" => "TeX",
        "groovy" | "gradle" | "gvy" | "gy" | "gsh" => "Groovy",
        "nix" => "Nix",
        "zig" => "Zig",
        "ps1" | "psm1" | "psd1" => "PowerShell",
        "bzl" | "bazel" | "star" | "sky" => "Starlark",
        "awk" => "Awk",
        "ml" | "mli" => "OCaml",
        _ => return None,
    };

    Some(language.to_owned())
}
301
#[cfg(test)]
mod tests {
    use super::detect_language;
    use std::path::Path;

    /// Asserts that `detect_language` returns `expected` for the given file
    /// name and raw content. Failures are reported at the caller's line.
    #[track_caller]
    fn assert_detects(file_name: &str, content: &[u8], expected: Option<&str>) {
        assert_eq!(
            detect_language(Path::new(file_name), content),
            expected.map(str::to_string)
        );
    }

    #[test]
    fn detect_language_supports_containerfile_names() {
        assert_detects("Containerfile", b"FROM scratch\n", Some("Dockerfile"));
        assert_detects("containerfile.core", b"FROM scratch\n", Some("Dockerfile"));
    }

    #[test]
    fn detect_language_maps_c_headers_to_c() {
        assert_detects("zlib.h", b"/* header */\n", Some("C"));
    }

    #[test]
    fn detect_language_maps_uppercase_s_to_gas() {
        assert_detects("gvmat64.S", b"; asm\n", Some("GAS"));
    }

    #[test]
    fn detect_language_handles_manifest_dsl_filenames() {
        assert_detects("APKBUILD", b"pkgname=demo\n", Some("Shell"));
        assert_detects("Podfile", b"source 'https://rubygems.org'\n", Some("Ruby"));
        assert_detects("meson.build", b"project('demo')\n", Some("Meson"));
        assert_detects("BUILD", b"cc_library(name = 'demo')\n", Some("Starlark"));
        assert_detects("flake.nix", b"{ inputs, ... }: {}\n", Some("Nix"));
    }

    #[test]
    fn detect_language_handles_common_build_extensions() {
        assert_detects("build.gradle", b"plugins { id 'java' }\n", Some("Groovy"));
        assert_detects("main.nix", b"{ pkgs }: pkgs.hello\n", Some("Nix"));
        assert_detects("rules.bzl", b"def _impl(ctx):\n pass\n", Some("Starlark"));
        assert_detects("script.ps1", b"Write-Host 'hello'\n", Some("PowerShell"));
    }

    #[test]
    fn detect_language_omits_generic_text_fallbacks() {
        assert_detects("README.txt", b"plain text\n", None);
        // Bytes that are not valid UTF-8 text and carry no known extension.
        assert_detects("data.bin", &[0, 159, 146, 150], None);
    }

    #[test]
    fn detect_language_ignores_yaml_as_programming_language() {
        assert_detects("config.yaml", b"key: value\n", None);
    }

    #[test]
    fn detect_language_keeps_extension_detection_for_non_utf8_python() {
        // `\xe9` makes the content invalid UTF-8; the `.py` extension must
        // still be honoured.
        let latin1_python = b"# coding: latin-1\nprint(\"caf\xe9\")\n# comment padding\n";

        assert_detects("script.py", latin1_python, Some("Python"));
    }
}