1use std::collections::HashSet;
2use std::path::Path;
3
4use file_identify::tags_from_filename;
5
6pub fn detect_language(path: &Path, content: &[u8]) -> Option<String> {
7 if let Some(language) = detect_shebang_language(content) {
8 return Some(language);
9 }
10
11 if let Some(language) = detect_file_identify_language(path) {
12 return Some(language);
13 }
14
15 if let Some(language) = detect_repo_special_file_name_language(path) {
16 return Some(language);
17 }
18
19 if let Some(language) = detect_manual_extension_language(path) {
20 return Some(language);
21 }
22
23 detect_content_hint_language(content)
24}
25
/// Guesses a language from recognisable markers near the start of `content`.
///
/// Only the first 1000 bytes are inspected, and they must be valid UTF-8.
/// Because the cut is made at a byte offset it can land in the middle of a
/// multi-byte character; in that case the incomplete trailing character is
/// dropped instead of rejecting the whole (otherwise valid) sample. Input that
/// is genuinely invalid UTF-8, or that simply ends in a truncated character
/// without having been cut by this function, yields `None`.
///
/// The hints are checked in a fixed order (PHP, HTML, Groovy,
/// JavaScript/TypeScript, Python, Go) and the first match is returned.
fn detect_content_hint_language(content: &[u8]) -> Option<String> {
    const SAMPLE_LIMIT: usize = 1000;

    let sample = &content[..content.len().min(SAMPLE_LIMIT)];
    let text_sample = match std::str::from_utf8(sample) {
        Ok(text) => text,
        // `error_len() == None` means the input ended inside a character.
        // That is only tolerated when the sample was really truncated here,
        // i.e. the missing bytes exist in `content` beyond the limit.
        Err(err) if err.error_len().is_none() && sample.len() < content.len() => {
            std::str::from_utf8(&sample[..err.valid_up_to()]).ok()?
        }
        Err(_) => return None,
    };

    let has = |needle: &str| text_sample.contains(needle);

    let language = if has("<?php") {
        "PHP"
    } else if has("<html") || has("<!DOCTYPE html") {
        "HTML"
    } else if has("plugins {") || (has("dependencies {") && has("repositories {")) {
        "Groovy"
    } else if has("import React") || has("import {") {
        "JavaScript/TypeScript"
    } else if has("def ") && has(":") {
        "Python"
    } else if has("package ") && has("import ") && has("{") {
        "Go"
    } else {
        return None;
    };

    Some(language.to_string())
}
51
/// Detects the language from a `#!` interpreter line at the start of `content`.
///
/// The first line (up to the first `\n`, or the whole input if there is none)
/// is decoded lossily, lower-cased and split on whitespace. Each token that is
/// not a flag (`-x`) or an environment assignment (`VAR=value`) is reduced to
/// its final path component and classified; the first token that maps to a
/// known interpreter wins. Looking at basenames only keeps directory names out
/// of the decision (e.g. `/home/ubuntu/bin/bash` is Shell even though the path
/// contains "bun"), and scanning past unknown launchers lets wrappers such as
/// `/usr/bin/env sh` resolve to the real interpreter.
///
/// Returns `None` when `content` does not start with `#!` or no token names a
/// recognised interpreter.
fn detect_shebang_language(content: &[u8]) -> Option<String> {
    if !content.starts_with(b"#!") {
        return None;
    }

    let rest = &content[2..];
    let line_end = rest
        .iter()
        .position(|&byte| byte == b'\n')
        .unwrap_or(rest.len());
    let line = String::from_utf8_lossy(&rest[..line_end]).to_ascii_lowercase();

    line.split_whitespace()
        .filter(|token| !token.starts_with('-') && !token.contains('='))
        .find_map(|token| {
            let name = token.rsplit('/').next().unwrap_or(token);
            classify_shebang_interpreter(name)
        })
        .map(str::to_string)
}

/// Maps a lower-cased interpreter basename (e.g. `python3`, `bash`) to a language name.
fn classify_shebang_interpreter(name: &str) -> Option<&'static str> {
    let has = |needle: &str| name.contains(needle);

    if has("python") {
        Some("Python")
    } else if has("node") || has("deno") || has("bun") {
        Some("JavaScript")
    } else if has("ruby") {
        Some("Ruby")
    } else if has("perl") {
        Some("Perl")
    } else if has("php") {
        Some("PHP")
    } else if has("pwsh") || has("powershell") {
        Some("PowerShell")
    } else if has("awk") {
        Some("Awk")
    } else if has("bash") || has("zsh") || has("fish") || has("ksh") || name == "sh" {
        // Plain `sh` is matched exactly so unrelated names containing "sh" do not count.
        Some("Shell")
    } else {
        None
    }
}
88
89fn detect_file_identify_language(path: &Path) -> Option<String> {
90 let file_name = path.file_name()?.to_str()?;
91 let tags = tags_from_filename(file_name);
92
93 map_file_identify_tags(&tags).map(str::to_string)
94}
95
/// Translates a set of filename tags into a display language name.
///
/// The table below is scanned from top to bottom and the first row with at
/// least one tag present in `tags` decides the result, so row order is the
/// priority order when several tags are present at once. Returns `None` when
/// no row matches.
fn map_file_identify_tags(tags: &HashSet<&'static str>) -> Option<&'static str> {
    // (tags that select the language, language name) — order is significant.
    const TAG_LANGUAGES: &[(&[&str], &str)] = &[
        (&["dockerfile"], "Dockerfile"),
        (&["makefile"], "Makefile"),
        (&["rust"], "Rust"),
        (&["python"], "Python"),
        (&["javascript", "jsx"], "JavaScript"),
        (&["ts", "tsx"], "TypeScript"),
        (&["html"], "HTML"),
        (&["css"], "CSS"),
        (&["c"], "C"),
        (&["cpp"], "C++"),
        (&["java"], "Java"),
        (&["go"], "Go"),
        (&["ruby"], "Ruby"),
        (&["php"], "PHP"),
        (&["perl"], "Perl"),
        (&["swift"], "Swift"),
        (&["shell", "bash", "zsh"], "Shell"),
        (&["kotlin"], "Kotlin"),
        (&["dart"], "Dart"),
        (&["scala"], "Scala"),
        (&["csharp"], "C#"),
        (&["fsharp"], "F#"),
        (&["r"], "R"),
        (&["lua"], "Lua"),
        (&["julia"], "Julia"),
        (&["elixir"], "Elixir"),
        (&["clojure"], "Clojure"),
        (&["haskell"], "Haskell"),
        (&["erlang"], "Erlang"),
        (&["sql"], "SQL"),
        (&["tex"], "TeX"),
        (&["groovy", "gradle"], "Groovy"),
        (&["nix"], "Nix"),
        (&["zig"], "Zig"),
        (&["powershell"], "PowerShell"),
        (&["starlark"], "Starlark"),
        (&["awk"], "Awk"),
        (&["ocaml"], "OCaml"),
        (&["meson"], "Meson"),
    ];

    TAG_LANGUAGES
        .iter()
        .find(|(candidates, _)| candidates.iter().any(|tag| tags.contains(*tag)))
        .map(|&(_, language)| language)
}
217
/// Recognises well-known file names whose language is implied by the name
/// itself rather than by an extension.
///
/// The final path component is compared case-insensitively (ASCII) against a
/// fixed list of names. A path without a final component, or whose name is
/// not valid UTF-8, matches nothing and yields `None`.
fn detect_repo_special_file_name_language(path: &Path) -> Option<String> {
    let lowered = path.file_name()?.to_str()?.to_ascii_lowercase();

    let language = match lowered.as_str() {
        "gemfile" | "rakefile" | "podfile" | "vagrantfile" | "brewfile" => "Ruby",
        "apkbuild" | "pkgbuild" | "gradlew" => "Shell",
        "meson.build" => "Meson",
        "containerfile.core" => "Dockerfile",
        "build" | "workspace" | "buck" => "Starlark",
        "default.nix" | "flake.nix" | "shell.nix" => "Nix",
        _ => return None,
    };

    Some(language.to_string())
}
247
/// Maps the extension of `path` to a language using a hand-maintained table.
///
/// The extension is compared case-insensitively (ASCII). Returns `None` when
/// the path has no extension, the extension is not valid UTF-8, or it is not
/// listed in the table.
fn detect_manual_extension_language(path: &Path) -> Option<String> {
    let ext = path
        .extension()
        .and_then(|raw| raw.to_str())?
        .to_ascii_lowercase();

    let language = match ext.as_str() {
        "rs" => "Rust",
        "py" => "Python",
        "js" | "mjs" | "cjs" | "jsx" => "JavaScript",
        "ts" | "tsx" | "mts" | "cts" => "TypeScript",
        "html" | "htm" => "HTML",
        "css" => "CSS",
        "c" | "h" => "C",
        "cpp" | "cc" | "cxx" | "hh" | "hxx" | "hpp" => "C++",
        "m" => "Objective-C",
        "mm" => "Objective-C++",
        "s" | "asm" => "GAS",
        "java" => "Java",
        "go" => "Go",
        "rb" => "Ruby",
        "php" => "PHP",
        "pl" => "Perl",
        "swift" => "Swift",
        "sql" => "SQL",
        "sh" | "bash" | "zsh" | "fish" | "ksh" => "Shell",
        "kt" | "kts" => "Kotlin",
        "dart" => "Dart",
        "scala" => "Scala",
        "cs" => "C#",
        "fs" | "fsx" => "F#",
        "r" => "R",
        "lua" => "Lua",
        "jl" => "Julia",
        "ex" | "exs" => "Elixir",
        "clj" | "cljs" | "cljc" => "Clojure",
        "hs" => "Haskell",
        "erl" => "Erlang",
        "tex" => "TeX",
        "groovy" | "gradle" | "gvy" | "gy" | "gsh" => "Groovy",
        "nix" => "Nix",
        "zig" => "Zig",
        "ps1" | "psm1" | "psd1" => "PowerShell",
        "bzl" | "bazel" | "star" | "sky" => "Starlark",
        "awk" => "Awk",
        "ml" | "mli" => "OCaml",
        _ => return None,
    };

    Some(language.to_owned())
}
297
#[cfg(test)]
mod tests {
    use super::detect_language;
    use std::path::Path;

    /// Asserts that `detect_language` reports `expected` for a file named
    /// `file_name` whose bytes are `content`.
    fn assert_detected(file_name: &str, content: &[u8], expected: Option<&str>) {
        assert_eq!(
            detect_language(Path::new(file_name), content),
            expected.map(str::to_string)
        );
    }

    // `Containerfile` variants are reported as Dockerfile.
    #[test]
    fn detect_language_supports_containerfile_names() {
        assert_detected("Containerfile", b"FROM scratch\n", Some("Dockerfile"));
        assert_detected("containerfile.core", b"FROM scratch\n", Some("Dockerfile"));
    }

    // A `.h` header is reported as C.
    #[test]
    fn detect_language_maps_c_headers_to_c() {
        assert_detected("zlib.h", b"/* header */\n", Some("C"));
    }

    // The extension comparison is case-insensitive: `.S` is assembly.
    #[test]
    fn detect_language_maps_uppercase_s_to_gas() {
        assert_detected("gvmat64.S", b"; asm\n", Some("GAS"));
    }

    // Manifest-style files are identified by their file name.
    #[test]
    fn detect_language_handles_manifest_dsl_filenames() {
        assert_detected("APKBUILD", b"pkgname=demo\n", Some("Shell"));
        assert_detected("Podfile", b"source 'https://rubygems.org'\n", Some("Ruby"));
        assert_detected("meson.build", b"project('demo')\n", Some("Meson"));
        assert_detected("BUILD", b"cc_library(name = 'demo')\n", Some("Starlark"));
        assert_detected("flake.nix", b"{ inputs, ... }: {}\n", Some("Nix"));
    }

    // Build-tool extensions win over what the content alone might suggest.
    #[test]
    fn detect_language_handles_common_build_extensions() {
        assert_detected("build.gradle", b"plugins { id 'java' }\n", Some("Groovy"));
        assert_detected("main.nix", b"{ pkgs }: pkgs.hello\n", Some("Nix"));
        assert_detected("rules.bzl", b"def _impl(ctx):\n pass\n", Some("Starlark"));
        assert_detected("script.ps1", b"Write-Host 'hello'\n", Some("PowerShell"));
    }

    // Plain text and binary data are not assigned a language.
    #[test]
    fn detect_language_omits_generic_text_fallbacks() {
        assert_detected("README.txt", b"plain text\n", None);
        assert_detected("data.bin", &[0, 159, 146, 150], None);
    }

    // YAML is not reported as a programming language.
    #[test]
    fn detect_language_ignores_yaml_as_programming_language() {
        assert_detected("config.yaml", b"key: value\n", None);
    }

    // Content that is not valid UTF-8 must not prevent path-based detection.
    #[test]
    fn detect_language_keeps_extension_detection_for_non_utf8_python() {
        let latin1_python = b"# coding: latin-1\nprint(\"caf\xe9\")\n# comment padding\n";

        assert_detected("script.py", latin1_python, Some("Python"));
    }

    // Without a usable file name, UTF-8 content hints decide.
    #[test]
    fn detect_language_uses_utf8_content_hints_for_extensionless_files() {
        assert_detected(
            "index",
            b"<!DOCTYPE html><html><body></body></html>",
            Some("HTML"),
        );
    }

    // Content hints are skipped when the bytes are not valid UTF-8.
    #[test]
    fn detect_language_does_not_use_content_hints_for_invalid_utf8() {
        assert_detected("index", &[0xff, b'<', b'h', b't', b'm', b'l', b'>'], None);
    }
}