1use std::collections::HashSet;
2use std::path::Path;
3
4use file_identify::tags_from_filename;
5
6pub fn detect_language(path: &Path, content: &[u8]) -> Option<String> {
7 if let Some(language) = detect_shebang_language(content) {
8 return Some(language);
9 }
10
11 if let Some(language) = detect_file_identify_language(path) {
12 return Some(language);
13 }
14
15 if let Some(language) = detect_repo_special_file_name_language(path) {
16 return Some(language);
17 }
18
19 if let Some(language) = detect_manual_extension_language(path) {
20 return Some(language);
21 }
22
23 detect_content_hint_language(content)
24}
25
/// Guesses a language from textual markers near the start of `content`.
///
/// Only the first 1000 bytes are inspected. The sample must be valid UTF-8,
/// otherwise `None` is returned. When the 1000-byte cut lands in the middle of
/// a multi-byte character, the incomplete trailing bytes are dropped instead
/// of rejecting the whole sample.
///
/// Markers are checked in this order, first match wins: PHP open tag, HTML
/// root/doctype, Gradle-style blocks (reported as Groovy), ES-module imports,
/// a Python `def ...:` line, and finally a Go-like `package`/`import`/`{`
/// combination.
fn detect_content_hint_language(content: &[u8]) -> Option<String> {
    const SAMPLE_LIMIT: usize = 1000;

    let was_truncated = content.len() > SAMPLE_LIMIT;
    let sample = &content[..content.len().min(SAMPLE_LIMIT)];

    let text_sample = match std::str::from_utf8(sample) {
        Ok(text) => text,
        // `error_len() == None` means the sample merely ended inside a
        // character. If that ending is our own truncation point, the cut split
        // a character, so keep the valid prefix. Bytes beyond the limit are
        // deliberately not inspected.
        Err(err) if was_truncated && err.error_len().is_none() => {
            std::str::from_utf8(&sample[..err.valid_up_to()]).ok()?
        }
        Err(_) => return None,
    };

    let has = |needle: &str| text_sample.contains(needle);

    let language = if has("<?php") {
        "PHP"
    } else if has("<html") || has("<!DOCTYPE html") {
        "HTML"
    } else if has("plugins {") || (has("dependencies {") && has("repositories {")) {
        "Groovy"
    } else if has("import React") || has("import {") {
        "JavaScript/TypeScript"
    } else if has_python_definition_line(text_sample) {
        "Python"
    } else if has("package ") && has("import ") && text_sample.contains('{') {
        "Go"
    } else {
        return None;
    };

    Some(language.to_string())
}

/// Returns `true` when any line, ignoring leading whitespace, starts with
/// `def ` and also contains a `:`.
///
/// Requiring the line to *start* with `def ` keeps labels such as
/// `default:` from being mistaken for a function definition.
fn has_python_definition_line(text: &str) -> bool {
    text.lines()
        .map(str::trim_start)
        .any(|line| line.starts_with("def ") && line.contains(':'))
}
58
/// Detects the language from a `#!` interpreter line at the start of `content`.
///
/// The first line (up to the first `\n`) is lower-cased and split on
/// whitespace. Each token is reduced to its final `/`-separated component and
/// the first token that names a known interpreter decides the result. Looking
/// at token basenames rather than the raw line means that:
///
/// * directory names in the interpreter path do not influence the result
///   (`/home/node/bin/bash` is Bash, not JavaScript);
/// * launcher forms such as `/usr/bin/env sh` or `env -S deno run` work,
///   because `env` and its flags simply do not match and are skipped;
/// * `bun` and `sh` are matched exactly, so names like `bundle` are not
///   treated as the Bun runtime.
///
/// Returns `None` when `content` does not start with `#!` or no token names a
/// known interpreter.
fn detect_shebang_language(content: &[u8]) -> Option<String> {
    if !content.starts_with(b"#!") {
        return None;
    }

    let after_marker = &content[2..];
    let line_end = after_marker
        .iter()
        .position(|&byte| byte == b'\n')
        .unwrap_or(after_marker.len());
    // Lossy decoding is fine here: only ASCII interpreter names are matched.
    let line = String::from_utf8_lossy(&after_marker[..line_end]).to_ascii_lowercase();

    line.split_whitespace()
        .find_map(|token| {
            // `rsplit` always yields at least one item; the fallback is inert.
            let name = token.rsplit('/').next().unwrap_or(token);
            shebang_interpreter_language(name)
        })
        .map(str::to_string)
}

/// Maps a lower-cased interpreter basename (e.g. `python3.11`, `gawk`) to a
/// language label.
///
/// Most names are matched by substring so versioned or prefixed binaries are
/// recognised; `bun` and `sh` require an exact match because they are short
/// enough to occur inside unrelated names.
fn shebang_interpreter_language(name: &str) -> Option<&'static str> {
    let language = if name.contains("python") {
        "Python"
    } else if name.contains("node") || name.contains("deno") || name == "bun" {
        "JavaScript"
    } else if name.contains("bash") {
        "Bash"
    } else if name.contains("zsh") {
        "Zsh"
    } else if name.contains("fish") {
        "Fish"
    } else if name.contains("ksh") {
        "Ksh"
    } else if name.contains("ruby") {
        "Ruby"
    } else if name.contains("perl") {
        "Perl"
    } else if name.contains("php") {
        "PHP"
    } else if name.contains("pwsh") || name.contains("powershell") {
        "PowerShell"
    } else if name.contains("awk") {
        "Awk"
    } else if name == "sh" {
        "Shell"
    } else {
        return None;
    };

    Some(language)
}
98
99fn detect_file_identify_language(path: &Path) -> Option<String> {
100 let file_name = path.file_name()?.to_str()?;
101 let tags = tags_from_filename(file_name);
102
103 map_file_identify_tags(&tags).map(str::to_string)
104}
105
/// Translates a set of `file_identify` tags into a language label.
///
/// The table below is scanned top to bottom and the first row with at least
/// one tag present in `tags` wins, so row order defines precedence when a file
/// carries several language tags. Returns `None` if no row matches.
fn map_file_identify_tags(tags: &HashSet<&'static str>) -> Option<&'static str> {
    // (tags that select the language, language label) in precedence order.
    const TAG_LANGUAGES: &[(&[&str], &str)] = &[
        (&["dockerfile"], "Dockerfile"),
        (&["makefile"], "Makefile"),
        (&["rust"], "Rust"),
        (&["python"], "Python"),
        (&["javascript", "jsx"], "JavaScript"),
        (&["ts", "tsx"], "TypeScript"),
        (&["html"], "HTML"),
        (&["css"], "CSS"),
        (&["c"], "C"),
        (&["cpp"], "C++"),
        (&["java"], "Java"),
        (&["go"], "Go"),
        (&["ruby"], "Ruby"),
        (&["php"], "PHP"),
        (&["perl"], "Perl"),
        (&["swift"], "Swift"),
        (&["shell", "bash", "zsh"], "Shell"),
        (&["kotlin"], "Kotlin"),
        (&["dart"], "Dart"),
        (&["scala"], "Scala"),
        (&["csharp"], "C#"),
        (&["fsharp"], "F#"),
        (&["r"], "R"),
        (&["lua"], "Lua"),
        (&["julia"], "Julia"),
        (&["elixir"], "Elixir"),
        (&["clojure"], "Clojure"),
        (&["haskell"], "Haskell"),
        (&["erlang"], "Erlang"),
        (&["sql"], "SQL"),
        (&["tex"], "TeX"),
        (&["groovy", "gradle"], "Groovy"),
        (&["nix"], "Nix"),
        (&["zig"], "Zig"),
        (&["powershell"], "PowerShell"),
        (&["starlark"], "Starlark"),
        (&["awk"], "Awk"),
        (&["ocaml"], "OCaml"),
        (&["meson"], "Meson"),
    ];

    TAG_LANGUAGES
        .iter()
        .find(|(candidates, _)| candidates.iter().any(|tag| tags.contains(*tag)))
        .map(|&(_, language)| language)
}
227
/// Recognises well-known build and manifest files by their exact file name.
///
/// The final path component is compared case-insensitively (ASCII) against a
/// fixed list of names. Returns `None` when the path has no UTF-8 file name or
/// the name is not in the list.
fn detect_repo_special_file_name_language(path: &Path) -> Option<String> {
    let lowered = path.file_name()?.to_str()?.to_ascii_lowercase();

    let language = match lowered.as_str() {
        "gemfile" | "rakefile" | "podfile" | "vagrantfile" | "brewfile" => "Ruby",
        "apkbuild" | "pkgbuild" | "gradlew" => "Shell",
        "jamfile" | "jamroot" => "Jamfile",
        "meson.build" => "Meson",
        "containerfile.core" => "Dockerfile",
        "build" | "workspace" | "buck" => "Starlark",
        "default.nix" | "flake.nix" | "shell.nix" => "Nix",
        _ => return None,
    };

    Some(language.to_string())
}
259
/// Maps a file extension to a language label using a hand-maintained table.
///
/// The extension is compared case-insensitively (ASCII). Returns `None` when
/// the path has no UTF-8 extension or the extension is not in the table.
fn detect_manual_extension_language(path: &Path) -> Option<String> {
    let ext = path
        .extension()
        .and_then(|raw| raw.to_str())
        .map(|raw| raw.to_ascii_lowercase())?;

    let language: &'static str = match ext.as_str() {
        "rs" => "Rust",
        "py" => "Python",
        "js" | "mjs" | "cjs" | "jsx" => "JavaScript",
        "ts" | "tsx" | "mts" | "cts" => "TypeScript",
        "html" | "htm" => "HTML",
        "css" => "CSS",
        "c" | "h" => "C",
        "cpp" | "cc" | "cxx" | "hh" | "hxx" | "hpp" => "C++",
        "m" => "Objective-C",
        "mm" => "Objective-C++",
        "s" | "asm" => "GAS",
        "java" => "Java",
        "go" => "Go",
        "rb" => "Ruby",
        "php" => "PHP",
        "pl" => "Perl",
        "swift" => "Swift",
        "sql" => "SQL",
        "sh" => "Shell",
        "bash" => "Bash",
        "zsh" => "Zsh",
        "fish" => "Fish",
        "ksh" => "Ksh",
        "bat" | "cmd" => "Batchfile",
        "kt" | "kts" => "Kotlin",
        "dart" => "Dart",
        "scala" => "Scala",
        "cs" => "C#",
        "fs" | "fsx" => "F#",
        "r" => "R",
        "lua" => "Lua",
        "jl" => "Julia",
        "ex" | "exs" => "Elixir",
        "clj" | "cljs" | "cljc" => "Clojure",
        "hs" => "Haskell",
        "erl" => "Erlang",
        "tex" => "TeX",
        "groovy" | "gradle" | "gvy" | "gy" | "gsh" => "Groovy",
        "cmake" => "CMake",
        "nix" => "Nix",
        "zig" => "Zig",
        "ps1" | "psm1" | "psd1" => "PowerShell",
        "bzl" | "bazel" | "star" | "sky" => "Starlark",
        "awk" => "Awk",
        "ml" | "mli" => "OCaml",
        _ => return None,
    };

    Some(language.to_string())
}
315
#[cfg(test)]
mod tests {
    use super::detect_language;
    use std::path::Path;

    /// Asserts that `detect_language` reports `expected` for a file called
    /// `file_name` whose bytes are `content`.
    #[track_caller]
    fn assert_language(file_name: &str, content: &[u8], expected: Option<&str>) {
        assert_eq!(
            detect_language(Path::new(file_name), content),
            expected.map(str::to_string)
        );
    }

    #[test]
    fn detect_language_supports_containerfile_names() {
        assert_language("Containerfile", b"FROM scratch\n", Some("Dockerfile"));
        assert_language("containerfile.core", b"FROM scratch\n", Some("Dockerfile"));
    }

    #[test]
    fn detect_language_maps_c_headers_to_c() {
        assert_language("zlib.h", b"/* header */\n", Some("C"));
    }

    #[test]
    fn detect_language_maps_uppercase_s_to_gas() {
        assert_language("gvmat64.S", b"; asm\n", Some("GAS"));
    }

    #[test]
    fn detect_language_handles_manifest_dsl_filenames() {
        assert_language("APKBUILD", b"pkgname=demo\n", Some("Shell"));
        assert_language("Podfile", b"source 'https://rubygems.org'\n", Some("Ruby"));
        assert_language("meson.build", b"project('demo')\n", Some("Meson"));
        assert_language("BUILD", b"cc_library(name = 'demo')\n", Some("Starlark"));
        assert_language("flake.nix", b"{ inputs, ... }: {}\n", Some("Nix"));
    }

    #[test]
    fn detect_language_handles_common_build_extensions() {
        assert_language("build.gradle", b"plugins { id 'java' }\n", Some("Groovy"));
        assert_language(
            "toolchain.cmake",
            b"set(CMAKE_CXX_STANDARD 20)\n",
            Some("CMake"),
        );
        assert_language("main.nix", b"{ pkgs }: pkgs.hello\n", Some("Nix"));
        assert_language("rules.bzl", b"def _impl(ctx):\n pass\n", Some("Starlark"));
        assert_language("script.ps1", b"Write-Host 'hello'\n", Some("PowerShell"));
    }

    #[test]
    fn detect_language_maps_batch_and_ipp_extensions() {
        assert_language("build.cmd", b"@echo off\r\n", Some("Batchfile"));
        assert_language(
            "from_chars.ipp",
            b"template <class T> void parse();\n",
            None,
        );
    }

    #[test]
    fn detect_language_handles_jamfile_names() {
        assert_language("Jamfile", b"lib boost_json ;\n", Some("Jamfile"));
    }

    #[test]
    fn detect_language_omits_generic_text_fallbacks() {
        assert_language("README.txt", b"plain text\n", None);
        assert_language("data.bin", &[0, 159, 146, 150], None);
    }

    #[test]
    fn detect_language_ignores_yaml_as_programming_language() {
        assert_language("config.yaml", b"key: value\n", None);
    }

    #[test]
    fn detect_language_keeps_extension_detection_for_non_utf8_python() {
        // The `\xe9` byte makes the content invalid UTF-8; the `.py` name must
        // still be enough to identify the file.
        let latin1_python = b"# coding: latin-1\nprint(\"caf\xe9\")\n# comment padding\n";

        assert_language("script.py", latin1_python, Some("Python"));
    }

    #[test]
    fn detect_language_uses_utf8_content_hints_for_extensionless_files() {
        assert_language(
            "index",
            b"<!DOCTYPE html><html><body></body></html>",
            Some("HTML"),
        );
    }

    #[test]
    fn detect_language_does_not_infer_python_from_default_labels() {
        assert_language(
            "from_chars.ipp",
            b"switch (value) {\n default: return parse();\n}\n",
            None,
        );
    }

    #[test]
    fn detect_language_does_not_use_content_hints_for_invalid_utf8() {
        assert_language("index", &[0xff, b'<', b'h', b't', b'm', b'l', b'>'], None);
    }
}