provenant/utils/
language.rs1use content_inspector::{ContentType, inspect};
2use std::path::Path;
3
4fn is_utf8_text(content_type: ContentType) -> bool {
5 content_type == ContentType::UTF_8 || content_type == ContentType::UTF_8_BOM
6}
7
8pub fn detect_language(path: &Path, content: &[u8]) -> String {
9 if content.len() > 32 && !is_utf8_text(inspect(content)) {
10 return "Binary".to_string();
11 }
12
13 if content.len() > 2 && content[0] == b'#' && content[1] == b'!' {
15 let shebang_end = content
16 .iter()
17 .position(|&b| b == b'\n')
18 .unwrap_or(content.len());
19 let shebang = String::from_utf8_lossy(&content[0..shebang_end]);
20
21 if shebang.contains("python") {
22 return "Python".to_string();
23 } else if shebang.contains("node") {
24 return "JavaScript".to_string();
25 } else if shebang.contains("ruby") {
26 return "Ruby".to_string();
27 } else if shebang.contains("perl") {
28 return "Perl".to_string();
29 } else if shebang.contains("php") {
30 return "PHP".to_string();
31 } else if shebang.contains("bash") || shebang.contains("sh") {
32 return "Shell".to_string();
33 }
34 }
35
36 if let Some(extension) = path.extension().and_then(|e| e.to_str()) {
38 match extension.to_lowercase().as_str() {
39 "rs" => return "Rust".to_string(),
40 "py" => return "Python".to_string(),
41 "js" => return "JavaScript".to_string(),
42 "ts" => return "TypeScript".to_string(),
43 "html" | "htm" => return "HTML".to_string(),
44 "css" => return "CSS".to_string(),
45 "c" => return "C".to_string(),
46 "cpp" | "cc" | "cxx" => return "C++".to_string(),
47 "h" | "hpp" => return "C/C++ Header".to_string(),
48 "java" => return "Java".to_string(),
49 "go" => return "Go".to_string(),
50 "rb" => return "Ruby".to_string(),
51 "php" => return "PHP".to_string(),
52 "pl" => return "Perl".to_string(),
53 "swift" => return "Swift".to_string(),
54 "md" | "markdown" => return "Markdown".to_string(),
55 "json" => return "JSON".to_string(),
56 "xml" => return "XML".to_string(),
57 "yml" | "yaml" => return "YAML".to_string(),
58 "sql" => return "SQL".to_string(),
59 "sh" | "bash" => return "Shell".to_string(),
60 "kt" | "kts" => return "Kotlin".to_string(),
61 "dart" => return "Dart".to_string(),
62 "scala" => return "Scala".to_string(),
63 "cs" => return "C#".to_string(),
64 "fs" => return "F#".to_string(),
65 "r" => return "R".to_string(),
66 "lua" => return "Lua".to_string(),
67 "jl" => return "Julia".to_string(),
68 "ex" | "exs" => return "Elixir".to_string(),
69 "clj" => return "Clojure".to_string(),
70 "hs" => return "Haskell".to_string(),
71 "erl" => return "Erlang".to_string(),
72 "sc" => return "SuperCollider".to_string(),
73 "tex" => return "TeX".to_string(),
74 _ => {}
75 }
76 }
77
78 let file_name = path
80 .file_name()
81 .and_then(|n| n.to_str())
82 .map(|s| s.to_lowercase())
83 .unwrap_or_default();
84
85 if matches!(
86 file_name.as_str(),
87 "dockerfile" | "containerfile" | "containerfile.core"
88 ) {
89 return "Dockerfile".to_string();
90 } else if file_name == "makefile" {
91 return "Makefile".to_string();
92 } else if file_name == "gemfile" || file_name == "rakefile" {
93 return "Ruby".to_string();
94 }
95
96 if is_utf8_text(inspect(content)) {
97 let text_sample = String::from_utf8_lossy(&content[..std::cmp::min(content.len(), 1000)]);
98
99 if text_sample.contains("<?php") {
100 return "PHP".to_string();
101 } else if text_sample.contains("<html") || text_sample.contains("<!DOCTYPE html") {
102 return "HTML".to_string();
103 } else if text_sample.contains("import React") || text_sample.contains("import {") {
104 return "JavaScript/TypeScript".to_string();
105 } else if text_sample.contains("def ") && text_sample.contains(":") {
106 return "Python".to_string();
107 } else if text_sample.contains("package ")
108 && text_sample.contains("import ")
109 && text_sample.contains("{")
110 {
111 return "Go".to_string();
112 }
113
114 return "Text".to_string();
115 }
116
117 "Unknown".to_string()
118}
119
#[cfg(test)]
mod tests {
    use super::detect_language;
    use std::path::Path;

    /// Container build files are labelled `Dockerfile` under each supported file name.
    #[test]
    fn detect_language_supports_containerfile_names() {
        let content = b"FROM scratch\n";

        for name in ["Containerfile", "containerfile.core"] {
            let detected = detect_language(Path::new(name), content);
            assert_eq!(detected, "Dockerfile", "unexpected language for {name}");
        }
    }
}