scancode_rust/utils/
language.rs

1use content_inspector::{ContentType, inspect};
2use std::path::Path;
3
4/// Detect programming language based on file extension and contents
5pub fn detect_language(path: &Path, content: &[u8]) -> String {
6    // Skip binary files
7    if content.len() > 32 && inspect(content) != ContentType::UTF_8 {
8        return "Binary".to_string();
9    }
10
11    // Check for shebang in script files
12    if content.len() > 2 && content[0] == b'#' && content[1] == b'!' {
13        let shebang_end = content
14            .iter()
15            .position(|&b| b == b'\n')
16            .unwrap_or(content.len());
17        let shebang = String::from_utf8_lossy(&content[0..shebang_end]);
18
19        if shebang.contains("python") {
20            return "Python".to_string();
21        } else if shebang.contains("node") {
22            return "JavaScript".to_string();
23        } else if shebang.contains("ruby") {
24            return "Ruby".to_string();
25        } else if shebang.contains("perl") {
26            return "Perl".to_string();
27        } else if shebang.contains("php") {
28            return "PHP".to_string();
29        } else if shebang.contains("bash") || shebang.contains("sh") {
30            return "Shell".to_string();
31        }
32    }
33
34    // Check file extension
35    if let Some(extension) = path.extension().and_then(|e| e.to_str()) {
36        match extension.to_lowercase().as_str() {
37            "rs" => return "Rust".to_string(),
38            "py" => return "Python".to_string(),
39            "js" => return "JavaScript".to_string(),
40            "ts" => return "TypeScript".to_string(),
41            "html" | "htm" => return "HTML".to_string(),
42            "css" => return "CSS".to_string(),
43            "c" => return "C".to_string(),
44            "cpp" | "cc" | "cxx" => return "C++".to_string(),
45            "h" | "hpp" => return "C/C++ Header".to_string(),
46            "java" => return "Java".to_string(),
47            "go" => return "Go".to_string(),
48            "rb" => return "Ruby".to_string(),
49            "php" => return "PHP".to_string(),
50            "pl" => return "Perl".to_string(),
51            "swift" => return "Swift".to_string(),
52            "md" | "markdown" => return "Markdown".to_string(),
53            "json" => return "JSON".to_string(),
54            "xml" => return "XML".to_string(),
55            "yml" | "yaml" => return "YAML".to_string(),
56            "sql" => return "SQL".to_string(),
57            "sh" | "bash" => return "Shell".to_string(),
58            "kt" | "kts" => return "Kotlin".to_string(),
59            "dart" => return "Dart".to_string(),
60            "scala" => return "Scala".to_string(),
61            "cs" => return "C#".to_string(),
62            "fs" => return "F#".to_string(),
63            "r" => return "R".to_string(),
64            "lua" => return "Lua".to_string(),
65            "jl" => return "Julia".to_string(),
66            "ex" | "exs" => return "Elixir".to_string(),
67            "clj" => return "Clojure".to_string(),
68            "hs" => return "Haskell".to_string(),
69            "erl" => return "Erlang".to_string(),
70            "sc" => return "SuperCollider".to_string(),
71            "tex" => return "TeX".to_string(),
72            _ => {}
73        }
74    }
75
76    // Check file name for special cases
77    let file_name = path
78        .file_name()
79        .and_then(|n| n.to_str())
80        .map(|s| s.to_lowercase())
81        .unwrap_or_default();
82
83    if file_name == "dockerfile" || file_name.starts_with("dockerfile.") {
84        return "Dockerfile".to_string();
85    } else if file_name == "makefile" {
86        return "Makefile".to_string();
87    } else if file_name == "gemfile" {
88        return "Ruby".to_string();
89    } else if file_name == "rakefile" {
90        return "Ruby".to_string();
91    }
92
93    // Content-based detection as a fallback for plain text files
94    if inspect(content) == ContentType::UTF_8 {
95        // Check for common patterns in the content
96        let text_sample = String::from_utf8_lossy(&content[..std::cmp::min(content.len(), 1000)]);
97
98        if text_sample.contains("<?php") {
99            return "PHP".to_string();
100        } else if text_sample.contains("<html") || text_sample.contains("<!DOCTYPE html") {
101            return "HTML".to_string();
102        } else if text_sample.contains("import React") || text_sample.contains("import {") {
103            return "JavaScript/TypeScript".to_string();
104        } else if text_sample.contains("def ") && text_sample.contains(":") {
105            return "Python".to_string();
106        } else if text_sample.contains("package ")
107            && text_sample.contains("import ")
108            && text_sample.contains("{")
109        {
110            return "Go".to_string();
111        }
112
113        return "Text".to_string();
114    }
115
116    "Unknown".to_string()
117}