code_digest/utils/
file_ext.rs

1//! File extension to language mapping utilities
2
3use std::path::Path;
4
/// Coarse classification of a file, used to group files by what they contain.
///
/// The variants fall into three groups: programming languages, data and
/// markup formats, and a small fallback group for everything else.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub enum FileType {
    // ---- Programming languages ----
    /// Rust source code.
    Rust,
    /// Python source code.
    Python,
    /// JavaScript source code.
    JavaScript,
    /// TypeScript source code.
    TypeScript,
    /// Go source code.
    Go,
    /// Java source code.
    Java,
    /// C++ sources and headers.
    Cpp,
    /// C sources and headers.
    C,
    /// C# source code.
    CSharp,
    /// Ruby source code.
    Ruby,
    /// PHP source code.
    Php,
    /// Swift source code.
    Swift,
    /// Kotlin source code.
    Kotlin,
    /// Scala source code.
    Scala,
    /// Haskell source code.
    Haskell,

    // ---- Data and markup formats ----
    /// Markdown documents.
    Markdown,
    /// JSON documents.
    Json,
    /// YAML documents.
    Yaml,
    /// TOML documents.
    Toml,
    /// XML documents.
    Xml,
    /// HTML documents.
    Html,
    /// Stylesheets.
    Css,

    // ---- Fallbacks ----
    /// Plain text, or a file with no more specific classification.
    Text,
    /// Anything that is not treated as text.
    Other,
}
38
39impl FileType {
40    /// Determine file type from path
41    pub fn from_path(path: &Path) -> Self {
42        let extension = path.extension().and_then(|ext| ext.to_str()).unwrap_or("").to_lowercase();
43
44        match extension.as_str() {
45            "rs" => FileType::Rust,
46            "py" => FileType::Python,
47            "js" | "mjs" | "cjs" => FileType::JavaScript,
48            "ts" | "tsx" => FileType::TypeScript,
49            "go" => FileType::Go,
50            "java" => FileType::Java,
51            "cpp" | "cc" | "cxx" | "c++" | "hpp" | "hxx" | "h++" => FileType::Cpp,
52            "c" | "h" => FileType::C,
53            "cs" => FileType::CSharp,
54            "rb" => FileType::Ruby,
55            "php" => FileType::Php,
56            "swift" => FileType::Swift,
57            "kt" | "kts" => FileType::Kotlin,
58            "scala" => FileType::Scala,
59            "hs" => FileType::Haskell,
60            "md" | "markdown" => FileType::Markdown,
61            "json" => FileType::Json,
62            "yaml" | "yml" => FileType::Yaml,
63            "toml" => FileType::Toml,
64            "xml" => FileType::Xml,
65            "html" | "htm" => FileType::Html,
66            "css" | "scss" | "sass" | "less" => FileType::Css,
67            "txt" | "text" => FileType::Text,
68            _ => {
69                // Check if it's a text file by name
70                let filename = path.file_name().and_then(|name| name.to_str()).unwrap_or("");
71
72                match filename {
73                    "README" | "LICENSE" | "CHANGELOG" | "AUTHORS" | "CONTRIBUTORS" => {
74                        FileType::Text
75                    }
76                    "Makefile" | "Dockerfile" | "Vagrantfile" | "Jenkinsfile" => FileType::Text,
77                    _ if !is_binary_extension(path) => FileType::Text,
78                    _ => FileType::Other,
79                }
80            }
81        }
82    }
83}
84
/// Get the markdown code fence language for a file extension.
///
/// The extension of `path` is compared case-insensitively. Only the extension
/// is consulted, never the file name: a path with no extension (for example
/// `Makefile`) yields `"text"`, and the `"dockerfile"` / `"makefile"` entries
/// match paths such as `app.dockerfile` or `rules.makefile`.
///
/// Returns `"text"` when the extension is missing, is not valid UTF-8, or is
/// not in the table below.
pub fn get_language_from_extension(path: &Path) -> &'static str {
    let extension = path.extension().and_then(|ext| ext.to_str()).unwrap_or("");

    match extension.to_lowercase().as_str() {
        // Programming languages
        "rs" => "rust",
        "py" => "python",
        "js" | "mjs" | "cjs" => "javascript",
        "ts" | "tsx" => "typescript",
        "jsx" => "jsx",
        "go" => "go",
        "c" => "c",
        "cpp" | "cc" | "cxx" | "c++" => "cpp",
        // All header flavours, including plain `.h`, are fenced as C++.
        // `h++` is listed so this table agrees with `FileType::from_path`.
        "h" | "hpp" | "hxx" | "h++" => "cpp",
        "cs" => "csharp",
        "java" => "java",
        "kt" | "kts" => "kotlin",
        "swift" => "swift",
        "rb" => "ruby",
        "php" => "php",
        "lua" => "lua",
        "r" => "r",
        "scala" => "scala",
        "clj" | "cljs" => "clojure",
        "ex" | "exs" => "elixir",
        "elm" => "elm",
        "hs" => "haskell",
        "ml" | "mli" => "ocaml",
        "fs" | "fsx" => "fsharp",
        "pl" => "perl",
        "sh" => "bash",
        "fish" => "fish",
        "zsh" => "zsh",
        "ps1" => "powershell",
        "dart" => "dart",
        "julia" | "jl" => "julia",
        "nim" => "nim",
        "zig" => "zig",
        "v" => "v",
        "d" => "d",

        // Web technologies
        "html" | "htm" => "html",
        "css" => "css",
        "scss" | "sass" => "scss",
        "less" => "less",
        "vue" => "vue",
        "svelte" => "svelte",

        // Data formats
        "json" => "json",
        "yaml" | "yml" => "yaml",
        "toml" => "toml",
        "xml" => "xml",
        "csv" => "csv",
        "sql" => "sql",

        // Markup languages
        "md" | "markdown" => "markdown",
        "tex" => "latex",
        "rst" => "rst",
        "adoc" | "asciidoc" => "asciidoc",

        // Configuration files
        "ini" | "cfg" => "ini",
        "conf" | "config" => "text",
        "env" => "dotenv",
        "dockerfile" => "dockerfile",
        "makefile" | "mk" => "makefile",

        // Shell scripts
        "bash" => "bash",
        "bat" | "cmd" => "batch",

        // Other
        "proto" => "protobuf",
        "graphql" | "gql" => "graphql",
        "tf" => "hcl",
        "vim" => "vim",
        "diff" | "patch" => "diff",

        // Default to text for unknown extensions
        _ => "text",
    }
}
171
/// Report whether the extension of `path` is one of the known binary formats.
///
/// The comparison is case-insensitive. A path with no extension, or with an
/// extension that is not valid UTF-8, is reported as not binary. Only the
/// name is inspected; the file contents are never read.
pub fn is_binary_extension(path: &Path) -> bool {
    /// Lower-case extensions that are treated as binary.
    const BINARY_EXTENSIONS: &[&str] = &[
        // Executables and libraries
        "exe", "dll", "so", "dylib", "a", "lib", "bin",
        // Archives
        "zip", "tar", "gz", "bz2", "xz", "7z", "rar",
        // Images
        // NOTE(review): `svg` is text-based XML but is grouped with images
        // here - confirm this is intentional.
        "jpg", "jpeg", "png", "gif", "bmp", "ico", "svg", "webp",
        // Audio
        "mp3", "wav", "flac", "aac", "ogg", "wma",
        // Video
        "mp4", "avi", "mkv", "mov", "wmv", "flv", "webm",
        // Documents
        "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
        // Fonts
        "ttf", "otf", "woff", "woff2", "eot",
        // Database
        "db", "sqlite", "sqlite3",
        // Other binary formats
        "pyc", "pyo", "class", "o", "obj", "pdb",
    ];

    match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => {
            let lowered = ext.to_lowercase();
            BINARY_EXTENSIONS.contains(&lowered.as_str())
        }
        None => false,
    }
}
198
/// Heuristically decide whether `content` is binary data.
///
/// Only the first 8192 bytes are inspected; the content is considered binary
/// if a NUL byte appears anywhere in that window. Empty input is not binary.
pub fn is_binary_content(content: &[u8]) -> bool {
    /// Number of leading bytes examined for a NUL byte.
    const SNIFF_LIMIT: usize = 8192;

    content.iter().take(SNIFF_LIMIT).any(|&byte| byte == 0)
}
205
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    /// Fence language reported for a bare file name.
    fn fence(name: &str) -> &'static str {
        get_language_from_extension(Path::new(name))
    }

    /// Whether a bare file name carries a binary extension.
    fn has_binary_ext(name: &str) -> bool {
        is_binary_extension(Path::new(name))
    }

    #[test]
    fn test_language_detection() {
        // Known extensions map to their fence language.
        assert_eq!(fence("test.rs"), "rust");
        assert_eq!(fence("test.py"), "python");
        assert_eq!(fence("test.js"), "javascript");

        // Unknown or missing extensions fall back to plain text.
        assert_eq!(fence("test.unknown"), "text");
        assert_eq!(fence("Makefile"), "text");
    }

    #[test]
    fn test_binary_extension_detection() {
        // Binary formats.
        assert!(has_binary_ext("test.exe"));
        assert!(has_binary_ext("image.png"));
        assert!(has_binary_ext("archive.zip"));

        // Text formats.
        assert!(!has_binary_ext("code.rs"));
        assert!(!has_binary_ext("text.md"));
    }

    #[test]
    fn test_binary_content_detection() {
        // Plain ASCII contains no NUL byte.
        assert!(!is_binary_content(b"Hello, world!"));

        // Any NUL byte marks the content as binary.
        assert!(is_binary_content(b"Hello\0world"));
        assert!(is_binary_content(&[0xFF, 0xFE, 0x00, 0x00]));
    }
}