context_creator/utils/
file_ext.rs

1//! File extension to language mapping utilities
2
3use std::path::Path;
4
/// Coarse category assigned to a file.
///
/// Variants are grouped into programming languages, data/markup formats, and two
/// catch-all buckets.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub enum FileType {
    // ---- Programming languages ----
    /// Rust source.
    Rust,
    /// Python source.
    Python,
    /// JavaScript source.
    JavaScript,
    /// TypeScript source.
    TypeScript,
    /// Go source.
    Go,
    /// Java source.
    Java,
    /// C++ source or header.
    Cpp,
    /// C source or header.
    C,
    /// C# source.
    CSharp,
    /// Ruby source.
    Ruby,
    /// PHP source.
    Php,
    /// Swift source.
    Swift,
    /// Kotlin source.
    Kotlin,
    /// Scala source.
    Scala,
    /// Haskell source.
    Haskell,
    /// Dart source.
    Dart,
    /// Lua source.
    Lua,
    /// R source.
    R,
    /// Julia source.
    Julia,
    /// Elixir source.
    Elixir,
    /// Elm source.
    Elm,

    // ---- Data and markup formats ----
    /// Markdown document.
    Markdown,
    /// JSON document.
    Json,
    /// YAML document.
    Yaml,
    /// TOML document.
    Toml,
    /// XML document.
    Xml,
    /// HTML document.
    Html,
    /// Stylesheet.
    Css,

    // ---- Catch-all buckets ----
    /// Plain text.
    Text,
    /// Anything that fits none of the categories above.
    Other,
}
44
45impl FileType {
46    /// Determine file type from path
47    pub fn from_path(path: &Path) -> Self {
48        let extension = path
49            .extension()
50            .and_then(|ext| ext.to_str())
51            .unwrap_or("")
52            .to_lowercase();
53
54        match extension.as_str() {
55            "rs" => FileType::Rust,
56            "py" => FileType::Python,
57            "js" | "mjs" | "cjs" => FileType::JavaScript,
58            "ts" | "tsx" => FileType::TypeScript,
59            "go" => FileType::Go,
60            "java" => FileType::Java,
61            "cpp" | "cc" | "cxx" | "c++" | "hpp" | "hxx" | "h++" => FileType::Cpp,
62            "c" | "h" => FileType::C,
63            "cs" => FileType::CSharp,
64            "rb" => FileType::Ruby,
65            "php" => FileType::Php,
66            "swift" => FileType::Swift,
67            "kt" | "kts" => FileType::Kotlin,
68            "scala" | "sc" => FileType::Scala,
69            "hs" => FileType::Haskell,
70            "dart" => FileType::Dart,
71            "lua" => FileType::Lua,
72            "r" => FileType::R,
73            "jl" => FileType::Julia,
74            "ex" | "exs" => FileType::Elixir,
75            "elm" => FileType::Elm,
76            "md" | "markdown" => FileType::Markdown,
77            "json" => FileType::Json,
78            "yaml" | "yml" => FileType::Yaml,
79            "toml" => FileType::Toml,
80            "xml" => FileType::Xml,
81            "html" | "htm" => FileType::Html,
82            "css" | "scss" | "sass" | "less" => FileType::Css,
83            "txt" | "text" => FileType::Text,
84            _ => {
85                // Check if it's a text file by name
86                let filename = path
87                    .file_name()
88                    .and_then(|name| name.to_str())
89                    .unwrap_or("");
90
91                match filename {
92                    "README" | "LICENSE" | "CHANGELOG" | "AUTHORS" | "CONTRIBUTORS" => {
93                        FileType::Text
94                    }
95                    "Makefile" | "Dockerfile" | "Vagrantfile" | "Jenkinsfile" => FileType::Text,
96                    _ if !is_binary_extension(path) => FileType::Text,
97                    _ => FileType::Other,
98                }
99            }
100        }
101    }
102}
103
/// Get the Markdown code-fence language tag for a file, based on its extension.
///
/// The extension is matched case-insensitively. Only the extension is inspected,
/// so an extension-less file such as `Makefile` yields `"text"`, as does any path
/// whose extension is missing, not valid UTF-8, or not in the table below.
///
/// Every extension that [`FileType::from_path`] maps to a language also has an
/// entry here (including `h++` and `sc`), so the two lookups do not disagree on
/// whether a file is source code.
pub fn get_language_from_extension(path: &Path) -> &'static str {
    let extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .unwrap_or("")
        .to_lowercase();

    match extension.as_str() {
        // Programming languages
        "rs" => "rust",
        "py" => "python",
        "js" | "mjs" | "cjs" => "javascript",
        "ts" | "tsx" => "typescript",
        "jsx" => "jsx",
        "go" => "go",
        "c" => "c",
        // All header extensions, including the ambiguous `.h`, are tagged as C++.
        "cpp" | "cc" | "cxx" | "c++" | "h" | "hpp" | "hxx" | "h++" => "cpp",
        "cs" => "csharp",
        "java" => "java",
        "kt" | "kts" => "kotlin",
        "swift" => "swift",
        "rb" => "ruby",
        "php" => "php",
        "lua" => "lua",
        "r" => "r",
        "scala" | "sc" => "scala",
        "clj" | "cljs" => "clojure",
        "ex" | "exs" => "elixir",
        "elm" => "elm",
        "hs" => "haskell",
        "ml" | "mli" => "ocaml",
        "fs" | "fsx" => "fsharp",
        "pl" => "perl",
        "dart" => "dart",
        "julia" | "jl" => "julia",
        "nim" => "nim",
        "zig" => "zig",
        "v" => "v",
        "d" => "d",

        // Shell scripts
        "sh" | "bash" => "bash",
        "fish" => "fish",
        "zsh" => "zsh",
        "ps1" => "powershell",
        "bat" | "cmd" => "batch",

        // Web technologies
        "html" | "htm" => "html",
        "css" => "css",
        "scss" | "sass" => "scss",
        "less" => "less",
        "vue" => "vue",
        "svelte" => "svelte",

        // Data formats
        "json" => "json",
        "yaml" | "yml" => "yaml",
        "toml" => "toml",
        "xml" => "xml",
        "csv" => "csv",
        "sql" => "sql",

        // Markup languages
        "md" | "markdown" => "markdown",
        "tex" => "latex",
        "rst" => "rst",
        "adoc" | "asciidoc" => "asciidoc",

        // Configuration files. These match on the extension only, e.g.
        // `app.dockerfile` or `rules.mk`, not a bare `Dockerfile`/`Makefile`.
        "ini" | "cfg" => "ini",
        "conf" | "config" => "text",
        "env" => "dotenv",
        "dockerfile" => "dockerfile",
        "makefile" | "mk" => "makefile",

        // Other
        "proto" => "protobuf",
        "graphql" | "gql" => "graphql",
        "tf" => "hcl",
        "vim" => "vim",
        "diff" | "patch" => "diff",

        // Default to text for unknown extensions
        _ => "text",
    }
}
190
/// Report whether a path's extension is on the list of known binary formats.
///
/// The comparison is case-insensitive and looks at the extension only; the file
/// is never read. Paths with no extension, or with an extension that is not valid
/// UTF-8, are reported as not binary.
pub fn is_binary_extension(path: &Path) -> bool {
    const BINARY_EXTENSIONS: &[&str] = &[
        // Executables and libraries
        "exe", "dll", "so", "dylib", "a", "lib", "bin",
        // Archives
        "zip", "tar", "gz", "bz2", "xz", "7z", "rar",
        // Images
        "jpg", "jpeg", "png", "gif", "bmp", "ico", "svg", "webp",
        // Audio
        "mp3", "wav", "flac", "aac", "ogg", "wma",
        // Video
        "mp4", "avi", "mkv", "mov", "wmv", "flv", "webm",
        // Documents
        "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
        // Fonts
        "ttf", "otf", "woff", "woff2", "eot",
        // Database
        "db", "sqlite", "sqlite3",
        // Other binary formats
        "pyc", "pyo", "class", "o", "obj", "pdb",
    ];

    match path.extension().and_then(|ext| ext.to_str()) {
        Some(raw) => {
            let lowered = raw.to_lowercase();
            BINARY_EXTENSIONS.contains(&lowered.as_str())
        }
        None => false,
    }
}
217
/// Heuristically decide whether a byte buffer holds binary data.
///
/// Returns `true` when a NUL byte occurs within the first 8 KiB of `content`.
/// Bytes past that limit are not examined, and an empty buffer is not binary.
pub fn is_binary_content(content: &[u8]) -> bool {
    /// Number of leading bytes scanned for a NUL byte.
    const SNIFF_LIMIT: usize = 8192;

    content.iter().take(SNIFF_LIMIT).any(|&byte| byte == 0)
}
224
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    /// Spot-checks the extension-to-language table, including the `"text"` fallback
    /// for unknown extensions and for extension-less names.
    #[test]
    fn test_language_detection() {
        let cases = [
            ("test.rs", "rust"),
            ("test.py", "python"),
            ("test.js", "javascript"),
            ("test.unknown", "text"),
            ("Makefile", "text"),
        ];

        for &(file, expected) in cases.iter() {
            assert_eq!(
                get_language_from_extension(Path::new(file)),
                expected,
                "language for {}",
                file
            );
        }
    }

    /// Checks a few binary and non-binary extensions.
    #[test]
    fn test_binary_extension_detection() {
        for file in ["test.exe", "image.png", "archive.zip"].iter() {
            assert!(
                is_binary_extension(Path::new(file)),
                "{} should be binary",
                file
            );
        }

        for file in ["code.rs", "text.md"].iter() {
            assert!(
                !is_binary_extension(Path::new(file)),
                "{} should not be binary",
                file
            );
        }
    }

    /// A buffer is binary exactly when it contains a NUL byte.
    #[test]
    fn test_binary_content_detection() {
        let plain: &[u8] = b"Hello, world!";
        let embedded_nul: &[u8] = b"Hello\0world";
        let utf16_bom_like: &[u8] = &[0xFF, 0xFE, 0x00, 0x00];

        assert!(!is_binary_content(plain));
        assert!(is_binary_content(embedded_nul));
        assert!(is_binary_content(utf16_bom_like));
    }
}