file_identify/
lib.rs

1//! # file-identify
2//!
3//! A Rust library for identifying file types based on extensions, content, and shebangs.
4//!
5//! This library provides a comprehensive way to identify files by analyzing:
6//! - File extensions and special filenames
7//! - File content (binary vs text detection)
8//! - Shebang lines for executable scripts
9//! - File system metadata (permissions, file type)
10//!
11//! ## Quick Start
12//!
13//! ```rust
14//! use file_identify::{tags_from_path, tags_from_filename};
15//!
16//! // Identify a Python file
17//! let tags = tags_from_filename("script.py");
18//! assert!(tags.contains("python"));
19//! assert!(tags.contains("text"));
20//!
21//! // Identify from filesystem path
22//! # use std::fs;
23//! # use tempfile::tempdir;
24//! # let dir = tempdir().unwrap();
25//! # let file_path = dir.path().join("test.py");
26//! # fs::write(&file_path, "print('hello')").unwrap();
27//! let tags = tags_from_path(&file_path).unwrap();
28//! assert!(tags.contains("file"));
29//! assert!(tags.contains("python"));
30//! ```
31//!
32//! ## Tag System
33//!
34//! Files are identified using a set of standardized tags:
35//!
36//! - **Type tags**: `file`, `directory`, `symlink`, `socket`
37//! - **Mode tags**: `executable`, `non-executable`
38//! - **Encoding tags**: `text`, `binary`
39//! - **Language/format tags**: `python`, `javascript`, `json`, `xml`, etc.
40//!
41//! ## Error Handling
42//!
43//! Functions that access the filesystem return [`Result`] types. The main error
44//! conditions are:
45//!
46//! - [`IdentifyError::PathNotFound`] - when the specified path doesn't exist
47//! - [`IdentifyError::IoError`] - for other I/O related errors
48
49use std::collections::HashSet;
50use std::fs;
51use std::io::{BufRead, BufReader, Read};
52use std::path::Path;
53
54pub mod extensions;
55pub mod interpreters;
56pub mod tags;
57
58use extensions::{EXTENSIONS, EXTENSIONS_NEED_BINARY_CHECK, NAMES};
59use interpreters::INTERPRETERS;
60use tags::*;
61
/// Result type for file identification operations.
///
/// This is a convenience type alias for operations that may fail with
/// file system or parsing errors.
///
/// The error side is a boxed trait object rather than [`IdentifyError`]
/// itself: functions in this crate return boxed [`IdentifyError`] values as
/// well as boxed [`std::io::Error`] values that were propagated with `?`.
pub type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
67
/// Errors that can occur during file identification.
#[derive(Debug)]
pub enum IdentifyError {
    /// The specified path does not exist on the filesystem.
    ///
    /// The payload is the path rendered as a string; it is used verbatim in
    /// the `Display` output.
    PathNotFound(String),
    /// An I/O error occurred while accessing the file.
    ///
    /// The wrapped error is also available through
    /// [`std::error::Error::source`].
    IoError(std::io::Error),
}

impl std::fmt::Display for IdentifyError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::PathNotFound(path) => write!(f, "{path} does not exist."),
            Self::IoError(err) => write!(f, "IO error: {err}"),
        }
    }
}

impl std::error::Error for IdentifyError {
    /// Exposes the underlying [`std::io::Error`] so callers walking the error
    /// chain (or downcasting the source) can inspect the original failure.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Self::PathNotFound(_) => None,
            Self::IoError(err) => Some(err),
        }
    }
}

impl From<std::io::Error> for IdentifyError {
    /// Wraps an I/O error in [`IdentifyError::IoError`].
    fn from(err: std::io::Error) -> Self {
        Self::IoError(err)
    }
}
93
94/// Identify a file from its filesystem path.
95///
96/// This is the most comprehensive identification method, providing a superset
97/// of information from other methods. It analyzes:
98///
99/// 1. File type (regular file, directory, symlink, socket)
100/// 2. File permissions (executable vs non-executable)
101/// 3. Filename and extension patterns
102/// 4. File content (binary vs text detection)
103/// 5. Shebang lines for executable files
104///
105/// # Arguments
106///
107/// * `path` - Path to the file to identify
108///
109/// # Returns
110///
111/// A set of tags identifying the file type and characteristics.
112///
113/// # Errors
114///
115/// Returns [`IdentifyError::PathNotFound`] if the path doesn't exist, or
116/// [`IdentifyError::IoError`] for other I/O failures.
117///
118/// # Examples
119///
120/// ```rust
121/// use file_identify::tags_from_path;
122/// # use std::fs;
123/// # use tempfile::tempdir;
124///
125/// # let dir = tempdir().unwrap();
126/// # let file_path = dir.path().join("script.py");
127/// # fs::write(&file_path, "#!/usr/bin/env python3\nprint('hello')").unwrap();
128/// let tags = tags_from_path(&file_path).unwrap();
129/// assert!(tags.contains("file"));
130/// assert!(tags.contains("python"));
131/// assert!(tags.contains("text"));
132/// ```
133pub fn tags_from_path<P: AsRef<Path>>(path: P) -> Result<TagSet> {
134    let path = path.as_ref();
135    let path_str = path.to_string_lossy();
136
137    let metadata = match fs::symlink_metadata(path) {
138        Ok(meta) => meta,
139        Err(_) => return Err(Box::new(IdentifyError::PathNotFound(path_str.to_string()))),
140    };
141
142    let file_type = metadata.file_type();
143
144    if file_type.is_dir() {
145        return Ok([DIRECTORY].iter().cloned().collect());
146    }
147    if file_type.is_symlink() {
148        return Ok([SYMLINK].iter().cloned().collect());
149    }
150
151    // Check for socket (Unix-specific)
152    #[cfg(unix)]
153    {
154        use std::os::unix::fs::FileTypeExt;
155        if file_type.is_socket() {
156            return Ok([SOCKET].iter().cloned().collect());
157        }
158    }
159
160    let mut tags = TagSet::new();
161    tags.insert(FILE);
162
163    // Check if executable
164    let is_executable = {
165        #[cfg(unix)]
166        {
167            use std::os::unix::fs::PermissionsExt;
168            metadata.permissions().mode() & 0o111 != 0
169        }
170        #[cfg(not(unix))]
171        {
172            // On non-Unix systems, check file extension for common executables
173            path.extension()
174                .and_then(|ext| ext.to_str())
175                .map(|ext| matches!(ext.to_lowercase().as_str(), "exe" | "bat" | "cmd"))
176                .unwrap_or(false)
177        }
178    };
179
180    if is_executable {
181        tags.insert(EXECUTABLE);
182    } else {
183        tags.insert(NON_EXECUTABLE);
184    }
185
186    // Check filename-based tags
187    if let Some(filename) = path.file_name().and_then(|n| n.to_str()) {
188        let filename_tags = tags_from_filename(filename);
189        if !filename_tags.is_empty() {
190            tags.extend(filename_tags);
191        } else if is_executable {
192            // Parse shebang for executable files without recognized extensions
193            if let Ok(shebang_tags) = parse_shebang_from_file(path) {
194                tags.extend(shebang_tags);
195            }
196        }
197    }
198
199    // Check if we need to determine binary vs text
200    if !tags.iter().any(|tag| ENCODING_TAGS.contains(tag)) {
201        if file_is_text(path)? {
202            tags.insert(TEXT);
203        } else {
204            tags.insert(BINARY);
205        }
206    }
207
208    Ok(tags)
209}
210
211/// Identify a file based only on its filename.
212///
213/// This method analyzes the filename and extension to determine file type,
214/// without accessing the filesystem. It's useful when you only have the
215/// filename or want to avoid I/O operations.
216///
217/// # Arguments
218///
219/// * `filename` - The filename to analyze (can include path)
220///
221/// # Returns
222///
223/// A set of tags identifying the file type. Returns an empty set if
224/// the filename is not recognized.
225///
226/// # Examples
227///
228/// ```rust
229/// use file_identify::tags_from_filename;
230///
231/// let tags = tags_from_filename("script.py");
232/// assert!(tags.contains("python"));
233/// assert!(tags.contains("text"));
234///
235/// let tags = tags_from_filename("Dockerfile");
236/// assert!(tags.contains("dockerfile"));
237///
238/// let tags = tags_from_filename("unknown.xyz");
239/// assert!(tags.is_empty());
240/// ```
241pub fn tags_from_filename(filename: &str) -> TagSet {
242    let mut tags = TagSet::new();
243
244    // Check exact filename matches first
245    for part in std::iter::once(filename).chain(filename.split('.')) {
246        if let Some(name_tags) = NAMES.get(part) {
247            tags.extend(name_tags.iter().cloned());
248            break;
249        }
250    }
251
252    // Check file extension
253    if let Some(ext) = Path::new(filename).extension().and_then(|e| e.to_str()) {
254        let ext_lower = ext.to_lowercase();
255
256        if let Some(ext_tags) = EXTENSIONS.get(ext_lower.as_str()) {
257            tags.extend(ext_tags.iter().cloned());
258        } else if let Some(ext_tags) = EXTENSIONS_NEED_BINARY_CHECK.get(ext_lower.as_str()) {
259            tags.extend(ext_tags.iter().cloned());
260        }
261    }
262
263    tags
264}
265
266/// Identify tags based on a shebang interpreter.
267///
268/// This function analyzes interpreter names from shebang lines to determine
269/// the script type. It handles version-specific interpreters by progressively
270/// removing version suffixes.
271///
272/// # Arguments
273///
274/// * `interpreter` - The interpreter name or path from a shebang
275///
276/// # Returns
277///
278/// A set of tags for the interpreter type. Returns an empty set if
279/// the interpreter is not recognized.
280///
281/// # Examples
282///
283/// ```rust
284/// use file_identify::tags_from_interpreter;
285///
286/// let tags = tags_from_interpreter("python3.11");
287/// assert!(tags.contains("python"));
288/// assert!(tags.contains("python3"));
289///
290/// let tags = tags_from_interpreter("/usr/bin/bash");
291/// assert!(tags.contains("shell"));
292/// assert!(tags.contains("bash"));
293///
294/// let tags = tags_from_interpreter("unknown-interpreter");
295/// assert!(tags.is_empty());
296/// ```
297pub fn tags_from_interpreter(interpreter: &str) -> TagSet {
298    // Extract the interpreter name from the path
299    let interpreter_name = interpreter.split('/').next_back().unwrap_or(interpreter);
300
301    // Try progressively shorter versions (e.g., "python3.5.2" -> "python3.5" -> "python3")
302    let mut current = interpreter_name;
303    while !current.is_empty() {
304        if let Some(tags) = INTERPRETERS.get(current) {
305            return tags.clone();
306        }
307
308        // Try removing the last dot-separated part
309        match current.rfind('.') {
310            Some(pos) => current = &current[..pos],
311            None => break,
312        }
313    }
314
315    TagSet::new()
316}
317
318/// Determine if a file contains text or binary data.
319///
320/// This function reads the first 1KB of a file to determine if it contains
321/// text or binary data, using a similar algorithm to the `file` command.
322///
323/// # Arguments
324///
325/// * `path` - Path to the file to analyze
326///
327/// # Returns
328///
329/// `true` if the file appears to contain text, `false` if binary.
330///
331/// # Errors
332///
333/// Returns an error if the file cannot be opened or read.
334///
335/// # Examples
336///
337/// ```rust
338/// use file_identify::file_is_text;
339/// # use std::fs;
340/// # use tempfile::tempdir;
341///
342/// # let dir = tempdir().unwrap();
343/// # let text_path = dir.path().join("text.txt");
344/// # fs::write(&text_path, "Hello, world!").unwrap();
345/// assert!(file_is_text(&text_path).unwrap());
346///
347/// # let binary_path = dir.path().join("binary.bin");
348/// # fs::write(&binary_path, &[0x7f, 0x45, 0x4c, 0x46]).unwrap();
349/// assert!(!file_is_text(&binary_path).unwrap());
350/// ```
351pub fn file_is_text<P: AsRef<Path>>(path: P) -> Result<bool> {
352    let file = fs::File::open(path)?;
353    is_text(file)
354}
355
356/// Determine if data from a reader contains text or binary content.
357///
358/// This function reads up to 1KB from the provided reader and analyzes
359/// the bytes to determine if they represent text or binary data.
360///
361/// # Arguments
362///
363/// * `reader` - A reader providing the data to analyze
364///
365/// # Returns
366///
367/// `true` if the data appears to be text, `false` if binary.
368///
369/// # Examples
370///
371/// ```rust
372/// use file_identify::is_text;
373/// use std::io::Cursor;
374///
375/// let text_data = Cursor::new(b"Hello, world!");
376/// assert!(is_text(text_data).unwrap());
377///
378/// let binary_data = Cursor::new(&[0x7f, 0x45, 0x4c, 0x46, 0x00]);
379/// assert!(!is_text(binary_data).unwrap());
380/// ```
381pub fn is_text<R: Read>(mut reader: R) -> Result<bool> {
382    let mut buffer = [0; 1024];
383    let bytes_read = reader.read(&mut buffer)?;
384
385    // Check for null bytes or other non-text indicators
386    let text_chars: HashSet<u8> = [
387        7, 8, 9, 10, 11, 12, 13, 27, // Control chars
388    ]
389    .iter()
390    .cloned()
391    .chain(0x20..0x7F) // ASCII printable
392    .chain(0x80..=0xFF) // Extended ASCII
393    .collect();
394
395    let is_text = buffer[..bytes_read]
396        .iter()
397        .all(|&byte| text_chars.contains(&byte));
398    Ok(is_text)
399}
400
401/// Parse shebang line from an executable file and return interpreter tags.
402///
403/// This function reads the first line of an executable file to extract
404/// shebang information and determine the script interpreter.
405///
406/// # Arguments
407///
408/// * `path` - Path to the executable file
409///
410/// # Returns
411///
412/// A set of tags for the interpreter found in the shebang line.
413/// Returns an empty set if:
414/// - The file is not executable
415/// - No shebang is found
416/// - The interpreter is not recognized
417///
418/// # Errors
419///
420/// Returns an error if the file cannot be accessed or read.
421///
422/// # Examples
423///
424/// ```rust
425/// use file_identify::parse_shebang_from_file;
426/// # use std::fs;
427/// # use std::os::unix::fs::PermissionsExt;
428/// # use tempfile::tempdir;
429///
430/// # let dir = tempdir().unwrap();
431/// # let script_path = dir.path().join("script");
432/// # fs::write(&script_path, "#!/usr/bin/env python3\nprint('hello')").unwrap();
433/// # let mut perms = fs::metadata(&script_path).unwrap().permissions();
434/// # perms.set_mode(0o755);
435/// # fs::set_permissions(&script_path, perms).unwrap();
436/// let tags = parse_shebang_from_file(&script_path).unwrap();
437/// assert!(tags.contains("python"));
438/// ```
439pub fn parse_shebang_from_file<P: AsRef<Path>>(path: P) -> Result<TagSet> {
440    let path = path.as_ref();
441
442    // Only check executable files
443    let metadata = fs::metadata(path)?;
444    #[cfg(unix)]
445    {
446        use std::os::unix::fs::PermissionsExt;
447        if metadata.permissions().mode() & 0o111 == 0 {
448            return Ok(TagSet::new());
449        }
450    }
451
452    let file = fs::File::open(path)?;
453    parse_shebang(file)
454}
455
456/// Parse a shebang line from a reader and return interpreter tags.
457///
458/// This function reads the first line from the provided reader and parses
459/// it as a shebang line to determine the script interpreter.
460///
461/// # Arguments
462///
463/// * `reader` - A reader providing the file content
464///
465/// # Returns
466///
467/// A set of tags for the interpreter found in the shebang line.
468/// Returns an empty set if no valid shebang is found.
469///
470/// # Examples
471///
472/// ```rust
473/// use file_identify::parse_shebang;
474/// use std::io::Cursor;
475///
476/// let shebang = Cursor::new(b"#!/usr/bin/env python3\nprint('hello')");
477/// let tags = parse_shebang(shebang).unwrap();
478/// assert!(tags.contains("python"));
479/// assert!(tags.contains("python3"));
480///
481/// let no_shebang = Cursor::new(b"print('hello')");
482/// let tags = parse_shebang(no_shebang).unwrap();
483/// assert!(tags.is_empty());
484/// ```
485pub fn parse_shebang<R: Read>(reader: R) -> Result<TagSet> {
486    let mut buf_reader = BufReader::new(reader);
487    let mut first_line = String::new();
488    buf_reader.read_line(&mut first_line)?;
489
490    if !first_line.starts_with("#!") {
491        return Ok(TagSet::new());
492    }
493
494    // Remove the #! and clean up the line
495    let shebang_line = first_line[2..].trim();
496
497    // Parse the shebang command
498    let parts: Vec<&str> = shebang_line.split_whitespace().collect();
499    if parts.is_empty() {
500        return Ok(TagSet::new());
501    }
502
503    let cmd = if parts.len() >= 2 && parts[0] == "/usr/bin/env" {
504        if parts[1] == "-S" && parts.len() > 2 {
505            &parts[2..]
506        } else {
507            &parts[1..]
508        }
509    } else {
510        &parts
511    };
512
513    if cmd.is_empty() {
514        return Ok(TagSet::new());
515    }
516
517    // Extract interpreter name and get tags
518    let interpreter = cmd[0].split('/').next_back().unwrap_or(cmd[0]);
519    Ok(tags_from_interpreter(interpreter))
520}
521
#[cfg(test)]
mod tests {
    //! Unit tests for the identification API.
    //!
    //! Tests that need Unix permission bits are gated with `#[cfg(unix)]` so
    //! that the rest of the module still compiles on other platforms.

    use super::*;
    use std::fs;
    use std::io::Cursor;
    use tempfile::{NamedTempFile, tempdir};

    // Test tag system completeness
    #[test]
    fn test_all_basic_tags_exist() {
        assert!(TYPE_TAGS.contains("file"));
        assert!(TYPE_TAGS.contains("directory"));
        assert!(MODE_TAGS.contains("executable"));
        assert!(ENCODING_TAGS.contains("text"));
    }

    #[test]
    fn test_tag_groups_are_disjoint() {
        assert!(TYPE_TAGS.is_disjoint(&MODE_TAGS));
        assert!(TYPE_TAGS.is_disjoint(&ENCODING_TAGS));
        assert!(MODE_TAGS.is_disjoint(&ENCODING_TAGS));
    }

    // Test tags_from_filename with various scenarios
    #[test]
    fn test_tags_from_filename_basic() {
        let tags = tags_from_filename("file.py");
        assert!(tags.contains("text"));
        assert!(tags.contains("python"));
    }

    #[test]
    fn test_tags_from_filename_special_names() {
        let tags = tags_from_filename("Dockerfile");
        assert!(tags.contains("dockerfile"));
        assert!(tags.contains("text"));

        let tags = tags_from_filename("Makefile");
        assert!(tags.contains("makefile"));
        assert!(tags.contains("text"));

        let tags = tags_from_filename("Cargo.toml");
        assert!(tags.contains("toml"));
        assert!(tags.contains("cargo"));
    }

    #[test]
    fn test_tags_from_filename_case_insensitive_extension() {
        let tags = tags_from_filename("image.JPG");
        assert!(tags.contains("binary"));
        assert!(tags.contains("image"));
        assert!(tags.contains("jpeg"));
    }

    #[test]
    fn test_tags_from_filename_precedence() {
        // setup.cfg should match by name, not .cfg extension
        let tags = tags_from_filename("setup.cfg");
        assert!(tags.contains("ini"));
    }

    #[test]
    fn test_tags_from_filename_complex_names() {
        let tags = tags_from_filename("Dockerfile.xenial");
        assert!(tags.contains("dockerfile"));

        let tags = tags_from_filename("README.md");
        assert!(tags.contains("markdown"));
        assert!(tags.contains("plain-text"));
    }

    #[test]
    fn test_tags_from_filename_unrecognized() {
        let tags = tags_from_filename("unknown.xyz");
        assert!(tags.is_empty());

        let tags = tags_from_filename("noextension");
        assert!(tags.is_empty());
    }

    // Test tags_from_interpreter
    #[test]
    fn test_tags_from_interpreter_basic() {
        let tags = tags_from_interpreter("python3");
        assert!(tags.contains("python"));
        assert!(tags.contains("python3"));
    }

    #[test]
    fn test_tags_from_interpreter_versioned() {
        let tags = tags_from_interpreter("python3.11.2");
        assert!(tags.contains("python"));
        assert!(tags.contains("python3"));

        let tags = tags_from_interpreter("php8.1");
        assert!(tags.contains("php"));
        assert!(tags.contains("php8"));
    }

    #[test]
    fn test_tags_from_interpreter_with_path() {
        let tags = tags_from_interpreter("/usr/bin/python3");
        assert!(tags.contains("python"));
        assert!(tags.contains("python3"));
    }

    #[test]
    fn test_tags_from_interpreter_unrecognized() {
        let tags = tags_from_interpreter("unknown-interpreter");
        assert!(tags.is_empty());

        let tags = tags_from_interpreter("");
        assert!(tags.is_empty());
    }

    // Test is_text function
    #[test]
    fn test_is_text_basic() {
        assert!(is_text(Cursor::new(b"hello world")).unwrap());
        assert!(is_text(Cursor::new(b"")).unwrap());
        assert!(!is_text(Cursor::new(b"hello\x00world")).unwrap());
    }

    #[test]
    fn test_is_text_unicode() {
        assert!(is_text(Cursor::new("éóñəå  ⊂(◉‿◉)つ(ノ≥∇≤)ノ".as_bytes())).unwrap());
        assert!(is_text(Cursor::new(r"¯\_(ツ)_/¯".as_bytes())).unwrap());
        assert!(is_text(Cursor::new("♪┏(・o・)┛♪┗ ( ・o・) ┓♪".as_bytes())).unwrap());
    }

    #[test]
    fn test_is_text_binary_data() {
        // ELF header
        assert!(!is_text(Cursor::new(&[0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01])).unwrap());
        // Random binary data
        assert!(!is_text(Cursor::new(&[0x43, 0x92, 0xd9, 0x0f, 0xaf, 0x32, 0x2c])).unwrap());
    }

    // Test parse_shebang function
    #[test]
    fn test_parse_shebang_basic() {
        let tags = parse_shebang(Cursor::new(b"#!/usr/bin/python")).unwrap();
        assert!(tags.contains("python"));

        let tags = parse_shebang(Cursor::new(b"#!/usr/bin/env python")).unwrap();
        assert!(tags.contains("python"));
    }

    #[test]
    fn test_parse_shebang_env_with_flags() {
        let tags = parse_shebang(Cursor::new(b"#!/usr/bin/env -S python -u")).unwrap();
        assert!(tags.contains("python"));
    }

    #[test]
    fn test_parse_shebang_spaces() {
        let tags = parse_shebang(Cursor::new(b"#! /usr/bin/python")).unwrap();
        assert!(tags.contains("python"));

        let tags = parse_shebang(Cursor::new(b"#!/usr/bin/foo  python")).unwrap();
        // Should get first interpreter
        assert!(tags.is_empty()); // "foo" is not recognized
    }

    #[test]
    fn test_parse_shebang_no_shebang() {
        let tags = parse_shebang(Cursor::new(b"import sys")).unwrap();
        assert!(tags.is_empty());

        let tags = parse_shebang(Cursor::new(b"")).unwrap();
        assert!(tags.is_empty());
    }

    #[test]
    fn test_parse_shebang_invalid_utf8() {
        let result = parse_shebang(Cursor::new(&[0x23, 0x21, 0xf9, 0x93, 0x01, 0x42, 0xcd]));
        // An error is acceptable for invalid UTF-8 data; a successful parse
        // must not produce any tags.
        if let Ok(tags) = result {
            assert!(tags.is_empty());
        }
    }

    // File system tests using tempfiles
    #[test]
    fn test_tags_from_path_file_not_found() {
        let result = tags_from_path("/nonexistent/path");
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("does not exist"));
    }

    #[test]
    fn test_tags_from_path_regular_file() {
        let file = NamedTempFile::new().unwrap();
        fs::write(file.path(), "print('hello')").unwrap();

        let tags = tags_from_path(file.path()).unwrap();
        assert!(tags.contains("file"));
        assert!(tags.contains("non-executable"));
        assert!(tags.contains("text"));
    }

    // Needs Unix permission bits to mark the script as executable.
    #[cfg(unix)]
    #[test]
    fn test_tags_from_path_executable_file() {
        use std::os::unix::fs::PermissionsExt;

        let dir = tempdir().unwrap();
        let script_path = dir.path().join("script.py");
        fs::write(&script_path, "#!/usr/bin/env python3\nprint('hello')").unwrap();

        let mut perms = fs::metadata(&script_path).unwrap().permissions();
        perms.set_mode(0o755);
        fs::set_permissions(&script_path, perms).unwrap();

        let tags = tags_from_path(&script_path).unwrap();
        assert!(tags.contains("file"));
        assert!(tags.contains("executable"));
        assert!(tags.contains("python"));
        assert!(tags.contains("text"));
    }

    #[test]
    fn test_tags_from_path_directory() {
        let dir = tempdir().unwrap();
        let tags = tags_from_path(dir.path()).unwrap();
        assert_eq!(tags, HashSet::from(["directory"]));
    }

    #[test]
    fn test_tags_from_path_binary_file() {
        let dir = tempdir().unwrap();
        let binary_path = dir.path().join("binary");
        fs::write(&binary_path, &[0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01]).unwrap();

        let tags = tags_from_path(&binary_path).unwrap();
        assert!(tags.contains("file"));
        assert!(tags.contains("binary"));
        assert!(tags.contains("non-executable"));
    }

    #[test]
    fn test_file_is_text_simple() {
        let dir = tempdir().unwrap();
        let text_path = dir.path().join("text.txt");
        fs::write(&text_path, "Hello, world!").unwrap();
        assert!(file_is_text(&text_path).unwrap());
    }

    #[test]
    fn test_file_is_text_does_not_exist() {
        let result = file_is_text("/nonexistent/file");
        assert!(result.is_err());
    }

    // Test extensions that need binary check
    #[test]
    fn test_plist_binary_detection() {
        let dir = tempdir().unwrap();
        let plist_path = dir.path().join("test.plist");

        // Binary plist
        let binary_plist = [
            0x62, 0x70, 0x6c, 0x69, 0x73, 0x74, 0x30, 0x30, // "bplist00"
            0xd1, 0x01, 0x02, 0x5f, 0x10, 0x0f,
        ];
        fs::write(&plist_path, &binary_plist).unwrap();

        let tags = tags_from_path(&plist_path).unwrap();
        assert!(tags.contains("plist"));
        assert!(tags.contains("binary"));
    }

    #[test]
    fn test_plist_text_detection() {
        let dir = tempdir().unwrap();
        let plist_path = dir.path().join("test.plist");

        let text_plist = r#"<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
    <key>TestKey</key>
    <string>TestValue</string>
</dict>
</plist>"#;
        fs::write(&plist_path, text_plist).unwrap();

        let tags = tags_from_path(&plist_path).unwrap();
        assert!(tags.contains("plist"));
        assert!(tags.contains("text"));
    }

    // Additional edge case tests
    #[test]
    fn test_empty_file() {
        let dir = tempdir().unwrap();
        let empty_path = dir.path().join("empty");
        fs::write(&empty_path, "").unwrap();

        let tags = tags_from_path(&empty_path).unwrap();
        assert!(tags.contains("file"));
        assert!(tags.contains("text")); // Empty files are considered text
        assert!(tags.contains("non-executable"));
    }

    #[test]
    fn test_shebang_incomplete() {
        let shebang_incomplete = parse_shebang(Cursor::new(b"#!   \n")).unwrap();
        assert!(shebang_incomplete.is_empty());
    }

    #[test]
    fn test_multiple_extensions() {
        let tags = tags_from_filename("backup.tar.gz");
        assert!(tags.contains("binary"));
        assert!(tags.contains("gzip"));
    }

    // Additional comprehensive tests from Python version
    #[test]
    fn test_comprehensive_shebang_parsing() {
        let test_cases: [(&str, &[&str]); 8] = [
            ("", &[]),
            ("#!/usr/bin/python", &["python"]),
            ("#!/usr/bin/env python", &["python"]),
            ("#! /usr/bin/python", &["python"]),
            ("#!/usr/bin/foo  python", &[]), // "foo" not recognized
            ("#!/usr/bin/env -S python -u", &["python"]),
            ("#!/usr/bin/env", &[]),
            ("#!/usr/bin/env -S", &[]),
        ];

        for (input, expected) in test_cases.iter() {
            let tags = parse_shebang(Cursor::new(input.as_bytes())).unwrap();
            let expected_set: TagSet = expected.iter().cloned().collect();
            assert_eq!(tags, expected_set, "Failed for input: '{}'", input);
        }
    }

    #[test]
    fn test_invalid_utf8_shebang() {
        // Test that invalid UTF-8 in shebang doesn't crash
        let invalid_utf8_cases: [&[u8]; 3] = [
            &[0xf9, 0x93, 0x01, 0x42, 0xcd],
            &[0x23, 0x21, 0xf9, 0x93, 0x01, 0x42, 0xcd],
            &[0x23, 0x21, 0x00, 0x00, 0x00, 0x00],
        ];

        for input in invalid_utf8_cases.iter() {
            // Should not panic. An error is acceptable for invalid data; a
            // successful parse must not produce any tags.
            if let Ok(tags) = parse_shebang(Cursor::new(*input)) {
                assert!(tags.is_empty());
            }
        }
    }
}