file_identify/
lib.rs

1//! # file-identify
2//!
3//! A Rust library for identifying file types based on extensions, content, and shebangs.
4//!
5//! This library provides a comprehensive way to identify files by analyzing:
6//! - File extensions and special filenames
7//! - File content (binary vs text detection)
8//! - Shebang lines for executable scripts
9//! - File system metadata (permissions, file type)
10//!
11//! ## Quick Start
12//!
13//! ```rust
14//! use file_identify::{tags_from_path, tags_from_filename, FileIdentifier};
15//!
16//! // Simple filename identification
17//! let tags = tags_from_filename("script.py");
18//! assert!(tags.contains("python"));
19//! assert!(tags.contains("text"));
20//!
21//! // Full file identification from filesystem path
22//! # use std::fs;
23//! # use tempfile::tempdir;
24//! # let dir = tempdir().unwrap();
25//! # let file_path = dir.path().join("test.py");
26//! # fs::write(&file_path, "print('hello')").unwrap();
27//! let tags = tags_from_path(&file_path).unwrap();
28//! assert!(tags.contains("file"));
29//! assert!(tags.contains("python"));
30//!
31//! // Customized identification with builder pattern
32//! let identifier = FileIdentifier::new()
33//!     .skip_content_analysis()  // Skip text vs binary detection
34//!     .skip_shebang_analysis(); // Skip shebang parsing
35//!
36//! let tags = identifier.identify(&file_path).unwrap();
37//! assert!(tags.contains("file"));
38//! assert!(tags.contains("python"));
39//! ```
40//!
41//! ## Tag System
42//!
43//! Files are identified using a set of standardized tags:
44//!
45//! - **Type tags**: `file`, `directory`, `symlink`, `socket`
46//! - **Mode tags**: `executable`, `non-executable`
47//! - **Encoding tags**: `text`, `binary`
48//! - **Language/format tags**: `python`, `javascript`, `json`, `xml`, etc.
49//!
50//! ## Error Handling
51//!
52//! Functions that access the filesystem return [`Result`] types. The main error
53//! conditions are:
54//!
55//! - [`IdentifyError::PathNotFound`] - when the specified path doesn't exist
56//! - [`IdentifyError::IoError`] - for other I/O related errors
57
58use std::collections::HashSet;
59use std::fmt;
60use std::fs;
61use std::io::{BufReader, Read};
62use std::path::Path;
63
64pub mod extensions;
65pub mod interpreters;
66pub mod tags;
67
/// An immutable, ordered list of shebang components, modelled on Python's `tuple[str, ...]`.
///
/// The `parse_shebang*` functions return this type so callers get a read-only,
/// indexable view of the command found on a shebang line.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ShebangTuple {
    components: Box<[String]>,
}

impl ShebangTuple {
    /// Build the empty tuple (Python's `()`).
    pub fn new() -> Self {
        Self::from_vec(Vec::new())
    }

    /// Freeze an owned vector of strings into a tuple.
    pub fn from_vec(vec: Vec<String>) -> Self {
        let components = vec.into_boxed_slice();
        Self { components }
    }

    /// Number of components (Python's `len(tuple)`).
    pub const fn len(&self) -> usize {
        self.components.len()
    }

    /// `true` when the tuple holds no components (Python's `not tuple`).
    pub const fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Checked positional access (Python's `tuple[index]`).
    ///
    /// Returns `None` instead of panicking when `index` is out of bounds.
    pub fn get(&self, index: usize) -> Option<&str> {
        let component = self.components.get(index)?;
        Some(component.as_str())
    }

    /// The first component, or `None` for the empty tuple.
    pub fn first(&self) -> Option<&str> {
        self.components.first().map(String::as_str)
    }

    /// Consume the tuple and hand back its components as a `Vec`.
    pub fn into_vec(self) -> Vec<String> {
        Vec::from(self.components)
    }

    /// Borrowing iterator over the components (Python's `for item in tuple`).
    pub fn iter(&self) -> std::slice::Iter<'_, String> {
        self.as_slice().iter()
    }

    /// View the components as a slice, e.g. for slice-pattern matching.
    pub fn as_slice(&self) -> &[String] {
        &self.components[..]
    }
}

// `tuple[index]` syntax; panics on an out-of-bounds index like slice indexing does.
impl std::ops::Index<usize> for ShebangTuple {
    type Output = str;

    fn index(&self, index: usize) -> &Self::Output {
        self.components[index].as_str()
    }
}

// Allows `for component in &tuple { ... }`.
impl<'a> IntoIterator for &'a ShebangTuple {
    type Item = &'a String;
    type IntoIter = std::slice::Iter<'a, String>;

    fn into_iter(self) -> Self::IntoIter {
        self.iter()
    }
}

// Allows `.collect::<ShebangTuple>()` on any iterator of `String`s.
impl FromIterator<String> for ShebangTuple {
    fn from_iter<T: IntoIterator<Item = String>>(iter: T) -> Self {
        let collected: Vec<String> = iter.into_iter().collect();
        Self::from_vec(collected)
    }
}

// Renders like Python's `str(tuple)`: `()`, `('a',)`, `('a', 'b')`.
impl fmt::Display for ShebangTuple {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("(")?;

        let mut remaining = self.components.iter();
        if let Some(head) = remaining.next() {
            write!(f, "'{head}'")?;
            for component in remaining {
                write!(f, ", '{component}'")?;
            }
        }

        // Python marks a one-element tuple with a trailing comma.
        if self.len() == 1 {
            f.write_str(",")?;
        }
        f.write_str(")")
    }
}

// Conversion from an owned vector of components.
impl From<Vec<String>> for ShebangTuple {
    fn from(vec: Vec<String>) -> Self {
        Self {
            components: vec.into_boxed_slice(),
        }
    }
}

// The default value is the empty tuple.
impl Default for ShebangTuple {
    fn default() -> Self {
        Self::from_vec(Vec::new())
    }
}
185
186use extensions::{get_extension_tags, get_extensions_need_binary_check_tags, get_name_tags};
187use interpreters::get_interpreter_tags;
188use tags::*;
189
190/// Configuration for file identification behavior.
191///
192/// Allows customizing which analysis steps to perform and their order.
193/// Use `FileIdentifier::new()` to create a builder and customize identification.
194#[derive(Debug, Clone)]
195pub struct FileIdentifier {
196    skip_content_analysis: bool,
197    skip_shebang_analysis: bool,
198    custom_extensions: Option<std::collections::HashMap<String, TagSet>>,
199}
200
201impl Default for FileIdentifier {
202    fn default() -> Self {
203        Self::new()
204    }
205}
206
207impl FileIdentifier {
208    /// Create a new file identifier with default settings.
209    ///
210    /// By default, all analysis steps are enabled:
211    /// - File system metadata analysis
212    /// - Filename and extension analysis  
213    /// - Shebang analysis for executable files
214    /// - Content analysis (text vs binary detection)
215    pub fn new() -> Self {
216        Self {
217            skip_content_analysis: false,
218            skip_shebang_analysis: false,
219            custom_extensions: None,
220        }
221    }
222
223    /// Skip content analysis (text vs binary detection).
224    ///
225    /// This avoids reading file contents, making identification faster
226    /// but potentially less accurate for files without clear extension/filename patterns.
227    pub fn skip_content_analysis(mut self) -> Self {
228        self.skip_content_analysis = true;
229        self
230    }
231
232    /// Skip shebang analysis for executable files.
233    ///
234    /// This avoids parsing shebang lines, making identification faster
235    /// but less accurate for executable scripts without recognized extensions.
236    pub fn skip_shebang_analysis(mut self) -> Self {
237        self.skip_shebang_analysis = true;
238        self
239    }
240
241    /// Add custom file extension mappings.
242    ///
243    /// These will be checked before the built-in extension mappings.
244    /// Useful for organization-specific or project-specific file types.
245    pub fn with_custom_extensions(
246        mut self,
247        extensions: std::collections::HashMap<String, TagSet>,
248    ) -> Self {
249        self.custom_extensions = Some(extensions);
250        self
251    }
252
253    /// Identify a file using the configured settings.
254    ///
255    /// This is equivalent to `tags_from_path` but with customizable behavior.
256    pub fn identify<P: AsRef<Path>>(&self, path: P) -> Result<TagSet> {
257        self.identify_with_config(path)
258    }
259
260    fn identify_with_config<P: AsRef<Path>>(&self, path: P) -> Result<TagSet> {
261        let path = path.as_ref();
262        let path_str = path.to_string_lossy();
263
264        // Get file metadata
265        let metadata = match fs::symlink_metadata(path) {
266            Ok(meta) => meta,
267            Err(_) => {
268                return Err(IdentifyError::PathNotFound {
269                    path: path_str.to_string(),
270                });
271            }
272        };
273
274        // Step 1: Check for non-regular file types (directory, symlink, socket)
275        if let Some(file_type_tags) = analyze_file_type(&metadata) {
276            return Ok(file_type_tags);
277        }
278
279        // Step 2: This is a regular file - start building tag set
280        let mut tags = TagSet::new();
281        tags.insert(FILE);
282
283        // Step 3: Analyze permissions (executable vs non-executable)
284        let is_executable = analyze_permissions(path, &metadata);
285        if is_executable {
286            tags.insert(EXECUTABLE);
287        } else {
288            tags.insert(NON_EXECUTABLE);
289        }
290
291        // Step 4: Analyze filename and potentially shebang (with custom config)
292        let filename_and_shebang_tags =
293            self.analyze_filename_and_shebang_configured(path, is_executable);
294        tags.extend(filename_and_shebang_tags);
295
296        // Step 5: Analyze content encoding (text vs binary) if not skipped and not already determined
297        if !self.skip_content_analysis {
298            let encoding_tags = analyze_content_encoding(path, &tags)?;
299            tags.extend(encoding_tags);
300        }
301
302        Ok(tags)
303    }
304
305    fn analyze_filename_and_shebang_configured<P: AsRef<Path>>(
306        &self,
307        path: P,
308        is_executable: bool,
309    ) -> TagSet {
310        let path = path.as_ref();
311        let mut tags = TagSet::new();
312
313        // Check filename-based tags first (including custom extensions)
314        if let Some(filename) = path.file_name().and_then(|n| n.to_str()) {
315            // Check custom extensions first if provided
316            if let Some(custom_exts) = &self.custom_extensions {
317                if let Some(ext) = Path::new(filename).extension().and_then(|e| e.to_str()) {
318                    let ext_lower = ext.to_lowercase();
319                    if let Some(ext_tags) = custom_exts.get(&ext_lower) {
320                        tags.extend(ext_tags.iter().cloned());
321                        return tags; // Custom extension takes precedence
322                    }
323                }
324            }
325
326            // Fall back to standard filename analysis
327            let filename_tags = tags_from_filename(filename);
328            if !filename_tags.is_empty() {
329                tags.extend(filename_tags);
330            } else if is_executable && !self.skip_shebang_analysis {
331                // Parse shebang for executable files without recognized extensions
332                if let Ok(shebang_components) = parse_shebang_from_file(path) {
333                    if !shebang_components.is_empty() {
334                        let interpreter_tags = tags_from_interpreter(&shebang_components[0]);
335                        tags.extend(interpreter_tags);
336                    }
337                }
338            }
339        }
340
341        tags
342    }
343}
344
/// Result type for file identification operations.
///
/// This is a convenience type alias for operations that may fail with
/// file system or parsing errors; the error type is always [`IdentifyError`].
///
/// Inside this module the alias shadows the prelude's two-parameter `Result`,
/// which is why the right-hand side spells out `std::result::Result`.
pub type Result<T> = std::result::Result<T, IdentifyError>;
350
/// Errors that can occur during file identification.
///
/// The `Display` text of each variant is the string given in its
/// `#[error(...)]` attribute.
#[derive(thiserror::Error, Debug)]
pub enum IdentifyError {
    /// The specified path does not exist on the filesystem.
    ///
    /// NOTE(review): `tags_from_path` and `FileIdentifier::identify` return
    /// this for any `fs::symlink_metadata` failure, not only a missing path.
    #[error("{path} does not exist.")]
    PathNotFound {
        /// Lossy string rendering of the path that could not be inspected.
        path: String,
    },

    /// An I/O error occurred while accessing the file.
    ///
    /// `#[from]` lets `?` convert a `std::io::Error` into this variant.
    #[error("IO error: {source}")]
    IoError {
        /// The underlying I/O error.
        #[from]
        source: std::io::Error,
    },

    /// The file path contains invalid UTF-8 sequences.
    ///
    /// NOTE(review): not constructed anywhere in the part of this file that
    /// was reviewed — confirm whether it is still needed.
    #[error("Path contains invalid UTF-8: {path}")]
    InvalidPath {
        /// String rendering of the offending path.
        path: String,
    },

    /// The file content is not valid UTF-8 when UTF-8 is expected.
    ///
    /// NOTE(review): not constructed anywhere in the part of this file that
    /// was reviewed — confirm whether it is still needed.
    #[error("File contains invalid UTF-8 content")]
    InvalidUtf8,
}
373
374/// Analyze file system metadata to determine basic file type.
375///
376/// Returns tags for directory, symlink, socket, or file based on metadata.
377/// This is the first step in file identification.
378fn analyze_file_type(metadata: &std::fs::Metadata) -> Option<TagSet> {
379    let file_type = metadata.file_type();
380
381    if file_type.is_dir() {
382        return Some([DIRECTORY].iter().cloned().collect());
383    }
384    if file_type.is_symlink() {
385        return Some([SYMLINK].iter().cloned().collect());
386    }
387
388    // Check for socket (Unix-specific)
389    #[cfg(unix)]
390    {
391        use std::os::unix::fs::FileTypeExt;
392        if file_type.is_socket() {
393            return Some([SOCKET].iter().cloned().collect());
394        }
395    }
396
397    // Regular file - continue with further analysis
398    None
399}
400
/// Decide whether a file counts as executable.
///
/// On Unix this is true when any of the owner/group/other execute bits is set
/// in `metadata`. On other platforms the metadata is ignored and the file is
/// considered executable when its extension is `exe`, `bat` or `cmd`
/// (compared case-insensitively).
fn analyze_permissions<P: AsRef<Path>>(path: P, metadata: &std::fs::Metadata) -> bool {
    #[cfg(unix)]
    {
        use std::os::unix::fs::PermissionsExt;
        // The path is only consulted on non-Unix targets.
        let _ = path;
        let exec_bits = metadata.permissions().mode() & 0o111;
        exec_bits != 0
    }
    #[cfg(not(unix))]
    {
        // Permission bits are not consulted on non-Unix targets.
        let _ = metadata;
        match path.as_ref().extension().and_then(|ext| ext.to_str()) {
            Some(ext) => {
                let ext = ext.to_lowercase();
                ext == "exe" || ext == "bat" || ext == "cmd"
            }
            None => false,
        }
    }
}
423
424/// Analyze filename and potentially shebang for file type identification.
425///
426/// First tries filename-based identification. If that fails and the file is executable,
427/// falls back to shebang analysis.
428fn analyze_filename_and_shebang<P: AsRef<Path>>(path: P, is_executable: bool) -> TagSet {
429    let path = path.as_ref();
430    let mut tags = TagSet::new();
431
432    // Check filename-based tags first
433    if let Some(filename) = path.file_name().and_then(|n| n.to_str()) {
434        let filename_tags = tags_from_filename(filename);
435        if !filename_tags.is_empty() {
436            tags.extend(filename_tags);
437        } else if is_executable {
438            // Parse shebang for executable files without recognized extensions
439            if let Ok(shebang_components) = parse_shebang_from_file(path) {
440                if !shebang_components.is_empty() {
441                    let interpreter_tags = tags_from_interpreter(&shebang_components[0]);
442                    tags.extend(interpreter_tags);
443                }
444            }
445        }
446    }
447
448    tags
449}
450
451/// Analyze file content to determine encoding (text vs binary).
452///
453/// Only performs analysis if encoding tags are not already present.
454fn analyze_content_encoding<P: AsRef<Path>>(path: P, existing_tags: &TagSet) -> Result<TagSet> {
455    let mut tags = TagSet::new();
456
457    // Check if we need to determine binary vs text
458    if !existing_tags.iter().any(|tag| ENCODING_TAGS.contains(tag)) {
459        if file_is_text(path)? {
460            tags.insert(TEXT);
461        } else {
462            tags.insert(BINARY);
463        }
464    }
465
466    Ok(tags)
467}
468
469/// Identify a file from its filesystem path.
470///
471/// This is the most comprehensive identification method, providing a superset
472/// of information from other methods. It analyzes:
473///
474/// 1. File type (regular file, directory, symlink, socket)
475/// 2. File permissions (executable vs non-executable)
476/// 3. Filename and extension patterns
477/// 4. File content (binary vs text detection)
478/// 5. Shebang lines for executable files
479///
480/// # Arguments
481///
482/// * `path` - Path to the file to identify
483///
484/// # Returns
485///
486/// A set of tags identifying the file type and characteristics.
487///
488/// # Errors
489///
490/// Returns [`IdentifyError::PathNotFound`] if the path doesn't exist, or
491/// [`IdentifyError::IoError`] for other I/O failures.
492///
493/// # Examples
494///
495/// ```rust
496/// use file_identify::tags_from_path;
497/// # use std::fs;
498/// # use tempfile::tempdir;
499///
500/// # let dir = tempdir().unwrap();
501/// # let file_path = dir.path().join("script.py");
502/// # fs::write(&file_path, "#!/usr/bin/env python3\nprint('hello')").unwrap();
503/// let tags = tags_from_path(&file_path).unwrap();
504/// assert!(tags.contains("file"));
505/// assert!(tags.contains("python"));
506/// assert!(tags.contains("text"));
507/// ```
508pub fn tags_from_path<P: AsRef<Path>>(path: P) -> Result<TagSet> {
509    let path = path.as_ref();
510    let path_str = path.to_string_lossy();
511
512    // Get file metadata
513    let metadata = match fs::symlink_metadata(path) {
514        Ok(meta) => meta,
515        Err(_) => {
516            return Err(IdentifyError::PathNotFound {
517                path: path_str.to_string(),
518            });
519        }
520    };
521
522    // Step 1: Check for non-regular file types (directory, symlink, socket)
523    if let Some(file_type_tags) = analyze_file_type(&metadata) {
524        return Ok(file_type_tags);
525    }
526
527    // Step 2: This is a regular file - start building tag set
528    let mut tags = TagSet::new();
529    tags.insert(FILE);
530
531    // Step 3: Analyze permissions (executable vs non-executable)
532    let is_executable = analyze_permissions(path, &metadata);
533    if is_executable {
534        tags.insert(EXECUTABLE);
535    } else {
536        tags.insert(NON_EXECUTABLE);
537    }
538
539    // Step 4: Analyze filename and potentially shebang
540    let filename_and_shebang_tags = analyze_filename_and_shebang(path, is_executable);
541    tags.extend(filename_and_shebang_tags);
542
543    // Step 5: Analyze content encoding (text vs binary) if not already determined
544    let encoding_tags = analyze_content_encoding(path, &tags)?;
545    tags.extend(encoding_tags);
546
547    Ok(tags)
548}
549
550/// Identify a file based only on its filename.
551///
552/// This method analyzes the filename and extension to determine file type,
553/// without accessing the filesystem. It's useful when you only have the
554/// filename or want to avoid I/O operations.
555///
556/// # Arguments
557///
558/// * `filename` - The filename to analyze (can include path)
559///
560/// # Returns
561///
562/// A set of tags identifying the file type. Returns an empty set if
563/// the filename is not recognized.
564///
565/// # Examples
566///
567/// ```rust
568/// use file_identify::tags_from_filename;
569///
570/// let tags = tags_from_filename("script.py");
571/// assert!(tags.contains("python"));
572/// assert!(tags.contains("text"));
573///
574/// let tags = tags_from_filename("Dockerfile");
575/// assert!(tags.contains("dockerfile"));
576///
577/// let tags = tags_from_filename("unknown.xyz");
578/// assert!(tags.is_empty());
579/// ```
580pub fn tags_from_filename(filename: &str) -> TagSet {
581    let mut tags = TagSet::new();
582
583    // Check exact filename matches first
584    for part in std::iter::once(filename).chain(filename.split('.')) {
585        let name_tags = get_name_tags(part);
586        if !name_tags.is_empty() {
587            tags.extend(name_tags);
588            break;
589        }
590    }
591
592    // Check file extension
593    if let Some(ext) = Path::new(filename).extension().and_then(|e| e.to_str()) {
594        let ext_lower = ext.to_lowercase();
595
596        let ext_tags = get_extension_tags(&ext_lower);
597        if !ext_tags.is_empty() {
598            tags.extend(ext_tags);
599        } else {
600            let binary_check_tags = get_extensions_need_binary_check_tags(&ext_lower);
601            if !binary_check_tags.is_empty() {
602                tags.extend(binary_check_tags);
603            }
604        }
605    }
606
607    tags
608}
609
610/// Identify tags based on a shebang interpreter.
611///
612/// This function analyzes interpreter names from shebang lines to determine
613/// the script type. It handles version-specific interpreters by progressively
614/// removing version suffixes.
615///
616/// # Arguments
617///
618/// * `interpreter` - The interpreter name or path from a shebang
619///
620/// # Returns
621///
622/// A set of tags for the interpreter type. Returns an empty set if
623/// the interpreter is not recognized.
624///
625/// # Examples
626///
627/// ```rust
628/// use file_identify::tags_from_interpreter;
629///
630/// let tags = tags_from_interpreter("python3.11");
631/// assert!(tags.contains("python"));
632/// assert!(tags.contains("python3"));
633///
634/// let tags = tags_from_interpreter("/usr/bin/bash");
635/// assert!(tags.contains("shell"));
636/// assert!(tags.contains("bash"));
637///
638/// let tags = tags_from_interpreter("unknown-interpreter");
639/// assert!(tags.is_empty());
640/// ```
641pub fn tags_from_interpreter(interpreter: &str) -> TagSet {
642    // Extract the interpreter name from the path
643    let interpreter_name = interpreter.split('/').next_back().unwrap_or(interpreter);
644
645    // Try progressively shorter versions (e.g., "python3.5.2" -> "python3.5" -> "python3")
646    let mut current = interpreter_name;
647    while !current.is_empty() {
648        let tags = get_interpreter_tags(current);
649        if !tags.is_empty() {
650            return tags;
651        }
652
653        // Try removing the last dot-separated part
654        match current.rfind('.') {
655            Some(pos) => current = &current[..pos],
656            None => break,
657        }
658    }
659
660    TagSet::new()
661}
662
663/// Determine if a file contains text or binary data.
664///
665/// This function reads the first 1KB of a file to determine if it contains
666/// text or binary data, using a similar algorithm to the `file` command.
667///
668/// # Arguments
669///
670/// * `path` - Path to the file to analyze
671///
672/// # Returns
673///
674/// `true` if the file appears to contain text, `false` if binary.
675///
676/// # Errors
677///
678/// Returns an error if the file cannot be opened or read.
679///
680/// # Examples
681///
682/// ```rust
683/// use file_identify::file_is_text;
684/// # use std::fs;
685/// # use tempfile::tempdir;
686///
687/// # let dir = tempdir().unwrap();
688/// # let text_path = dir.path().join("text.txt");
689/// # fs::write(&text_path, "Hello, world!").unwrap();
690/// assert!(file_is_text(&text_path).unwrap());
691///
692/// # let binary_path = dir.path().join("binary.bin");
693/// # fs::write(&binary_path, &[0x7f, 0x45, 0x4c, 0x46]).unwrap();
694/// assert!(!file_is_text(&binary_path).unwrap());
695/// ```
696pub fn file_is_text<P: AsRef<Path>>(path: P) -> Result<bool> {
697    let file = fs::File::open(path)?;
698    is_text(file)
699}
700
701/// Determine if data from a reader contains text or binary content.
702///
703/// This function reads up to 1KB from the provided reader and analyzes
704/// the bytes to determine if they represent text or binary data.
705///
706/// # Arguments
707///
708/// * `reader` - A reader providing the data to analyze
709///
710/// # Returns
711///
712/// `true` if the data appears to be text, `false` if binary.
713///
714/// # Examples
715///
716/// ```rust
717/// use file_identify::is_text;
718/// use std::io::Cursor;
719///
720/// let text_data = Cursor::new(b"Hello, world!");
721/// assert!(is_text(text_data).unwrap());
722///
723/// let binary_data = Cursor::new(&[0x7f, 0x45, 0x4c, 0x46, 0x00]);
724/// assert!(!is_text(binary_data).unwrap());
725/// ```
726pub fn is_text<R: Read>(mut reader: R) -> Result<bool> {
727    let mut buffer = [0; 1024];
728    let bytes_read = reader.read(&mut buffer)?;
729
730    // Check for null bytes or other non-text indicators
731    let text_chars: HashSet<u8> = [
732        7, 8, 9, 10, 11, 12, 13, 27, // Control chars
733    ]
734    .iter()
735    .cloned()
736    .chain(0x20..0x7F) // ASCII printable
737    .chain(0x80..=0xFF) // Extended ASCII
738    .collect();
739
740    let is_text = buffer[..bytes_read]
741        .iter()
742        .all(|&byte| text_chars.contains(&byte));
743    Ok(is_text)
744}
745
746/// Parse shebang line from an executable file and return raw shebang components.
747///
748/// This function reads the first line of an executable file to extract
749/// shebang information and return the raw command components, similar to
750/// Python's identify.parse_shebang_from_file().
751///
752/// # Arguments
753///
754/// * `path` - Path to the executable file
755///
756/// # Returns
757///
758/// A vector of raw shebang components. Returns an empty vector if:
759/// - The file is not executable
760/// - No shebang is found
761///
762/// # Errors
763///
764/// Returns an error if the file cannot be accessed or read.
765///
766/// # Examples
767///
768/// ```rust
769/// use file_identify::parse_shebang_from_file;
770/// # use std::fs;
771/// # use std::os::unix::fs::PermissionsExt;
772/// # use tempfile::tempdir;
773///
774/// # let dir = tempdir().unwrap();
775/// # let script_path = dir.path().join("script");
776/// # fs::write(&script_path, "#!/usr/bin/env python3\nprint('hello')").unwrap();
777/// # let mut perms = fs::metadata(&script_path).unwrap().permissions();
778/// # perms.set_mode(0o755);
779/// # fs::set_permissions(&script_path, perms).unwrap();
780/// let shebang = parse_shebang_from_file(&script_path).unwrap();
781/// assert_eq!(shebang.get(0).unwrap(), "python3");
782/// ```
783pub fn parse_shebang_from_file<P: AsRef<Path>>(path: P) -> Result<ShebangTuple> {
784    let path = path.as_ref();
785
786    // Only check executable files
787    let metadata = fs::metadata(path)?;
788    #[cfg(unix)]
789    {
790        use std::os::unix::fs::PermissionsExt;
791        if metadata.permissions().mode() & 0o111 == 0 {
792            return Ok(ShebangTuple::new());
793        }
794    }
795
796    let file = fs::File::open(path)?;
797    parse_shebang(file)
798}
799
800/// Parse a shebang line from a reader and return raw shebang components.
801///
802/// This function reads the first line from the provided reader and parses
803/// it as a shebang line to extract raw command components, similar to
804/// Python's identify.parse_shebang().
805///
806/// # Arguments
807///
808/// * `reader` - A reader providing the file content
809///
810/// # Returns
811///
812/// A vector of raw shebang components. Returns an empty vector if no valid shebang is found.
813///
814/// # Examples
815///
816/// ```rust
817/// use file_identify::parse_shebang;
818/// use std::io::Cursor;
819///
820/// let shebang = Cursor::new(b"#!/usr/bin/env python3\nprint('hello')");
821/// let components = parse_shebang(shebang).unwrap();
822/// assert_eq!(components.get(0).unwrap(), "python3");
823///
824/// let no_shebang = Cursor::new(b"print('hello')");
825/// let components = parse_shebang(no_shebang).unwrap();
826/// assert!(components.is_empty());
827/// ```
828pub fn parse_shebang<R: Read>(reader: R) -> Result<ShebangTuple> {
829    use std::io::BufRead;
830
831    let mut buf_reader = BufReader::new(reader);
832
833    // Read first line efficiently using read_until
834    let mut first_line_bytes = Vec::new();
835    match buf_reader.read_until(b'\n', &mut first_line_bytes) {
836        Ok(0) => return Ok(ShebangTuple::new()), // EOF with no data
837        Ok(_) => {
838            // Remove trailing newline if present
839            if first_line_bytes.ends_with(b"\n") {
840                first_line_bytes.pop();
841            }
842            // Also handle \r\n line endings
843            if first_line_bytes.ends_with(b"\r") {
844                first_line_bytes.pop();
845            }
846        }
847        Err(_) => return Ok(ShebangTuple::new()), // Read error
848    }
849
850    // Check if starts with shebang
851    if first_line_bytes.len() < 2 || &first_line_bytes[0..2] != b"#!" {
852        return Ok(ShebangTuple::new());
853    }
854
855    // Limit line length to prevent memory issues
856    if first_line_bytes.len() > 1024 {
857        first_line_bytes.truncate(1024);
858    }
859
860    // Try to decode as UTF-8, return empty if invalid (like Python does)
861    let first_line = match String::from_utf8(first_line_bytes) {
862        Ok(line) => line,
863        Err(_) => return Ok(ShebangTuple::new()),
864    };
865
866    // Remove the #! and clean up the line
867    let shebang_line = first_line[2..].trim();
868
869    // Check for only printable ASCII (like Python does)
870    for c in shebang_line.chars() {
871        if !c.is_ascii() || (c.is_control() && c != '\t') {
872            return Ok(ShebangTuple::new());
873        }
874    }
875
876    // Parse the shebang command using simple split (like Python's shlex fallback)
877    let parts: smallvec::SmallVec<[&str; 4]> = shebang_line.split_whitespace().collect();
878    if parts.is_empty() {
879        return Ok(ShebangTuple::new());
880    }
881
882    let cmd: smallvec::SmallVec<[&str; 2]> = if parts[0] == "/usr/bin/env" {
883        if parts.len() == 1 {
884            // Just "#!/usr/bin/env" with no interpreter
885            smallvec::SmallVec::new()
886        } else if parts.len() >= 2 && parts[1] == "-S" {
887            if parts.len() > 2 {
888                parts[2..].iter().copied().collect()
889            } else {
890                // Just "#!/usr/bin/env -S" with no interpreter
891                smallvec::SmallVec::new()
892            }
893        } else {
894            parts[1..].iter().copied().collect()
895        }
896    } else {
897        parts.iter().copied().collect()
898    };
899
900    if cmd.is_empty() {
901        return Ok(ShebangTuple::new());
902    }
903
904    // Return the raw command components as strings
905    Ok(ShebangTuple::from_vec(
906        cmd.iter().map(|s| s.to_string()).collect(),
907    ))
908}
909
#[cfg(test)]
mod tests {
    //! Unit tests for the public identification API.
    //!
    //! Covers the tag-group constants, filename and interpreter lookups,
    //! text/binary detection, shebang parsing, path-based identification
    //! against real temporary files, and the `FileIdentifier` builder.

    use super::*;
    use std::fs;
    use std::io::Cursor;
    // `PermissionsExt::set_mode` only exists on Unix targets; gating the import
    // (and the tests that need it) keeps the rest of the suite compilable elsewhere.
    #[cfg(unix)]
    use std::os::unix::fs::PermissionsExt;
    use tempfile::{NamedTempFile, tempdir};

    /// First bytes of an ELF executable, used as a known-binary fixture.
    const ELF_HEADER: [u8; 7] = [0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01];

    // Helper macro to create a ShebangTuple from string slices for testing.
    // Accepts zero or more expressions, with an optional trailing comma.
    macro_rules! shebang_tuple {
        () => {
            ShebangTuple::new()
        };
        ($($item:expr),+ $(,)?) => {
            ShebangTuple::from_vec(vec![$($item.to_string()),+])
        };
    }

    /// Sets mode `0o755` on `path` so the file is reported as executable.
    #[cfg(unix)]
    fn make_executable(path: &std::path::Path) {
        let mut perms = fs::metadata(path).unwrap().permissions();
        perms.set_mode(0o755);
        fs::set_permissions(path, perms).unwrap();
    }

    // ---------------------------------------------------------------------
    // Tag system completeness
    // ---------------------------------------------------------------------

    /// The basic tags must be present in their respective groups.
    #[test]
    fn test_all_basic_tags_exist() {
        assert!(TYPE_TAGS.contains("file"));
        assert!(TYPE_TAGS.contains("directory"));
        assert!(MODE_TAGS.contains("executable"));
        assert!(ENCODING_TAGS.contains("text"));
    }

    /// No tag may belong to more than one of the type/mode/encoding groups.
    #[test]
    fn test_tag_groups_are_disjoint() {
        assert!(TYPE_TAGS.is_disjoint(&MODE_TAGS));
        assert!(TYPE_TAGS.is_disjoint(&ENCODING_TAGS));
        assert!(MODE_TAGS.is_disjoint(&ENCODING_TAGS));
    }

    // ---------------------------------------------------------------------
    // tags_from_filename
    // ---------------------------------------------------------------------

    /// A plain `.py` filename yields the language tag and the `text` tag.
    #[test]
    fn test_tags_from_filename_basic() {
        let tags = tags_from_filename("file.py");
        assert!(tags.contains("text"));
        assert!(tags.contains("python"));
    }

    /// Well-known filenames are recognized without relying on an extension.
    #[test]
    fn test_tags_from_filename_special_names() {
        let tags = tags_from_filename("Dockerfile");
        assert!(tags.contains("dockerfile"));
        assert!(tags.contains("text"));

        let tags = tags_from_filename("Makefile");
        assert!(tags.contains("makefile"));
        assert!(tags.contains("text"));

        let tags = tags_from_filename("Cargo.toml");
        assert!(tags.contains("toml"));
        assert!(tags.contains("cargo"));
    }

    /// Extension lookup ignores case (`.JPG` behaves like `.jpg`).
    #[test]
    fn test_tags_from_filename_case_insensitive_extension() {
        let tags = tags_from_filename("image.JPG");
        assert!(tags.contains("binary"));
        assert!(tags.contains("image"));
        assert!(tags.contains("jpeg"));
    }

    #[test]
    fn test_tags_from_filename_precedence() {
        // setup.cfg should match by name, not .cfg extension
        let tags = tags_from_filename("setup.cfg");
        assert!(tags.contains("ini"));
    }

    /// Names with a recognized leading part or a compound form still resolve.
    #[test]
    fn test_tags_from_filename_complex_names() {
        let tags = tags_from_filename("Dockerfile.xenial");
        assert!(tags.contains("dockerfile"));

        let tags = tags_from_filename("README.md");
        assert!(tags.contains("markdown"));
        assert!(tags.contains("plain-text"));
    }

    /// Unknown extensions and extension-less unknown names yield no tags.
    #[test]
    fn test_tags_from_filename_unrecognized() {
        let tags = tags_from_filename("unknown.xyz");
        assert!(tags.is_empty());

        let tags = tags_from_filename("noextension");
        assert!(tags.is_empty());
    }

    /// Only the final extension of a multi-extension name needs to match.
    #[test]
    fn test_multiple_extensions() {
        let tags = tags_from_filename("backup.tar.gz");
        assert!(tags.contains("binary"));
        assert!(tags.contains("gzip"));
    }

    // ---------------------------------------------------------------------
    // tags_from_interpreter
    // ---------------------------------------------------------------------

    /// `python3` yields both the generic and the major-version tag.
    #[test]
    fn test_tags_from_interpreter_basic() {
        let tags = tags_from_interpreter("python3");
        assert!(tags.contains("python"));
        assert!(tags.contains("python3"));
    }

    /// Minor/patch version suffixes are reduced to the major-version tag.
    #[test]
    fn test_tags_from_interpreter_versioned() {
        let tags = tags_from_interpreter("python3.11.2");
        assert!(tags.contains("python"));
        assert!(tags.contains("python3"));

        let tags = tags_from_interpreter("php8.1");
        assert!(tags.contains("php"));
        assert!(tags.contains("php8"));
    }

    /// A full interpreter path is identified by its final component.
    #[test]
    fn test_tags_from_interpreter_with_path() {
        let tags = tags_from_interpreter("/usr/bin/python3");
        assert!(tags.contains("python"));
        assert!(tags.contains("python3"));
    }

    /// Unknown or empty interpreter names yield no tags.
    #[test]
    fn test_tags_from_interpreter_unrecognized() {
        let tags = tags_from_interpreter("unknown-interpreter");
        assert!(tags.is_empty());

        let tags = tags_from_interpreter("");
        assert!(tags.is_empty());
    }

    // ---------------------------------------------------------------------
    // is_text / file_is_text
    // ---------------------------------------------------------------------

    /// ASCII and empty input are text; an embedded NUL byte makes it binary.
    #[test]
    fn test_is_text_basic() {
        assert!(is_text(Cursor::new(b"hello world")).unwrap());
        assert!(is_text(Cursor::new(b"")).unwrap());
        assert!(!is_text(Cursor::new(b"hello\x00world")).unwrap());
    }

    /// Non-ASCII UTF-8 content is still classified as text.
    #[test]
    fn test_is_text_unicode() {
        assert!(is_text(Cursor::new("éóñəå  ⊂(◉‿◉)つ(ノ≥∇≤)ノ".as_bytes())).unwrap());
        assert!(is_text(Cursor::new(r"¯\_(ツ)_/¯".as_bytes())).unwrap());
        assert!(is_text(Cursor::new("♪┏(・o･)┛♪┗ ( ･o･) ┓♪".as_bytes())).unwrap());
    }

    /// Typical binary payloads are rejected.
    #[test]
    fn test_is_text_binary_data() {
        // ELF header
        assert!(!is_text(Cursor::new(&ELF_HEADER)).unwrap());
        // Random binary data
        assert!(!is_text(Cursor::new(&[0x43, 0x92, 0xd9, 0x0f, 0xaf, 0x32, 0x2c])).unwrap());
    }

    #[test]
    fn test_file_is_text_simple() {
        let dir = tempdir().unwrap();
        let text_path = dir.path().join("text.txt");
        fs::write(&text_path, "Hello, world!").unwrap();
        assert!(file_is_text(&text_path).unwrap());
    }

    /// A missing file is reported as an error rather than a classification.
    #[test]
    fn test_file_is_text_does_not_exist() {
        assert!(file_is_text("/nonexistent/file").is_err());
    }

    // ---------------------------------------------------------------------
    // parse_shebang
    // ---------------------------------------------------------------------

    /// Direct interpreter paths are kept as-is; `/usr/bin/env` is stripped.
    #[test]
    fn test_parse_shebang_basic() {
        let components = parse_shebang(Cursor::new(b"#!/usr/bin/python")).unwrap();
        assert_eq!(components, shebang_tuple!["/usr/bin/python"]);

        let components = parse_shebang(Cursor::new(b"#!/usr/bin/env python")).unwrap();
        assert_eq!(components, shebang_tuple!["python"]);
    }

    /// `env -S` is stripped and the remaining arguments are preserved.
    #[test]
    fn test_parse_shebang_env_with_flags() {
        let components = parse_shebang(Cursor::new(b"#!/usr/bin/env -S python -u")).unwrap();
        assert_eq!(components, shebang_tuple!["python", "-u"]);
    }

    /// Whitespace after `#!` and repeated whitespace between parts is ignored.
    #[test]
    fn test_parse_shebang_spaces() {
        let components = parse_shebang(Cursor::new(b"#! /usr/bin/python")).unwrap();
        assert_eq!(components, shebang_tuple!["/usr/bin/python"]);

        let components = parse_shebang(Cursor::new(b"#!/usr/bin/foo  python")).unwrap();
        assert_eq!(components, shebang_tuple!["/usr/bin/foo", "python"]);
    }

    /// Input without a `#!` prefix (including empty input) yields nothing.
    #[test]
    fn test_parse_shebang_no_shebang() {
        let components = parse_shebang(Cursor::new(b"import sys")).unwrap();
        assert!(components.is_empty());

        let components = parse_shebang(Cursor::new(b"")).unwrap();
        assert!(components.is_empty());
    }

    /// Invalid UTF-8 after `#!` must not panic; an I/O error is acceptable,
    /// but a successful parse has to be empty.
    #[test]
    fn test_parse_shebang_invalid_utf8() {
        let result = parse_shebang(Cursor::new(&[0x23, 0x21, 0xf9, 0x93, 0x01, 0x42, 0xcd]));
        if let Ok(components) = result {
            assert!(components.is_empty());
        }
    }

    /// A shebang consisting only of whitespace has no components.
    #[test]
    fn test_shebang_incomplete() {
        let shebang_incomplete = parse_shebang(Cursor::new(b"#!   \n")).unwrap();
        assert!(shebang_incomplete.is_empty());
    }

    /// Table-driven check of the raw components returned by `parse_shebang`.
    ///
    /// Each row pairs an input with the exact components expected back; an
    /// empty expectation means the parser must return no components.
    #[test]
    fn test_comprehensive_shebang_parsing() {
        let test_cases = [
            ("", vec![]),
            ("#!/usr/bin/python", vec!["/usr/bin/python"]),
            ("#!/usr/bin/env python", vec!["python"]),
            ("#! /usr/bin/python", vec!["/usr/bin/python"]),
            ("#!/usr/bin/foo  python", vec!["/usr/bin/foo", "python"]),
            ("#!/usr/bin/env -S python -u", vec!["python", "-u"]),
            // No interpreter specified after env
            ("#!/usr/bin/env", vec![]),
            // No interpreter specified after -S
            ("#!/usr/bin/env -S", vec![]),
        ];

        for (input, expected) in test_cases {
            let components = parse_shebang(Cursor::new(input.as_bytes())).unwrap();

            if expected.is_empty() {
                assert!(
                    components.is_empty(),
                    "Got components: {:?} for input: '{}'",
                    components,
                    input
                );
            } else {
                let expected =
                    ShebangTuple::from_vec(expected.iter().map(|s| s.to_string()).collect());
                assert_eq!(components, expected, "input: '{}'", input);
            }
        }
    }

    /// Invalid or NUL-filled data must never panic the shebang parser.
    #[test]
    fn test_invalid_utf8_shebang() {
        let invalid_utf8_cases: [&[u8]; 3] = [
            &[0xf9, 0x93, 0x01, 0x42, 0xcd],
            &[0x23, 0x21, 0xf9, 0x93, 0x01, 0x42, 0xcd],
            &[0x23, 0x21, 0x00, 0x00, 0x00, 0x00],
        ];

        for input in invalid_utf8_cases {
            // I/O errors are acceptable for invalid data; a successful parse
            // must return empty components.
            if let Ok(components) = parse_shebang(Cursor::new(input)) {
                assert!(components.is_empty());
            }
        }
    }

    // ---------------------------------------------------------------------
    // tags_from_path (file system tests using tempfiles)
    // ---------------------------------------------------------------------

    /// A missing path produces an error whose message says it does not exist.
    #[test]
    fn test_tags_from_path_file_not_found() {
        let err = tags_from_path("/nonexistent/path").unwrap_err();
        assert!(err.to_string().contains("does not exist"));
    }

    #[test]
    fn test_tags_from_path_regular_file() {
        let file = NamedTempFile::new().unwrap();
        fs::write(&file, "print('hello')").unwrap();

        let tags = tags_from_path(file.path()).unwrap();
        assert!(tags.contains("file"));
        assert!(tags.contains("non-executable"));
        assert!(tags.contains("text"));
    }

    #[cfg(unix)]
    #[test]
    fn test_tags_from_path_executable_file() {
        let dir = tempdir().unwrap();
        let script_path = dir.path().join("script.py");
        fs::write(&script_path, "#!/usr/bin/env python3\nprint('hello')").unwrap();
        make_executable(&script_path);

        let tags = tags_from_path(&script_path).unwrap();
        assert!(tags.contains("file"));
        assert!(tags.contains("executable"));
        assert!(tags.contains("python"));
        assert!(tags.contains("text"));
    }

    /// A directory gets exactly one tag.
    #[test]
    fn test_tags_from_path_directory() {
        let dir = tempdir().unwrap();
        let tags = tags_from_path(dir.path()).unwrap();
        assert_eq!(tags, HashSet::from(["directory"]));
    }

    #[test]
    fn test_tags_from_path_binary_file() {
        let dir = tempdir().unwrap();
        let binary_path = dir.path().join("binary");
        fs::write(&binary_path, ELF_HEADER).unwrap();

        let tags = tags_from_path(&binary_path).unwrap();
        assert!(tags.contains("file"));
        assert!(tags.contains("binary"));
        assert!(tags.contains("non-executable"));
    }

    // Extensions that need a content check to decide between text and binary

    #[test]
    fn test_plist_binary_detection() {
        let dir = tempdir().unwrap();
        let plist_path = dir.path().join("test.plist");

        // Binary plist
        let binary_plist = [
            0x62, 0x70, 0x6c, 0x69, 0x73, 0x74, 0x30, 0x30, // "bplist00"
            0xd1, 0x01, 0x02, 0x5f, 0x10, 0x0f,
        ];
        fs::write(&plist_path, &binary_plist).unwrap();

        let tags = tags_from_path(&plist_path).unwrap();
        assert!(tags.contains("plist"));
        assert!(tags.contains("binary"));
    }

    #[test]
    fn test_plist_text_detection() {
        let dir = tempdir().unwrap();
        let plist_path = dir.path().join("test.plist");

        let text_plist = r#"<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
    <key>TestKey</key>
    <string>TestValue</string>
</dict>
</plist>"#;
        fs::write(&plist_path, text_plist).unwrap();

        let tags = tags_from_path(&plist_path).unwrap();
        assert!(tags.contains("plist"));
        assert!(tags.contains("text"));
    }

    #[test]
    fn test_empty_file() {
        let dir = tempdir().unwrap();
        let empty_path = dir.path().join("empty");
        fs::write(&empty_path, "").unwrap();

        let tags = tags_from_path(&empty_path).unwrap();
        assert!(tags.contains("file"));
        assert!(tags.contains("text")); // Empty files are considered text
        assert!(tags.contains("non-executable"));
    }

    // ---------------------------------------------------------------------
    // FileIdentifier builder pattern
    // ---------------------------------------------------------------------

    /// With no options set, the builder reports type, mode, encoding and language.
    #[test]
    fn test_file_identifier_default() {
        let dir = tempdir().unwrap();
        let py_file = dir.path().join("test.py");
        fs::write(&py_file, "print('hello')").unwrap();

        let identifier = FileIdentifier::new();
        let tags = identifier.identify(&py_file).unwrap();

        assert!(tags.contains("file"));
        assert!(tags.contains("python"));
        assert!(tags.contains("text"));
        assert!(tags.contains("non-executable"));
    }

    #[test]
    fn test_file_identifier_skip_content_analysis() {
        let dir = tempdir().unwrap();
        let unknown_file = dir.path().join("unknown_file");
        fs::write(&unknown_file, "some content").unwrap();

        let identifier = FileIdentifier::new().skip_content_analysis();
        let tags = identifier.identify(&unknown_file).unwrap();

        assert!(tags.contains("file"));
        assert!(tags.contains("non-executable"));
        // Should not have text or binary tags since content analysis was skipped
        assert!(!tags.contains("text"));
        assert!(!tags.contains("binary"));
    }

    #[cfg(unix)]
    #[test]
    fn test_file_identifier_skip_shebang_analysis() {
        let dir = tempdir().unwrap();
        let script_file = dir.path().join("script");
        fs::write(&script_file, "#!/usr/bin/env python3\nprint('hello')").unwrap();
        make_executable(&script_file);

        let identifier = FileIdentifier::new().skip_shebang_analysis();
        let tags = identifier.identify(&script_file).unwrap();

        assert!(tags.contains("file"));
        assert!(tags.contains("executable"));
        // Should not have python tags since shebang analysis was skipped
        // and filename doesn't match any patterns
        assert!(!tags.contains("python"));
    }

    /// Caller-supplied extension mappings contribute their tags to the result.
    #[test]
    fn test_file_identifier_custom_extensions() {
        let dir = tempdir().unwrap();
        let custom_file = dir.path().join("test.myext");
        fs::write(&custom_file, "custom content").unwrap();

        let mut custom_extensions = std::collections::HashMap::new();
        custom_extensions.insert("myext".to_string(), HashSet::from(["custom", "text"]));

        let identifier = FileIdentifier::new().with_custom_extensions(custom_extensions);
        let tags = identifier.identify(&custom_file).unwrap();

        assert!(tags.contains("file"));
        assert!(tags.contains("custom"));
        assert!(tags.contains("text"));
        assert!(tags.contains("non-executable"));
    }

    /// Builder options can be chained; both skips apply together.
    #[test]
    fn test_file_identifier_chaining() {
        let dir = tempdir().unwrap();
        let test_file = dir.path().join("test.unknown");
        fs::write(&test_file, "content").unwrap();

        let identifier = FileIdentifier::new()
            .skip_content_analysis()
            .skip_shebang_analysis();
        let tags = identifier.identify(&test_file).unwrap();

        assert!(tags.contains("file"));
        assert!(tags.contains("non-executable"));
        // Should have minimal tags due to skipping analyses
        assert!(!tags.contains("text"));
        assert!(!tags.contains("binary"));
    }
}
file_identify/lib.rs

file_identify/
lib.rs