file_identify/
lib.rs

1//! # file-identify
2//!
3//! A Rust library for identifying file types based on extensions, content, and shebangs.
4//!
5//! This library provides a comprehensive way to identify files by analyzing:
6//! - File extensions and special filenames
7//! - File content (binary vs text detection)
8//! - Shebang lines for executable scripts
9//! - File system metadata (permissions, file type)
10//!
11//! ## Quick Start
12//!
13//! ```rust
14//! use file_identify::{tags_from_path, tags_from_filename, FileIdentifier};
15//!
16//! // Simple filename identification
17//! let tags = tags_from_filename("script.py");
18//! assert!(tags.contains("python"));
19//! assert!(tags.contains("text"));
20//!
21//! // Full file identification from filesystem path
22//! # use std::fs;
23//! # use tempfile::tempdir;
24//! # let dir = tempdir().unwrap();
25//! # let file_path = dir.path().join("test.py");
26//! # fs::write(&file_path, "print('hello')").unwrap();
27//! let tags = tags_from_path(&file_path).unwrap();
28//! assert!(tags.contains("file"));
29//! assert!(tags.contains("python"));
30//!
31//! // Customized identification with builder pattern
32//! let identifier = FileIdentifier::new()
33//!     .skip_content_analysis()  // Skip text vs binary detection
34//!     .skip_shebang_analysis(); // Skip shebang parsing
35//!
36//! let tags = identifier.identify(&file_path).unwrap();
37//! assert!(tags.contains("file"));
38//! assert!(tags.contains("python"));
39//! ```
40//!
41//! ## Tag System
42//!
43//! Files are identified using a set of standardized tags:
44//!
45//! - **Type tags**: `file`, `directory`, `symlink`, `socket`
46//! - **Mode tags**: `executable`, `non-executable`
47//! - **Encoding tags**: `text`, `binary`
48//! - **Language/format tags**: `python`, `javascript`, `json`, `xml`, etc.
49//!
50//! ## Error Handling
51//!
52//! Functions that access the filesystem return [`Result`] types. The main error
53//! conditions are:
54//!
//! - [`IdentifyError::PathNotFound`] - when the metadata of the specified path
//!   cannot be read (typically because the path doesn't exist)
//! - [`IdentifyError::IoError`] - for I/O errors raised while reading file contents
57
58use std::collections::HashSet;
59use std::fmt;
60use std::fs;
61use std::io::{BufReader, Read};
62use std::path::Path;
63
64pub mod extensions;
65pub mod interpreters;
66pub mod tags;
67
/// An immutable, ordered sequence of shebang components.
///
/// This type is designed to be a direct equivalent to Python's `tuple[str, ...]`
/// for the `parse_shebang` functions. The components live in a boxed slice, so
/// the sequence cannot grow or shrink after construction; read access goes
/// through the accessor methods and trait implementations on this type.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ShebangTuple {
    // Kept private so the contents cannot be mutated from outside this module.
    components: Box<[String]>,
}
76
77impl ShebangTuple {
78    /// Create a new empty ShebangTuple (equivalent to Python's `()`)
79    pub fn new() -> Self {
80        Self {
81            components: Box::new([]),
82        }
83    }
84
85    /// Create a ShebangTuple from a vector of strings
86    pub fn from_vec(vec: Vec<String>) -> Self {
87        Self {
88            components: vec.into_boxed_slice(),
89        }
90    }
91
92    /// Get the length of the tuple (equivalent to Python's `len(tuple)`)
93    pub const fn len(&self) -> usize {
94        self.components.len()
95    }
96
97    /// Check if the tuple is empty (equivalent to Python's `not tuple`)
98    pub const fn is_empty(&self) -> bool {
99        self.components.is_empty()
100    }
101
102    /// Get an element by index (equivalent to Python's `tuple[index]`)
103    /// Returns None if index is out of bounds
104    pub fn get(&self, index: usize) -> Option<&str> {
105        self.components.get(index).map(|s| s.as_str())
106    }
107
108    /// Get the first element (equivalent to Python's `tuple[0]` when safe)
109    pub fn first(&self) -> Option<&str> {
110        self.get(0)
111    }
112
113    /// Convert to a Vec for internal use (consumes the tuple)
114    pub fn into_vec(self) -> Vec<String> {
115        self.components.into_vec()
116    }
117
118    /// Iterate over the components (equivalent to Python's `for item in tuple`)
119    pub fn iter(&self) -> std::slice::Iter<'_, String> {
120        self.components.iter()
121    }
122
123    /// Convert to a slice for easy pattern matching
124    pub fn as_slice(&self) -> &[String] {
125        &self.components
126    }
127}
128
129// Implement Index trait for tuple[index] syntax
130impl std::ops::Index<usize> for ShebangTuple {
131    type Output = str;
132
133    fn index(&self, index: usize) -> &Self::Output {
134        &self.components[index]
135    }
136}
137
138// Implement IntoIterator for for-loops
139impl<'a> IntoIterator for &'a ShebangTuple {
140    type Item = &'a String;
141    type IntoIter = std::slice::Iter<'a, String>;
142
143    fn into_iter(self) -> Self::IntoIter {
144        self.components.iter()
145    }
146}
147
148// Implement FromIterator for collecting
149impl FromIterator<String> for ShebangTuple {
150    fn from_iter<T: IntoIterator<Item = String>>(iter: T) -> Self {
151        Self::from_vec(iter.into_iter().collect())
152    }
153}
154
155// Display implementation (equivalent to Python's str(tuple))
156impl fmt::Display for ShebangTuple {
157    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
158        write!(f, "(")?;
159        for (i, component) in self.components.iter().enumerate() {
160            if i > 0 {
161                write!(f, ", ")?;
162            }
163            write!(f, "'{component}'")?;
164        }
165        if self.components.len() == 1 {
166            write!(f, ",")?; // Python tuple trailing comma for single element
167        }
168        write!(f, ")")
169    }
170}
171
172// Conversion from Vec<String>
173impl From<Vec<String>> for ShebangTuple {
174    fn from(vec: Vec<String>) -> Self {
175        Self::from_vec(vec)
176    }
177}
178
179// Conversion from empty ()
180impl Default for ShebangTuple {
181    fn default() -> Self {
182        Self::new()
183    }
184}
185
186use extensions::{get_extension_tags, get_extensions_need_binary_check_tags, get_name_tags};
187use interpreters::get_interpreter_tags;
188pub use tags::FileKind;
189use tags::*;
190
/// Pre-loaded file information for I/O-free identification.
///
/// Use this when you have file data from a source other than the host
/// filesystem (e.g., a mocked/virtual filesystem in tests). All fields are
/// borrowed or `Copy`, so building a `FileInfo` does not copy the file data.
///
/// # Examples
///
/// ```rust
/// use file_identify::{tags_from_info, FileInfo, FileKind};
///
/// let info = FileInfo {
///     filename: "script.py",
///     file_kind: FileKind::Regular,
///     is_executable: false,
///     content: Some(b"print('hello')"),
/// };
/// let tags = tags_from_info(&info);
/// assert!(tags.contains("python"));
/// assert!(tags.contains("text"));
/// ```
#[derive(Debug, Clone)]
pub struct FileInfo<'a> {
    /// The filename (just the name component, not a full path).
    pub filename: &'a str,
    /// The kind of filesystem entry. Anything other than [`FileKind::Regular`]
    /// is identified by its kind alone; the remaining fields are then ignored.
    pub file_kind: FileKind,
    /// Whether the file has executable permissions. Selects the
    /// `executable`/`non-executable` tag and gates the shebang fallback.
    pub is_executable: bool,
    /// Optional file content for shebang and encoding analysis.
    /// Pass `None` to skip content-based analysis entirely.
    pub content: Option<&'a [u8]>,
}
223
/// Configuration for file identification behavior.
///
/// Allows choosing which optional analysis steps to perform and registering
/// extra extension-to-tag mappings. Use `FileIdentifier::new()` to create a
/// builder, chain the configuration methods, then call `identify` or
/// `identify_from`.
#[derive(Debug, Clone)]
pub struct FileIdentifier {
    // When true, text-vs-binary detection is not performed.
    skip_content_analysis: bool,
    // When true, shebang lines are never consulted.
    skip_shebang_analysis: bool,
    // Extension -> tags overrides; looked up with the lowercased extension
    // before the built-in filename tables are consulted.
    custom_extensions: Option<std::collections::HashMap<String, TagSet>>,
}
234
235impl Default for FileIdentifier {
236    fn default() -> Self {
237        Self::new()
238    }
239}
240
241impl FileIdentifier {
242    /// Create a new file identifier with default settings.
243    ///
244    /// By default, all analysis steps are enabled:
245    /// - File system metadata analysis
246    /// - Filename and extension analysis  
247    /// - Shebang analysis for executable files
248    /// - Content analysis (text vs binary detection)
249    pub fn new() -> Self {
250        Self {
251            skip_content_analysis: false,
252            skip_shebang_analysis: false,
253            custom_extensions: None,
254        }
255    }
256
257    /// Skip content analysis (text vs binary detection).
258    ///
259    /// This avoids reading file contents, making identification faster
260    /// but potentially less accurate for files without clear extension/filename patterns.
261    pub fn skip_content_analysis(mut self) -> Self {
262        self.skip_content_analysis = true;
263        self
264    }
265
266    /// Skip shebang analysis for executable files.
267    ///
268    /// This avoids parsing shebang lines, making identification faster
269    /// but less accurate for executable scripts without recognized extensions.
270    pub fn skip_shebang_analysis(mut self) -> Self {
271        self.skip_shebang_analysis = true;
272        self
273    }
274
275    /// Add custom file extension mappings.
276    ///
277    /// These will be checked before the built-in extension mappings.
278    /// Useful for organization-specific or project-specific file types.
279    pub fn with_custom_extensions(
280        mut self,
281        extensions: std::collections::HashMap<String, TagSet>,
282    ) -> Self {
283        self.custom_extensions = Some(extensions);
284        self
285    }
286
287    /// Identify a file using the configured settings.
288    ///
289    /// This is equivalent to `tags_from_path` but with customizable behavior.
290    pub fn identify<P: AsRef<Path>>(&self, path: P) -> Result<TagSet> {
291        self.identify_with_config(path)
292    }
293
294    fn identify_with_config<P: AsRef<Path>>(&self, path: P) -> Result<TagSet> {
295        let path = path.as_ref();
296        let path_str = path.to_string_lossy();
297
298        // Get file metadata
299        let metadata = match fs::symlink_metadata(path) {
300            Ok(meta) => meta,
301            Err(_) => {
302                return Err(IdentifyError::PathNotFound {
303                    path: path_str.to_string(),
304                });
305            }
306        };
307
308        // Step 1: Check for non-regular file types (directory, symlink, socket)
309        if let Some(file_type_tags) = analyze_file_type(&metadata) {
310            return Ok(file_type_tags);
311        }
312
313        // Step 2: This is a regular file - start building tag set
314        let mut tags = TagSet::new();
315        tags.insert(FILE);
316
317        // Step 3: Analyze permissions (executable vs non-executable)
318        let is_executable = analyze_permissions(path, &metadata);
319        tags.insert(if is_executable {
320            EXECUTABLE
321        } else {
322            NON_EXECUTABLE
323        });
324
325        // Step 4: Analyze filename and potentially shebang (with custom config)
326        tags.extend(self.analyze_filename_and_shebang_configured(path, is_executable));
327
328        // Step 5: Analyze content encoding (text vs binary) if not skipped and not already determined
329        if !self.skip_content_analysis {
330            tags.extend(analyze_content_encoding(path, &tags)?);
331        }
332
333        Ok(tags)
334    }
335
336    /// Identify a file from pre-loaded information, using configured settings.
337    ///
338    /// This is the pure, I/O-free equivalent of [`FileIdentifier::identify`].
339    /// The caller provides all necessary file data via [`FileInfo`].
340    pub fn identify_from(&self, info: &FileInfo<'_>) -> TagSet {
341        match info.file_kind {
342            FileKind::Directory => return HashSet::from([DIRECTORY]),
343            FileKind::Symlink => return HashSet::from([SYMLINK]),
344            FileKind::Socket => return HashSet::from([SOCKET]),
345            FileKind::Regular => {}
346        }
347
348        let mut tags = TagSet::new();
349        tags.insert(FILE);
350        tags.insert(if info.is_executable {
351            EXECUTABLE
352        } else {
353            NON_EXECUTABLE
354        });
355
356        // Filename analysis with custom extensions support
357        let mut filename_matched = false;
358        if let Some(custom_exts) = &self.custom_extensions
359            && let Some(ext) = Path::new(info.filename)
360                .extension()
361                .and_then(|e| e.to_str())
362            && let Some(ext_tags) = custom_exts.get(&ext.to_lowercase())
363        {
364            tags.extend(ext_tags.iter().copied());
365            filename_matched = true;
366        }
367        if !filename_matched {
368            let filename_tags = tags_from_filename(info.filename);
369            if !filename_tags.is_empty() {
370                tags.extend(filename_tags);
371                filename_matched = true;
372            }
373        }
374
375        // Shebang fallback
376        if !filename_matched
377            && info.is_executable
378            && !self.skip_shebang_analysis
379            && let Some(content) = info.content
380            && let Ok(shebang) = parse_shebang(content)
381            && let Some(interp) = shebang.first()
382        {
383            tags.extend(tags_from_interpreter(interp));
384        }
385
386        // Content encoding
387        if !self.skip_content_analysis
388            && !tags.iter().any(|t| ENCODING_TAGS.contains(t))
389            && let Some(content) = info.content
390            && let Ok(text) = is_text(content)
391        {
392            tags.insert(if text { TEXT } else { BINARY });
393        }
394
395        tags
396    }
397
398    fn analyze_filename_and_shebang_configured<P: AsRef<Path>>(
399        &self,
400        path: P,
401        is_executable: bool,
402    ) -> TagSet {
403        let path = path.as_ref();
404        let mut tags = TagSet::new();
405
406        // Check filename-based tags first (including custom extensions)
407        if let Some(filename) = path.file_name().and_then(|n| n.to_str()) {
408            // Check custom extensions first if provided
409            if let Some(custom_exts) = &self.custom_extensions
410                && let Some(ext) = Path::new(filename).extension().and_then(|e| e.to_str())
411                && let Some(ext_tags) = custom_exts.get(&ext.to_lowercase())
412            {
413                tags.extend(ext_tags.iter().copied());
414                return tags; // Custom extension takes precedence
415            }
416
417            // Fall back to standard filename analysis
418            let filename_tags = tags_from_filename(filename);
419            if !filename_tags.is_empty() {
420                tags.extend(filename_tags);
421            } else if is_executable && !self.skip_shebang_analysis {
422                // Parse shebang for executable files without recognized extensions
423                if let Ok(shebang_components) = parse_shebang_from_file(path)
424                    && let Some(interp) = shebang_components.first()
425                {
426                    tags.extend(tags_from_interpreter(interp));
427                }
428            }
429        }
430
431        tags
432    }
433}
434
/// Result type for file identification operations.
///
/// This is a convenience type alias for operations that may fail with
/// file system or parsing errors; the error type is always [`IdentifyError`].
pub type Result<T> = std::result::Result<T, IdentifyError>;
440
/// Errors that can occur during file identification.
#[derive(thiserror::Error, Debug)]
pub enum IdentifyError {
    /// The specified path does not exist on the filesystem.
    ///
    /// The path-based entry points in this file return this variant for any
    /// failure to read the path's metadata.
    #[error("{path} does not exist.")]
    PathNotFound {
        /// The offending path, lossily converted to UTF-8 for display.
        path: String,
    },

    /// An I/O error occurred while accessing the file.
    ///
    /// Created automatically from [`std::io::Error`] by the `?` operator.
    #[error("IO error: {source}")]
    IoError {
        /// The underlying I/O error.
        #[from]
        source: std::io::Error,
    },

    /// The file path contains invalid UTF-8 sequences.
    #[error("Path contains invalid UTF-8: {path}")]
    InvalidPath {
        /// The offending path as text.
        path: String,
    },

    /// The file content is not valid UTF-8 when UTF-8 is expected.
    #[error("File contains invalid UTF-8 content")]
    InvalidUtf8,
}
463
464/// Analyze file system metadata to determine basic file type.
465///
466/// Returns tags for directory, symlink, socket, or file based on metadata.
467/// This is the first step in file identification.
468fn analyze_file_type(metadata: &std::fs::Metadata) -> Option<TagSet> {
469    let file_type = metadata.file_type();
470
471    if file_type.is_dir() {
472        return Some(HashSet::from([DIRECTORY]));
473    }
474    if file_type.is_symlink() {
475        return Some(HashSet::from([SYMLINK]));
476    }
477
478    // Check for socket (Unix-specific)
479    #[cfg(unix)]
480    {
481        use std::os::unix::fs::FileTypeExt;
482        if file_type.is_socket() {
483            return Some(HashSet::from([SOCKET]));
484        }
485    }
486
487    // Regular file - continue with further analysis
488    None
489}
490
/// Decide whether a file counts as executable.
///
/// On Unix this is true when any execute bit (owner, group or other) is set in
/// `metadata`; `path` is not consulted. On other systems `metadata` is ignored
/// and the file counts as executable when its extension is `exe`, `bat` or
/// `cmd`, compared case-insensitively.
fn analyze_permissions<P: AsRef<Path>>(path: P, metadata: &std::fs::Metadata) -> bool {
    #[cfg(unix)]
    {
        use std::os::unix::fs::PermissionsExt;
        let _ = path; // Only needed on non-Unix targets
        let mode = metadata.permissions().mode();
        (mode & 0o111) != 0
    }
    #[cfg(not(unix))]
    {
        let _ = metadata; // Only needed on Unix targets
        path.as_ref()
            .extension()
            .and_then(|ext| ext.to_str())
            .is_some_and(|ext| matches!(ext.to_lowercase().as_str(), "exe" | "bat" | "cmd"))
    }
}
513
514/// Analyze filename and potentially shebang for file type identification.
515///
516/// First tries filename-based identification. If that fails and the file is executable,
517/// falls back to shebang analysis.
518fn analyze_filename_and_shebang<P: AsRef<Path>>(path: P, is_executable: bool) -> TagSet {
519    let path = path.as_ref();
520    let mut tags = TagSet::new();
521
522    // Check filename-based tags first
523    if let Some(filename) = path.file_name().and_then(|n| n.to_str()) {
524        let filename_tags = tags_from_filename(filename);
525        if !filename_tags.is_empty() {
526            tags.extend(filename_tags);
527        } else if is_executable {
528            // Parse shebang for executable files without recognized extensions
529            if let Ok(shebang_components) = parse_shebang_from_file(path)
530                && let Some(interp) = shebang_components.first()
531            {
532                tags.extend(tags_from_interpreter(interp));
533            }
534        }
535    }
536
537    tags
538}
539
540/// Analyze file content to determine encoding (text vs binary).
541///
542/// Only performs analysis if encoding tags are not already present.
543fn analyze_content_encoding<P: AsRef<Path>>(path: P, existing_tags: &TagSet) -> Result<TagSet> {
544    let mut tags = TagSet::new();
545
546    // Check if we need to determine binary vs text
547    if !existing_tags.iter().any(|tag| ENCODING_TAGS.contains(tag)) {
548        if file_is_text(path)? {
549            tags.insert(TEXT);
550        } else {
551            tags.insert(BINARY);
552        }
553    }
554
555    Ok(tags)
556}
557
558/// Identify a file from its filesystem path.
559///
560/// This is the most comprehensive identification method, providing a superset
561/// of information from other methods. It analyzes:
562///
563/// 1. File type (regular file, directory, symlink, socket)
564/// 2. File permissions (executable vs non-executable)
565/// 3. Filename and extension patterns
566/// 4. File content (binary vs text detection)
567/// 5. Shebang lines for executable files
568///
569/// # Arguments
570///
571/// * `path` - Path to the file to identify
572///
573/// # Returns
574///
575/// A set of tags identifying the file type and characteristics.
576///
577/// # Errors
578///
579/// Returns [`IdentifyError::PathNotFound`] if the path doesn't exist, or
580/// [`IdentifyError::IoError`] for other I/O failures.
581///
582/// # Examples
583///
584/// ```rust
585/// use file_identify::tags_from_path;
586/// # use std::fs;
587/// # use tempfile::tempdir;
588///
589/// # let dir = tempdir().unwrap();
590/// # let file_path = dir.path().join("script.py");
591/// # fs::write(&file_path, "#!/usr/bin/env python3\nprint('hello')").unwrap();
592/// let tags = tags_from_path(&file_path).unwrap();
593/// assert!(tags.contains("file"));
594/// assert!(tags.contains("python"));
595/// assert!(tags.contains("text"));
596/// ```
597pub fn tags_from_path<P: AsRef<Path>>(path: P) -> Result<TagSet> {
598    let path = path.as_ref();
599    let path_str = path.to_string_lossy();
600
601    // Get file metadata
602    let metadata = match fs::symlink_metadata(path) {
603        Ok(meta) => meta,
604        Err(_) => {
605            return Err(IdentifyError::PathNotFound {
606                path: path_str.to_string(),
607            });
608        }
609    };
610
611    // Step 1: Check for non-regular file types (directory, symlink, socket)
612    if let Some(file_type_tags) = analyze_file_type(&metadata) {
613        return Ok(file_type_tags);
614    }
615
616    // Step 2: This is a regular file - start building tag set
617    let mut tags = TagSet::new();
618    tags.insert(FILE);
619
620    // Step 3: Analyze permissions (executable vs non-executable)
621    let is_executable = analyze_permissions(path, &metadata);
622    tags.insert(if is_executable {
623        EXECUTABLE
624    } else {
625        NON_EXECUTABLE
626    });
627
628    // Step 4: Analyze filename and potentially shebang
629    tags.extend(analyze_filename_and_shebang(path, is_executable));
630
631    // Step 5: Analyze content encoding (text vs binary) if not already determined
632    tags.extend(analyze_content_encoding(path, &tags)?);
633
634    Ok(tags)
635}
636
637/// Identify a file from pre-loaded information, without any I/O.
638///
639/// This is the pure equivalent of [`tags_from_path`]. The caller provides
640/// all necessary file data upfront via [`FileInfo`], making it usable with
641/// mocked or virtual filesystems.
642///
643/// # Arguments
644///
645/// * `info` - Pre-loaded file information
646///
647/// # Returns
648///
649/// A set of tags identifying the file type and characteristics.
650///
651/// # Examples
652///
653/// ```rust
654/// use file_identify::{tags_from_info, FileInfo, FileKind};
655///
656/// let info = FileInfo {
657///     filename: "script.py",
658///     file_kind: FileKind::Regular,
659///     is_executable: false,
660///     content: Some(b"print('hello')"),
661/// };
662/// let tags = tags_from_info(&info);
663/// assert!(tags.contains("file"));
664/// assert!(tags.contains("python"));
665/// assert!(tags.contains("text"));
666///
667/// // Directories return just the type tag
668/// let info = FileInfo {
669///     filename: "src",
670///     file_kind: FileKind::Directory,
671///     is_executable: false,
672///     content: None,
673/// };
674/// let tags = tags_from_info(&info);
675/// assert!(tags.contains("directory"));
676/// ```
677pub fn tags_from_info(info: &FileInfo<'_>) -> TagSet {
678    match info.file_kind {
679        FileKind::Directory => return HashSet::from([DIRECTORY]),
680        FileKind::Symlink => return HashSet::from([SYMLINK]),
681        FileKind::Socket => return HashSet::from([SOCKET]),
682        FileKind::Regular => {}
683    }
684
685    let mut tags = TagSet::new();
686    tags.insert(FILE);
687    tags.insert(if info.is_executable {
688        EXECUTABLE
689    } else {
690        NON_EXECUTABLE
691    });
692
693    // Filename/extension matching
694    let filename_tags = tags_from_filename(info.filename);
695    if !filename_tags.is_empty() {
696        tags.extend(filename_tags);
697    } else if info.is_executable {
698        // Shebang fallback for executables without recognized extension
699        if let Some(content) = info.content
700            && let Ok(shebang) = parse_shebang(content)
701            && let Some(interp) = shebang.first()
702        {
703            tags.extend(tags_from_interpreter(interp));
704        }
705    }
706
707    // Content encoding (text vs binary) if not already determined
708    if !tags.iter().any(|tag| ENCODING_TAGS.contains(tag))
709        && let Some(content) = info.content
710        && let Ok(text) = is_text(content)
711    {
712        tags.insert(if text { TEXT } else { BINARY });
713    }
714
715    tags
716}
717
718/// Identify a file based only on its filename.
719///
720/// This method analyzes the filename and extension to determine file type,
721/// without accessing the filesystem. It's useful when you only have the
722/// filename or want to avoid I/O operations.
723///
724/// # Arguments
725///
726/// * `filename` - The filename to analyze (can include path)
727///
728/// # Returns
729///
730/// A set of tags identifying the file type. Returns an empty set if
731/// the filename is not recognized.
732///
733/// # Examples
734///
735/// ```rust
736/// use file_identify::tags_from_filename;
737///
738/// let tags = tags_from_filename("script.py");
739/// assert!(tags.contains("python"));
740/// assert!(tags.contains("text"));
741///
742/// let tags = tags_from_filename("Dockerfile");
743/// assert!(tags.contains("dockerfile"));
744///
745/// let tags = tags_from_filename("unknown.xyz");
746/// assert!(tags.is_empty());
747/// ```
748pub fn tags_from_filename(filename: &str) -> TagSet {
749    let mut tags = TagSet::new();
750
751    // Check exact filename matches first
752    for part in std::iter::once(filename).chain(filename.split('.')) {
753        let name_tags = get_name_tags(part);
754        if !name_tags.is_empty() {
755            tags.extend(name_tags);
756            break;
757        }
758    }
759
760    // Check file extension
761    if let Some(ext) = Path::new(filename).extension().and_then(|e| e.to_str()) {
762        let ext_lower = ext.to_lowercase();
763
764        let ext_tags = get_extension_tags(&ext_lower);
765        if !ext_tags.is_empty() {
766            tags.extend(ext_tags);
767        } else {
768            let binary_check_tags = get_extensions_need_binary_check_tags(&ext_lower);
769            if !binary_check_tags.is_empty() {
770                tags.extend(binary_check_tags);
771            }
772        }
773    }
774
775    tags
776}
777
778/// Identify tags based on a shebang interpreter.
779///
780/// This function analyzes interpreter names from shebang lines to determine
781/// the script type. It handles version-specific interpreters by progressively
782/// removing version suffixes.
783///
784/// # Arguments
785///
786/// * `interpreter` - The interpreter name or path from a shebang
787///
788/// # Returns
789///
790/// A set of tags for the interpreter type. Returns an empty set if
791/// the interpreter is not recognized.
792///
793/// # Examples
794///
795/// ```rust
796/// use file_identify::tags_from_interpreter;
797///
798/// let tags = tags_from_interpreter("python3.11");
799/// assert!(tags.contains("python"));
800/// assert!(tags.contains("python3"));
801///
802/// let tags = tags_from_interpreter("/usr/bin/bash");
803/// assert!(tags.contains("shell"));
804/// assert!(tags.contains("bash"));
805///
806/// let tags = tags_from_interpreter("unknown-interpreter");
807/// assert!(tags.is_empty());
808/// ```
809pub fn tags_from_interpreter(interpreter: &str) -> TagSet {
810    // Extract the interpreter name from the path
811    let interpreter_name = interpreter.split('/').next_back().unwrap_or(interpreter);
812
813    // Try progressively shorter versions (e.g., "python3.5.2" -> "python3.5" -> "python3")
814    let mut current = interpreter_name;
815    while !current.is_empty() {
816        let tags = get_interpreter_tags(current);
817        if !tags.is_empty() {
818            return tags;
819        }
820
821        // Try removing the last dot-separated part
822        match current.rfind('.') {
823            Some(pos) => current = &current[..pos],
824            None => break,
825        }
826    }
827
828    TagSet::new()
829}
830
831/// Determine if a file contains text or binary data.
832///
833/// This function reads the first 1KB of a file to determine if it contains
834/// text or binary data, using a similar algorithm to the `file` command.
835///
836/// # Arguments
837///
838/// * `path` - Path to the file to analyze
839///
840/// # Returns
841///
842/// `true` if the file appears to contain text, `false` if binary.
843///
844/// # Errors
845///
846/// Returns an error if the file cannot be opened or read.
847///
848/// # Examples
849///
850/// ```rust
851/// use file_identify::file_is_text;
852/// # use std::fs;
853/// # use tempfile::tempdir;
854///
855/// # let dir = tempdir().unwrap();
856/// # let text_path = dir.path().join("text.txt");
857/// # fs::write(&text_path, "Hello, world!").unwrap();
858/// assert!(file_is_text(&text_path).unwrap());
859///
860/// # let binary_path = dir.path().join("binary.bin");
861/// # fs::write(&binary_path, &[0x7f, 0x45, 0x4c, 0x46]).unwrap();
862/// assert!(!file_is_text(&binary_path).unwrap());
863/// ```
864pub fn file_is_text<P: AsRef<Path>>(path: P) -> Result<bool> {
865    let file = fs::File::open(path)?;
866    is_text(file)
867}
868
/// Determine if data from a reader contains text or binary content.
///
/// This function reads up to 1KB from the provided reader and analyzes
/// the bytes to determine if they represent text or binary data. The reader is
/// polled until 1KB has been collected or it reports end of input, so readers
/// that deliver data in small pieces are sampled just like in-memory buffers.
///
/// # Arguments
///
/// * `reader` - A reader providing the data to analyze
///
/// # Returns
///
/// `true` if the data appears to be text, `false` if binary. Empty input
/// counts as text.
///
/// # Errors
///
/// Returns an error if reading from `reader` fails.
///
/// # Examples
///
/// ```rust
/// use file_identify::is_text;
/// use std::io::Cursor;
///
/// let text_data = Cursor::new(b"Hello, world!");
/// assert!(is_text(text_data).unwrap());
///
/// let binary_data = Cursor::new(&[0x7f, 0x45, 0x4c, 0x46, 0x00]);
/// assert!(!is_text(binary_data).unwrap());
/// ```
pub fn is_text<R: Read>(reader: R) -> Result<bool> {
    // Only this many leading bytes are inspected.
    const SAMPLE_LEN: usize = 1024;

    // Compile-time lookup table: true for bytes that are valid in text files.
    // Covers ASCII printable (0x20..0x7F), extended ASCII (0x80..=0xFF),
    // and common control chars (BEL, BS, TAB, LF, VT, FF, CR, ESC).
    const TEXT_BYTES: [bool; 256] = {
        let mut table = [false; 256];
        let mut i = 0x20;
        while i < 0x7F {
            table[i] = true;
            i += 1;
        }
        let mut i = 0x80;
        while i < 256 {
            table[i] = true;
            i += 1;
        }
        table[7] = true;
        table[8] = true;
        table[9] = true;
        table[10] = true;
        table[11] = true;
        table[12] = true;
        table[13] = true;
        table[27] = true;
        table
    };

    // A single `read` call may return fewer bytes than are available, which would
    // leave part of the sample unchecked. `take` + `read_to_end` keeps reading
    // until the limit or end of input is reached.
    let mut sample = Vec::with_capacity(SAMPLE_LEN);
    reader.take(SAMPLE_LEN as u64).read_to_end(&mut sample)?;

    Ok(sample.iter().all(|&b| TEXT_BYTES[b as usize]))
}
926
927/// Parse shebang line from an executable file and return raw shebang components.
928///
929/// This function reads the first line of an executable file to extract
930/// shebang information and return the raw command components, similar to
931/// Python's identify.parse_shebang_from_file().
932///
933/// # Arguments
934///
935/// * `path` - Path to the executable file
936///
937/// # Returns
938///
939/// A vector of raw shebang components. Returns an empty vector if:
940/// - The file is not executable
941/// - No shebang is found
942///
943/// # Errors
944///
945/// Returns an error if the file cannot be accessed or read.
946///
947/// # Examples
948///
949/// ```rust
950/// use file_identify::parse_shebang_from_file;
951/// # use std::fs;
952/// # use std::os::unix::fs::PermissionsExt;
953/// # use tempfile::tempdir;
954///
955/// # let dir = tempdir().unwrap();
956/// # let script_path = dir.path().join("script");
957/// # fs::write(&script_path, "#!/usr/bin/env python3\nprint('hello')").unwrap();
958/// # let mut perms = fs::metadata(&script_path).unwrap().permissions();
959/// # perms.set_mode(0o755);
960/// # fs::set_permissions(&script_path, perms).unwrap();
961/// let shebang = parse_shebang_from_file(&script_path).unwrap();
962/// assert_eq!(shebang.get(0).unwrap(), "python3");
963/// ```
964pub fn parse_shebang_from_file<P: AsRef<Path>>(path: P) -> Result<ShebangTuple> {
965    let path = path.as_ref();
966
967    // Only check executable files
968    let metadata = fs::metadata(path)?;
969    #[cfg(unix)]
970    {
971        use std::os::unix::fs::PermissionsExt;
972        if metadata.permissions().mode() & 0o111 == 0 {
973            return Ok(ShebangTuple::new());
974        }
975    }
976
977    let file = fs::File::open(path)?;
978    parse_shebang(file)
979}
980
981/// Parse a shebang line from a reader and return raw shebang components.
982///
983/// This function reads the first line from the provided reader and parses
984/// it as a shebang line to extract raw command components, similar to
985/// Python's identify.parse_shebang().
986///
987/// # Arguments
988///
989/// * `reader` - A reader providing the file content
990///
991/// # Returns
992///
993/// A vector of raw shebang components. Returns an empty vector if no valid shebang is found.
994///
995/// # Examples
996///
997/// ```rust
998/// use file_identify::parse_shebang;
999/// use std::io::Cursor;
1000///
1001/// let shebang = Cursor::new(b"#!/usr/bin/env python3\nprint('hello')");
1002/// let components = parse_shebang(shebang).unwrap();
1003/// assert_eq!(components.get(0).unwrap(), "python3");
1004///
1005/// let no_shebang = Cursor::new(b"print('hello')");
1006/// let components = parse_shebang(no_shebang).unwrap();
1007/// assert!(components.is_empty());
1008/// ```
1009pub fn parse_shebang<R: Read>(reader: R) -> Result<ShebangTuple> {
1010    use std::io::BufRead;
1011
1012    let mut buf_reader = BufReader::new(reader);
1013
1014    // Read first line efficiently using read_until
1015    let mut first_line_bytes = Vec::new();
1016    match buf_reader.read_until(b'\n', &mut first_line_bytes) {
1017        Ok(0) => return Ok(ShebangTuple::new()), // EOF with no data
1018        Ok(_) => {
1019            // Remove trailing newline if present
1020            if first_line_bytes.ends_with(b"\n") {
1021                first_line_bytes.pop();
1022            }
1023            // Also handle \r\n line endings
1024            if first_line_bytes.ends_with(b"\r") {
1025                first_line_bytes.pop();
1026            }
1027        }
1028        Err(_) => return Ok(ShebangTuple::new()), // Read error
1029    }
1030
1031    // Check if starts with shebang
1032    if first_line_bytes.len() < 2 || &first_line_bytes[0..2] != b"#!" {
1033        return Ok(ShebangTuple::new());
1034    }
1035
1036    // Limit line length to prevent memory issues
1037    if first_line_bytes.len() > 1024 {
1038        first_line_bytes.truncate(1024);
1039    }
1040
1041    // Try to decode as UTF-8, return empty if invalid (like Python does)
1042    let first_line = match String::from_utf8(first_line_bytes) {
1043        Ok(line) => line,
1044        Err(_) => return Ok(ShebangTuple::new()),
1045    };
1046
1047    // Remove the #! and clean up the line
1048    let shebang_line = first_line[2..].trim();
1049
1050    // Check for only printable ASCII (like Python does)
1051    for c in shebang_line.chars() {
1052        if !c.is_ascii() || (c.is_control() && c != '\t') {
1053            return Ok(ShebangTuple::new());
1054        }
1055    }
1056
1057    // Parse the shebang command using simple split (like Python's shlex fallback)
1058    let parts: smallvec::SmallVec<[&str; 4]> = shebang_line.split_whitespace().collect();
1059    if parts.is_empty() {
1060        return Ok(ShebangTuple::new());
1061    }
1062
1063    let cmd: smallvec::SmallVec<[&str; 2]> = if parts[0] == "/usr/bin/env" {
1064        if parts.len() == 1 {
1065            // Just "#!/usr/bin/env" with no interpreter
1066            smallvec::SmallVec::new()
1067        } else if parts.len() >= 2 && parts[1] == "-S" {
1068            if parts.len() > 2 {
1069                parts[2..].iter().copied().collect()
1070            } else {
1071                // Just "#!/usr/bin/env -S" with no interpreter
1072                smallvec::SmallVec::new()
1073            }
1074        } else {
1075            parts[1..].iter().copied().collect()
1076        }
1077    } else {
1078        parts.iter().copied().collect()
1079    };
1080
1081    if cmd.is_empty() {
1082        return Ok(ShebangTuple::new());
1083    }
1084
1085    // Return the raw command components as strings
1086    Ok(ShebangTuple::from_vec(
1087        cmd.iter().map(|s| s.to_string()).collect(),
1088    ))
1089}
1090
// Unit tests for filename/interpreter tagging, text detection, shebang
// parsing, filesystem-based identification and the I/O-free `FileInfo` API.
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::io::Cursor;
    use std::os::unix::fs::PermissionsExt;
    use tempfile::{NamedTempFile, tempdir};

    // Helper macro to create ShebangTuple from string slices for testing
    macro_rules! shebang_tuple {
        () => {
            ShebangTuple::new()
        };
        ($($item:expr),+) => {
            ShebangTuple::from_vec(vec![$($item.to_string()),+])
        };
    }

    // Test tag system completeness
    #[test]
    fn test_all_basic_tags_exist() {
        assert!(TYPE_TAGS.contains("file"));
        assert!(TYPE_TAGS.contains("directory"));
        assert!(MODE_TAGS.contains("executable"));
        assert!(ENCODING_TAGS.contains("text"));
    }

    #[test]
    fn test_tag_groups_are_disjoint() {
        assert!(TYPE_TAGS.is_disjoint(&MODE_TAGS));
        assert!(TYPE_TAGS.is_disjoint(&ENCODING_TAGS));
        assert!(MODE_TAGS.is_disjoint(&ENCODING_TAGS));
    }

    // Test tags_from_filename with various scenarios
    #[test]
    fn test_tags_from_filename_basic() {
        let tags = tags_from_filename("file.py");
        assert!(tags.contains("text"));
        assert!(tags.contains("python"));
    }

    #[test]
    fn test_tags_from_filename_special_names() {
        let tags = tags_from_filename("Dockerfile");
        assert!(tags.contains("dockerfile"));
        assert!(tags.contains("text"));

        let tags = tags_from_filename("Makefile");
        assert!(tags.contains("makefile"));
        assert!(tags.contains("text"));

        let tags = tags_from_filename("Cargo.toml");
        assert!(tags.contains("toml"));
        assert!(tags.contains("cargo"));
    }

    #[test]
    fn test_tags_from_filename_case_insensitive_extension() {
        let tags = tags_from_filename("image.JPG");
        assert!(tags.contains("binary"));
        assert!(tags.contains("image"));
        assert!(tags.contains("jpeg"));
    }

    #[test]
    fn test_tags_from_filename_precedence() {
        // setup.cfg should match by name, not .cfg extension
        let tags = tags_from_filename("setup.cfg");
        assert!(tags.contains("ini"));
    }

    #[test]
    fn test_tags_from_filename_complex_names() {
        let tags = tags_from_filename("Dockerfile.xenial");
        assert!(tags.contains("dockerfile"));

        let tags = tags_from_filename("README.md");
        assert!(tags.contains("markdown"));
        assert!(tags.contains("plain-text"));
    }

    #[test]
    fn test_tags_from_filename_unrecognized() {
        let tags = tags_from_filename("unknown.xyz");
        assert!(tags.is_empty());

        let tags = tags_from_filename("noextension");
        assert!(tags.is_empty());
    }

    // Test tags_from_interpreter
    #[test]
    fn test_tags_from_interpreter_basic() {
        let tags = tags_from_interpreter("python3");
        assert!(tags.contains("python"));
        assert!(tags.contains("python3"));
    }

    #[test]
    fn test_tags_from_interpreter_versioned() {
        let tags = tags_from_interpreter("python3.11.2");
        assert!(tags.contains("python"));
        assert!(tags.contains("python3"));

        let tags = tags_from_interpreter("php8.1");
        assert!(tags.contains("php"));
        assert!(tags.contains("php8"));
    }

    #[test]
    fn test_tags_from_interpreter_with_path() {
        let tags = tags_from_interpreter("/usr/bin/python3");
        assert!(tags.contains("python"));
        assert!(tags.contains("python3"));
    }

    #[test]
    fn test_tags_from_interpreter_unrecognized() {
        let tags = tags_from_interpreter("unknown-interpreter");
        assert!(tags.is_empty());

        let tags = tags_from_interpreter("");
        assert!(tags.is_empty());
    }

    // Test is_text function
    #[test]
    fn test_is_text_basic() {
        assert!(is_text(Cursor::new(b"hello world")).unwrap());
        assert!(is_text(Cursor::new(b"")).unwrap());
        assert!(!is_text(Cursor::new(b"hello\x00world")).unwrap());
    }

    #[test]
    fn test_is_text_unicode() {
        assert!(is_text(Cursor::new("éóñəå  ⊂(◉‿◉)つ(ノ≥∇≤)ノ".as_bytes())).unwrap());
        assert!(is_text(Cursor::new(r"¯\_(ツ)_/¯".as_bytes())).unwrap());
        assert!(is_text(Cursor::new("♪┏(・o･)┛♪┗ ( ･o･) ┓♪".as_bytes())).unwrap());
    }

    #[test]
    fn test_is_text_binary_data() {
        // ELF header
        assert!(!is_text(Cursor::new(&[0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01])).unwrap());
        // Random binary data
        assert!(!is_text(Cursor::new(&[0x43, 0x92, 0xd9, 0x0f, 0xaf, 0x32, 0x2c])).unwrap());
    }

    // Test parse_shebang function
    #[test]
    fn test_parse_shebang_basic() {
        let components = parse_shebang(Cursor::new(b"#!/usr/bin/python")).unwrap();
        assert_eq!(components, shebang_tuple!["/usr/bin/python"]);

        let components = parse_shebang(Cursor::new(b"#!/usr/bin/env python")).unwrap();
        assert_eq!(components, shebang_tuple!["python"]);
    }

    #[test]
    fn test_parse_shebang_env_with_flags() {
        let components = parse_shebang(Cursor::new(b"#!/usr/bin/env -S python -u")).unwrap();
        assert_eq!(components, shebang_tuple!["python", "-u"]);
    }

    #[test]
    fn test_parse_shebang_spaces() {
        let components = parse_shebang(Cursor::new(b"#! /usr/bin/python")).unwrap();
        assert_eq!(components, shebang_tuple!["/usr/bin/python"]);

        let components = parse_shebang(Cursor::new(b"#!/usr/bin/foo  python")).unwrap();
        assert_eq!(components, shebang_tuple!["/usr/bin/foo", "python"]);
    }

    #[test]
    fn test_parse_shebang_no_shebang() {
        let components = parse_shebang(Cursor::new(b"import sys")).unwrap();
        assert!(components.is_empty());

        let components = parse_shebang(Cursor::new(b"")).unwrap();
        assert!(components.is_empty());
    }

    #[test]
    fn test_parse_shebang_invalid_utf8() {
        // Reading from an in-memory cursor cannot fail, so the call must
        // succeed; invalid UTF-8 after "#!" has to yield no components.
        let components =
            parse_shebang(Cursor::new(&[0x23, 0x21, 0xf9, 0x93, 0x01, 0x42, 0xcd])).unwrap();
        assert!(components.is_empty());
    }

    // File system tests using tempfiles
    #[test]
    fn test_tags_from_path_file_not_found() {
        let result = tags_from_path("/nonexistent/path");
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("does not exist"));
    }

    #[test]
    fn test_tags_from_path_regular_file() {
        let file = NamedTempFile::new().unwrap();
        fs::write(&file, "print('hello')").unwrap();

        let tags = tags_from_path(file.path()).unwrap();
        assert!(tags.contains("file"));
        assert!(tags.contains("non-executable"));
        assert!(tags.contains("text"));
    }

    #[test]
    fn test_tags_from_path_executable_file() {
        let dir = tempdir().unwrap();
        let script_path = dir.path().join("script.py");
        fs::write(&script_path, "#!/usr/bin/env python3\nprint('hello')").unwrap();

        let mut perms = fs::metadata(&script_path).unwrap().permissions();
        perms.set_mode(0o755);
        fs::set_permissions(&script_path, perms).unwrap();

        let tags = tags_from_path(&script_path).unwrap();
        assert!(tags.contains("file"));
        assert!(tags.contains("executable"));
        assert!(tags.contains("python"));
        assert!(tags.contains("text"));
    }

    #[test]
    fn test_tags_from_path_directory() {
        let dir = tempdir().unwrap();
        let tags = tags_from_path(dir.path()).unwrap();
        assert_eq!(tags, HashSet::from(["directory"]));
    }

    #[test]
    fn test_tags_from_path_binary_file() {
        let dir = tempdir().unwrap();
        let binary_path = dir.path().join("binary");
        fs::write(&binary_path, &[0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01]).unwrap();

        let tags = tags_from_path(&binary_path).unwrap();
        assert!(tags.contains("file"));
        assert!(tags.contains("binary"));
        assert!(tags.contains("non-executable"));
    }

    #[test]
    fn test_file_is_text_simple() {
        let dir = tempdir().unwrap();
        let text_path = dir.path().join("text.txt");
        fs::write(&text_path, "Hello, world!").unwrap();
        assert!(file_is_text(&text_path).unwrap());
    }

    #[test]
    fn test_file_is_text_does_not_exist() {
        let result = file_is_text("/nonexistent/file");
        assert!(result.is_err());
    }

    // Test extensions that need binary check
    #[test]
    fn test_plist_binary_detection() {
        let dir = tempdir().unwrap();
        let plist_path = dir.path().join("test.plist");

        // Binary plist
        let binary_plist = [
            0x62, 0x70, 0x6c, 0x69, 0x73, 0x74, 0x30, 0x30, // "bplist00"
            0xd1, 0x01, 0x02, 0x5f, 0x10, 0x0f,
        ];
        fs::write(&plist_path, &binary_plist).unwrap();

        let tags = tags_from_path(&plist_path).unwrap();
        assert!(tags.contains("plist"));
        assert!(tags.contains("binary"));
    }

    #[test]
    fn test_plist_text_detection() {
        let dir = tempdir().unwrap();
        let plist_path = dir.path().join("test.plist");

        let text_plist = r#"<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
    <key>TestKey</key>
    <string>TestValue</string>
</dict>
</plist>"#;
        fs::write(&plist_path, text_plist).unwrap();

        let tags = tags_from_path(&plist_path).unwrap();
        assert!(tags.contains("plist"));
        assert!(tags.contains("text"));
    }

    // Additional edge case tests
    #[test]
    fn test_empty_file() {
        let dir = tempdir().unwrap();
        let empty_path = dir.path().join("empty");
        fs::write(&empty_path, "").unwrap();

        let tags = tags_from_path(&empty_path).unwrap();
        assert!(tags.contains("file"));
        assert!(tags.contains("text")); // Empty files are considered text
        assert!(tags.contains("non-executable"));
    }

    #[test]
    fn test_shebang_incomplete() {
        let shebang_incomplete = parse_shebang(Cursor::new(b"#!   \n")).unwrap();
        assert!(shebang_incomplete.is_empty());
    }

    #[test]
    fn test_multiple_extensions() {
        let tags = tags_from_filename("backup.tar.gz");
        assert!(tags.contains("binary"));
        assert!(tags.contains("gzip"));
    }

    // Test FileIdentifier builder pattern
    #[test]
    fn test_file_identifier_default() {
        let dir = tempdir().unwrap();
        let py_file = dir.path().join("test.py");
        fs::write(&py_file, "print('hello')").unwrap();

        let identifier = FileIdentifier::new();
        let tags = identifier.identify(&py_file).unwrap();

        assert!(tags.contains("file"));
        assert!(tags.contains("python"));
        assert!(tags.contains("text"));
        assert!(tags.contains("non-executable"));
    }

    #[test]
    fn test_file_identifier_skip_content_analysis() {
        let dir = tempdir().unwrap();
        let unknown_file = dir.path().join("unknown_file");
        fs::write(&unknown_file, "some content").unwrap();

        let identifier = FileIdentifier::new().skip_content_analysis();
        let tags = identifier.identify(&unknown_file).unwrap();

        assert!(tags.contains("file"));
        assert!(tags.contains("non-executable"));
        // Should not have text or binary tags since content analysis was skipped
        assert!(!tags.contains("text"));
        assert!(!tags.contains("binary"));
    }

    #[test]
    fn test_file_identifier_skip_shebang_analysis() {
        let dir = tempdir().unwrap();
        let script_file = dir.path().join("script");
        fs::write(&script_file, "#!/usr/bin/env python3\nprint('hello')").unwrap();

        let mut perms = fs::metadata(&script_file).unwrap().permissions();
        perms.set_mode(0o755);
        fs::set_permissions(&script_file, perms).unwrap();

        let identifier = FileIdentifier::new().skip_shebang_analysis();
        let tags = identifier.identify(&script_file).unwrap();

        assert!(tags.contains("file"));
        assert!(tags.contains("executable"));
        // Should not have python tags since shebang analysis was skipped
        // and filename doesn't match any patterns
        assert!(!tags.contains("python"));
    }

    #[test]
    fn test_file_identifier_custom_extensions() {
        let dir = tempdir().unwrap();
        let custom_file = dir.path().join("test.myext");
        fs::write(&custom_file, "custom content").unwrap();

        let mut custom_extensions = std::collections::HashMap::new();
        custom_extensions.insert("myext".to_string(), HashSet::from(["custom", "text"]));

        let identifier = FileIdentifier::new().with_custom_extensions(custom_extensions);
        let tags = identifier.identify(&custom_file).unwrap();

        assert!(tags.contains("file"));
        assert!(tags.contains("custom"));
        assert!(tags.contains("text"));
        assert!(tags.contains("non-executable"));
    }

    #[test]
    fn test_file_identifier_chaining() {
        let dir = tempdir().unwrap();
        let test_file = dir.path().join("test.unknown");
        fs::write(&test_file, "content").unwrap();

        let identifier = FileIdentifier::new()
            .skip_content_analysis()
            .skip_shebang_analysis();
        let tags = identifier.identify(&test_file).unwrap();

        assert!(tags.contains("file"));
        assert!(tags.contains("non-executable"));
        // Should have minimal tags due to skipping analyses
        assert!(!tags.contains("text"));
        assert!(!tags.contains("binary"));
    }

    // Additional comprehensive tests from Python version
    #[test]
    fn test_comprehensive_shebang_parsing() {
        // Each case pairs an input with the raw components `parse_shebang`
        // must return for it; an empty slice means "no components".
        let test_cases: &[(&str, &[&str])] = &[
            ("", &[]),
            ("#!/usr/bin/python", &["/usr/bin/python"]),
            ("#!/usr/bin/env python", &["python"]),
            ("#! /usr/bin/python", &["/usr/bin/python"]),
            ("#!/usr/bin/foo  python", &["/usr/bin/foo", "python"]),
            ("#!/usr/bin/env -S python -u", &["python", "-u"]),
            ("#!/usr/bin/env", &[]),    // no interpreter specified
            ("#!/usr/bin/env -S", &[]), // no interpreter after -S
        ];

        for &(input, expected) in test_cases {
            let components = parse_shebang(Cursor::new(input.as_bytes())).unwrap();

            if expected.is_empty() {
                assert!(
                    components.is_empty(),
                    "Got components: {:?} for input: '{}'",
                    components,
                    input
                );
            } else {
                let expected =
                    ShebangTuple::from_vec(expected.iter().map(|part| part.to_string()).collect());
                assert_eq!(
                    components, expected,
                    "Unexpected components for input: '{}'",
                    input
                );
            }
        }
    }

    #[test]
    fn test_invalid_utf8_shebang() {
        // Malformed inputs must neither panic nor produce components
        let malformed_cases = vec![
            // Invalid UTF-8 without a shebang prefix
            &[0xf9, 0x93, 0x01, 0x42, 0xcd][..],
            // "#!" followed by invalid UTF-8
            &[0x23, 0x21, 0xf9, 0x93, 0x01, 0x42, 0xcd][..],
            // "#!" followed by NUL bytes (valid UTF-8, but control characters)
            &[0x23, 0x21, 0x00, 0x00, 0x00, 0x00][..],
        ];

        for input in malformed_cases {
            // In-memory reads cannot fail, so the call itself must succeed
            let components = parse_shebang(Cursor::new(input)).unwrap();
            assert!(components.is_empty());
        }
    }

    // Tests for tags_from_info (I/O-free API)

    #[test]
    fn test_tags_from_info_regular_file() {
        let info = FileInfo {
            filename: "script.py",
            file_kind: FileKind::Regular,
            is_executable: false,
            content: Some(b"print('hello')"),
        };
        let tags = tags_from_info(&info);
        assert!(tags.contains("file"));
        assert!(tags.contains("non-executable"));
        assert!(tags.contains("python"));
        assert!(tags.contains("text"));
    }

    #[test]
    fn test_tags_from_info_directory() {
        let info = FileInfo {
            filename: "src",
            file_kind: FileKind::Directory,
            is_executable: false,
            content: None,
        };
        let tags = tags_from_info(&info);
        assert!(tags.contains("directory"));
        assert_eq!(tags.len(), 1);
    }

    #[test]
    fn test_tags_from_info_symlink() {
        let info = FileInfo {
            filename: "link",
            file_kind: FileKind::Symlink,
            is_executable: false,
            content: None,
        };
        let tags = tags_from_info(&info);
        assert!(tags.contains("symlink"));
        assert_eq!(tags.len(), 1);
    }

    #[test]
    fn test_tags_from_info_socket() {
        let info = FileInfo {
            filename: "sock",
            file_kind: FileKind::Socket,
            is_executable: false,
            content: None,
        };
        let tags = tags_from_info(&info);
        assert!(tags.contains("socket"));
        assert_eq!(tags.len(), 1);
    }

    #[test]
    fn test_tags_from_info_executable_with_shebang() {
        let info = FileInfo {
            filename: "my-script",
            file_kind: FileKind::Regular,
            is_executable: true,
            content: Some(b"#!/usr/bin/env python3\nprint('hello')"),
        };
        let tags = tags_from_info(&info);
        assert!(tags.contains("file"));
        assert!(tags.contains("executable"));
        assert!(tags.contains("python"));
        assert!(tags.contains("python3"));
        assert!(tags.contains("text"));
    }

    #[test]
    fn test_tags_from_info_binary_content() {
        let info = FileInfo {
            filename: "data.bin",
            file_kind: FileKind::Regular,
            is_executable: false,
            content: Some(&[0x7f, 0x45, 0x4c, 0x46, 0x00]),
        };
        let tags = tags_from_info(&info);
        assert!(tags.contains("file"));
        assert!(tags.contains("binary"));
    }

    #[test]
    fn test_tags_from_info_no_content() {
        let info = FileInfo {
            filename: "unknown",
            file_kind: FileKind::Regular,
            is_executable: false,
            content: None,
        };
        let tags = tags_from_info(&info);
        assert!(tags.contains("file"));
        assert!(tags.contains("non-executable"));
        // No encoding tag since no content was provided
        assert!(!tags.contains("text"));
        assert!(!tags.contains("binary"));
    }

    #[test]
    fn test_tags_from_info_extension_provides_encoding() {
        let info = FileInfo {
            filename: "app.js",
            file_kind: FileKind::Regular,
            is_executable: false,
            content: None,
        };
        let tags = tags_from_info(&info);
        assert!(tags.contains("javascript"));
        assert!(tags.contains("text"));
    }

    #[test]
    fn test_identify_from_with_custom_extensions() {
        let mut custom = std::collections::HashMap::new();
        custom.insert("myext".to_string(), HashSet::from(["text", "custom-lang"]));

        let identifier = FileIdentifier::new().with_custom_extensions(custom);
        let info = FileInfo {
            filename: "code.myext",
            file_kind: FileKind::Regular,
            is_executable: false,
            content: Some(b"some code"),
        };
        let tags = identifier.identify_from(&info);
        assert!(tags.contains("custom-lang"));
        assert!(tags.contains("text"));
    }

    #[test]
    fn test_identify_from_skip_content() {
        let identifier = FileIdentifier::new().skip_content_analysis();
        let info = FileInfo {
            filename: "unknown",
            file_kind: FileKind::Regular,
            is_executable: false,
            content: Some(b"hello world"),
        };
        let tags = identifier.identify_from(&info);
        assert!(!tags.contains("text"));
        assert!(!tags.contains("binary"));
    }

    #[test]
    fn test_identify_from_skip_shebang() {
        let identifier = FileIdentifier::new().skip_shebang_analysis();
        let info = FileInfo {
            filename: "my-script",
            file_kind: FileKind::Regular,
            is_executable: true,
            content: Some(b"#!/usr/bin/env python3\nprint('hello')"),
        };
        let tags = identifier.identify_from(&info);
        assert!(!tags.contains("python"));
        // Still detects text encoding
        assert!(tags.contains("text"));
    }
}
file_identify/lib.rs

file_identify/
lib.rs